--- /dev/null
- down_read(&mm->mmap_sem);
+ // SPDX-License-Identifier: GPL-2.0-only
+ /*
+ * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+ #define pr_fmt(fmt) "AMD-Vi: " fmt
+
+ #include <linux/mmu_notifier.h>
+ #include <linux/amd-iommu.h>
+ #include <linux/mm_types.h>
+ #include <linux/profile.h>
+ #include <linux/module.h>
+ #include <linux/sched.h>
+ #include <linux/sched/mm.h>
+ #include <linux/wait.h>
+ #include <linux/pci.h>
+ #include <linux/gfp.h>
+
+ #include "amd_iommu.h"
+
+ MODULE_LICENSE("GPL v2");
+ MODULE_AUTHOR("Joerg Roedel <jroedel@suse.de>");
+
+ #define MAX_DEVICES 0x10000
+ #define PRI_QUEUE_SIZE 512
+
+ struct pri_queue {
+ atomic_t inflight;
+ bool finish;
+ int status;
+ };
+
+ struct pasid_state {
+ struct list_head list; /* For global state-list */
+ atomic_t count; /* Reference count */
+ unsigned mmu_notifier_count; /* Counting nested mmu_notifier
+ calls */
+ struct mm_struct *mm; /* mm_struct for the faults */
+ struct mmu_notifier mn; /* mmu_notifier handle */
+ struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */
+ struct device_state *device_state; /* Link to our device_state */
+ int pasid; /* PASID index */
+ bool invalid; /* Used during setup and
+ teardown of the pasid */
+ spinlock_t lock; /* Protect pri_queues and
+ mmu_notifer_count */
+ wait_queue_head_t wq; /* To wait for count == 0 */
+ };
+
+ struct device_state {
+ struct list_head list;
+ u16 devid;
+ atomic_t count;
+ struct pci_dev *pdev;
+ struct pasid_state **states;
+ struct iommu_domain *domain;
+ int pasid_levels;
+ int max_pasids;
+ amd_iommu_invalid_ppr_cb inv_ppr_cb;
+ amd_iommu_invalidate_ctx inv_ctx_cb;
+ spinlock_t lock;
+ wait_queue_head_t wq;
+ };
+
+ struct fault {
+ struct work_struct work;
+ struct device_state *dev_state;
+ struct pasid_state *state;
+ struct mm_struct *mm;
+ u64 address;
+ u16 devid;
+ u16 pasid;
+ u16 tag;
+ u16 finish;
+ u16 flags;
+ };
+
+ static LIST_HEAD(state_list);
+ static spinlock_t state_lock;
+
+ static struct workqueue_struct *iommu_wq;
+
+ static void free_pasid_states(struct device_state *dev_state);
+
+ static u16 device_id(struct pci_dev *pdev)
+ {
+ u16 devid;
+
+ devid = pdev->bus->number;
+ devid = (devid << 8) | pdev->devfn;
+
+ return devid;
+ }
+
+ static struct device_state *__get_device_state(u16 devid)
+ {
+ struct device_state *dev_state;
+
+ list_for_each_entry(dev_state, &state_list, list) {
+ if (dev_state->devid == devid)
+ return dev_state;
+ }
+
+ return NULL;
+ }
+
+ static struct device_state *get_device_state(u16 devid)
+ {
+ struct device_state *dev_state;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ dev_state = __get_device_state(devid);
+ if (dev_state != NULL)
+ atomic_inc(&dev_state->count);
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return dev_state;
+ }
+
+ static void free_device_state(struct device_state *dev_state)
+ {
+ struct iommu_group *group;
+
+ /*
+ * First detach device from domain - No more PRI requests will arrive
+ * from that device after it is unbound from the IOMMUv2 domain.
+ */
+ group = iommu_group_get(&dev_state->pdev->dev);
+ if (WARN_ON(!group))
+ return;
+
+ iommu_detach_group(dev_state->domain, group);
+
+ iommu_group_put(group);
+
+ /* Everything is down now, free the IOMMUv2 domain */
+ iommu_domain_free(dev_state->domain);
+
+ /* Finally get rid of the device-state */
+ kfree(dev_state);
+ }
+
+ static void put_device_state(struct device_state *dev_state)
+ {
+ if (atomic_dec_and_test(&dev_state->count))
+ wake_up(&dev_state->wq);
+ }
+
+ /* Must be called under dev_state->lock */
+ static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state,
+ int pasid, bool alloc)
+ {
+ struct pasid_state **root, **ptr;
+ int level, index;
+
+ level = dev_state->pasid_levels;
+ root = dev_state->states;
+
+ while (true) {
+
+ index = (pasid >> (9 * level)) & 0x1ff;
+ ptr = &root[index];
+
+ if (level == 0)
+ break;
+
+ if (*ptr == NULL) {
+ if (!alloc)
+ return NULL;
+
+ *ptr = (void *)get_zeroed_page(GFP_ATOMIC);
+ if (*ptr == NULL)
+ return NULL;
+ }
+
+ root = (struct pasid_state **)*ptr;
+ level -= 1;
+ }
+
+ return ptr;
+ }
+
+ static int set_pasid_state(struct device_state *dev_state,
+ struct pasid_state *pasid_state,
+ int pasid)
+ {
+ struct pasid_state **ptr;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&dev_state->lock, flags);
+ ptr = __get_pasid_state_ptr(dev_state, pasid, true);
+
+ ret = -ENOMEM;
+ if (ptr == NULL)
+ goto out_unlock;
+
+ ret = -ENOMEM;
+ if (*ptr != NULL)
+ goto out_unlock;
+
+ *ptr = pasid_state;
+
+ ret = 0;
+
+ out_unlock:
+ spin_unlock_irqrestore(&dev_state->lock, flags);
+
+ return ret;
+ }
+
+ static void clear_pasid_state(struct device_state *dev_state, int pasid)
+ {
+ struct pasid_state **ptr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_state->lock, flags);
+ ptr = __get_pasid_state_ptr(dev_state, pasid, true);
+
+ if (ptr == NULL)
+ goto out_unlock;
+
+ *ptr = NULL;
+
+ out_unlock:
+ spin_unlock_irqrestore(&dev_state->lock, flags);
+ }
+
+ static struct pasid_state *get_pasid_state(struct device_state *dev_state,
+ int pasid)
+ {
+ struct pasid_state **ptr, *ret = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_state->lock, flags);
+ ptr = __get_pasid_state_ptr(dev_state, pasid, false);
+
+ if (ptr == NULL)
+ goto out_unlock;
+
+ ret = *ptr;
+ if (ret)
+ atomic_inc(&ret->count);
+
+ out_unlock:
+ spin_unlock_irqrestore(&dev_state->lock, flags);
+
+ return ret;
+ }
+
+ static void free_pasid_state(struct pasid_state *pasid_state)
+ {
+ kfree(pasid_state);
+ }
+
+ static void put_pasid_state(struct pasid_state *pasid_state)
+ {
+ if (atomic_dec_and_test(&pasid_state->count))
+ wake_up(&pasid_state->wq);
+ }
+
+ static void put_pasid_state_wait(struct pasid_state *pasid_state)
+ {
+ atomic_dec(&pasid_state->count);
+ wait_event(pasid_state->wq, !atomic_read(&pasid_state->count));
+ free_pasid_state(pasid_state);
+ }
+
+ static void unbind_pasid(struct pasid_state *pasid_state)
+ {
+ struct iommu_domain *domain;
+
+ domain = pasid_state->device_state->domain;
+
+ /*
+ * Mark pasid_state as invalid, no more faults will we added to the
+ * work queue after this is visible everywhere.
+ */
+ pasid_state->invalid = true;
+
+ /* Make sure this is visible */
+ smp_wmb();
+
+ /* After this the device/pasid can't access the mm anymore */
+ amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid);
+
+ /* Make sure no more pending faults are in the queue */
+ flush_workqueue(iommu_wq);
+ }
+
+ static void free_pasid_states_level1(struct pasid_state **tbl)
+ {
+ int i;
+
+ for (i = 0; i < 512; ++i) {
+ if (tbl[i] == NULL)
+ continue;
+
+ free_page((unsigned long)tbl[i]);
+ }
+ }
+
+ static void free_pasid_states_level2(struct pasid_state **tbl)
+ {
+ struct pasid_state **ptr;
+ int i;
+
+ for (i = 0; i < 512; ++i) {
+ if (tbl[i] == NULL)
+ continue;
+
+ ptr = (struct pasid_state **)tbl[i];
+ free_pasid_states_level1(ptr);
+ }
+ }
+
+ static void free_pasid_states(struct device_state *dev_state)
+ {
+ struct pasid_state *pasid_state;
+ int i;
+
+ for (i = 0; i < dev_state->max_pasids; ++i) {
+ pasid_state = get_pasid_state(dev_state, i);
+ if (pasid_state == NULL)
+ continue;
+
+ put_pasid_state(pasid_state);
+
+ /*
+ * This will call the mn_release function and
+ * unbind the PASID
+ */
+ mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+
+ put_pasid_state_wait(pasid_state); /* Reference taken in
+ amd_iommu_bind_pasid */
+
+ /* Drop reference taken in amd_iommu_bind_pasid */
+ put_device_state(dev_state);
+ }
+
+ if (dev_state->pasid_levels == 2)
+ free_pasid_states_level2(dev_state->states);
+ else if (dev_state->pasid_levels == 1)
+ free_pasid_states_level1(dev_state->states);
+ else
+ BUG_ON(dev_state->pasid_levels != 0);
+
+ free_page((unsigned long)dev_state->states);
+ }
+
+ static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
+ {
+ return container_of(mn, struct pasid_state, mn);
+ }
+
+ static void mn_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+ {
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+
+ pasid_state = mn_to_state(mn);
+ dev_state = pasid_state->device_state;
+
+ if ((start ^ (end - 1)) < PAGE_SIZE)
+ amd_iommu_flush_page(dev_state->domain, pasid_state->pasid,
+ start);
+ else
+ amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid);
+ }
+
+ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
+ {
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+ bool run_inv_ctx_cb;
+
+ might_sleep();
+
+ pasid_state = mn_to_state(mn);
+ dev_state = pasid_state->device_state;
+ run_inv_ctx_cb = !pasid_state->invalid;
+
+ if (run_inv_ctx_cb && dev_state->inv_ctx_cb)
+ dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid);
+
+ unbind_pasid(pasid_state);
+ }
+
+ static const struct mmu_notifier_ops iommu_mn = {
+ .release = mn_release,
+ .invalidate_range = mn_invalidate_range,
+ };
+
+ static void set_pri_tag_status(struct pasid_state *pasid_state,
+ u16 tag, int status)
+ {
+ unsigned long flags;
+
+ spin_lock_irqsave(&pasid_state->lock, flags);
+ pasid_state->pri[tag].status = status;
+ spin_unlock_irqrestore(&pasid_state->lock, flags);
+ }
+
+ static void finish_pri_tag(struct device_state *dev_state,
+ struct pasid_state *pasid_state,
+ u16 tag)
+ {
+ unsigned long flags;
+
+ spin_lock_irqsave(&pasid_state->lock, flags);
+ if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) &&
+ pasid_state->pri[tag].finish) {
+ amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid,
+ pasid_state->pri[tag].status, tag);
+ pasid_state->pri[tag].finish = false;
+ pasid_state->pri[tag].status = PPR_SUCCESS;
+ }
+ spin_unlock_irqrestore(&pasid_state->lock, flags);
+ }
+
+ static void handle_fault_error(struct fault *fault)
+ {
+ int status;
+
+ if (!fault->dev_state->inv_ppr_cb) {
+ set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
+ return;
+ }
+
+ status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev,
+ fault->pasid,
+ fault->address,
+ fault->flags);
+ switch (status) {
+ case AMD_IOMMU_INV_PRI_RSP_SUCCESS:
+ set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS);
+ break;
+ case AMD_IOMMU_INV_PRI_RSP_INVALID:
+ set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
+ break;
+ case AMD_IOMMU_INV_PRI_RSP_FAIL:
+ set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE);
+ break;
+ default:
+ BUG();
+ }
+ }
+
+ static bool access_error(struct vm_area_struct *vma, struct fault *fault)
+ {
+ unsigned long requested = 0;
+
+ if (fault->flags & PPR_FAULT_EXEC)
+ requested |= VM_EXEC;
+
+ if (fault->flags & PPR_FAULT_READ)
+ requested |= VM_READ;
+
+ if (fault->flags & PPR_FAULT_WRITE)
+ requested |= VM_WRITE;
+
+ return (requested & ~vma->vm_flags) != 0;
+ }
+
+ static void do_fault(struct work_struct *work)
+ {
+ struct fault *fault = container_of(work, struct fault, work);
+ struct vm_area_struct *vma;
+ vm_fault_t ret = VM_FAULT_ERROR;
+ unsigned int flags = 0;
+ struct mm_struct *mm;
+ u64 address;
+
+ mm = fault->state->mm;
+ address = fault->address;
+
+ if (fault->flags & PPR_FAULT_USER)
+ flags |= FAULT_FLAG_USER;
+ if (fault->flags & PPR_FAULT_WRITE)
+ flags |= FAULT_FLAG_WRITE;
+ flags |= FAULT_FLAG_REMOTE;
+
- up_read(&mm->mmap_sem);
++ mmap_read_lock(mm);
+ vma = find_extend_vma(mm, address);
+ if (!vma || address < vma->vm_start)
+ /* failed to get a vma in the right range */
+ goto out;
+
+ /* Check if we have the right permissions on the vma */
+ if (access_error(vma, fault))
+ goto out;
+
+ ret = handle_mm_fault(vma, address, flags);
+ out:
++ mmap_read_unlock(mm);
+
+ if (ret & VM_FAULT_ERROR)
+ /* failed to service fault */
+ handle_fault_error(fault);
+
+ finish_pri_tag(fault->dev_state, fault->state, fault->tag);
+
+ put_pasid_state(fault->state);
+
+ kfree(fault);
+ }
+
+ static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
+ {
+ struct amd_iommu_fault *iommu_fault;
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+ struct pci_dev *pdev = NULL;
+ unsigned long flags;
+ struct fault *fault;
+ bool finish;
+ u16 tag, devid;
+ int ret;
+
+ iommu_fault = data;
+ tag = iommu_fault->tag & 0x1ff;
+ finish = (iommu_fault->tag >> 9) & 1;
+
+ devid = iommu_fault->device_id;
+ pdev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(devid),
+ devid & 0xff);
+ if (!pdev)
+ return -ENODEV;
+
+ ret = NOTIFY_DONE;
+
+ /* In kdump kernel pci dev is not initialized yet -> send INVALID */
+ if (amd_iommu_is_attach_deferred(NULL, &pdev->dev)) {
+ amd_iommu_complete_ppr(pdev, iommu_fault->pasid,
+ PPR_INVALID, tag);
+ goto out;
+ }
+
+ dev_state = get_device_state(iommu_fault->device_id);
+ if (dev_state == NULL)
+ goto out;
+
+ pasid_state = get_pasid_state(dev_state, iommu_fault->pasid);
+ if (pasid_state == NULL || pasid_state->invalid) {
+ /* We know the device but not the PASID -> send INVALID */
+ amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid,
+ PPR_INVALID, tag);
+ goto out_drop_state;
+ }
+
+ spin_lock_irqsave(&pasid_state->lock, flags);
+ atomic_inc(&pasid_state->pri[tag].inflight);
+ if (finish)
+ pasid_state->pri[tag].finish = true;
+ spin_unlock_irqrestore(&pasid_state->lock, flags);
+
+ fault = kzalloc(sizeof(*fault), GFP_ATOMIC);
+ if (fault == NULL) {
+ /* We are OOM - send success and let the device re-fault */
+ finish_pri_tag(dev_state, pasid_state, tag);
+ goto out_drop_state;
+ }
+
+ fault->dev_state = dev_state;
+ fault->address = iommu_fault->address;
+ fault->state = pasid_state;
+ fault->tag = tag;
+ fault->finish = finish;
+ fault->pasid = iommu_fault->pasid;
+ fault->flags = iommu_fault->flags;
+ INIT_WORK(&fault->work, do_fault);
+
+ queue_work(iommu_wq, &fault->work);
+
+ ret = NOTIFY_OK;
+
+ out_drop_state:
+
+ if (ret != NOTIFY_OK && pasid_state)
+ put_pasid_state(pasid_state);
+
+ put_device_state(dev_state);
+
+ out:
+ return ret;
+ }
+
+ static struct notifier_block ppr_nb = {
+ .notifier_call = ppr_notifier,
+ };
+
+ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
+ struct task_struct *task)
+ {
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+ struct mm_struct *mm;
+ u16 devid;
+ int ret;
+
+ might_sleep();
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ devid = device_id(pdev);
+ dev_state = get_device_state(devid);
+
+ if (dev_state == NULL)
+ return -EINVAL;
+
+ ret = -EINVAL;
+ if (pasid < 0 || pasid >= dev_state->max_pasids)
+ goto out;
+
+ ret = -ENOMEM;
+ pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL);
+ if (pasid_state == NULL)
+ goto out;
+
+
+ atomic_set(&pasid_state->count, 1);
+ init_waitqueue_head(&pasid_state->wq);
+ spin_lock_init(&pasid_state->lock);
+
+ mm = get_task_mm(task);
+ pasid_state->mm = mm;
+ pasid_state->device_state = dev_state;
+ pasid_state->pasid = pasid;
+ pasid_state->invalid = true; /* Mark as valid only if we are
+ done with setting up the pasid */
+ pasid_state->mn.ops = &iommu_mn;
+
+ if (pasid_state->mm == NULL)
+ goto out_free;
+
+ mmu_notifier_register(&pasid_state->mn, mm);
+
+ ret = set_pasid_state(dev_state, pasid_state, pasid);
+ if (ret)
+ goto out_unregister;
+
+ ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid,
+ __pa(pasid_state->mm->pgd));
+ if (ret)
+ goto out_clear_state;
+
+ /* Now we are ready to handle faults */
+ pasid_state->invalid = false;
+
+ /*
+ * Drop the reference to the mm_struct here. We rely on the
+ * mmu_notifier release call-back to inform us when the mm
+ * is going away.
+ */
+ mmput(mm);
+
+ return 0;
+
+ out_clear_state:
+ clear_pasid_state(dev_state, pasid);
+
+ out_unregister:
+ mmu_notifier_unregister(&pasid_state->mn, mm);
+ mmput(mm);
+
+ out_free:
+ free_pasid_state(pasid_state);
+
+ out:
+ put_device_state(dev_state);
+
+ return ret;
+ }
+ EXPORT_SYMBOL(amd_iommu_bind_pasid);
+
+ void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid)
+ {
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+ u16 devid;
+
+ might_sleep();
+
+ if (!amd_iommu_v2_supported())
+ return;
+
+ devid = device_id(pdev);
+ dev_state = get_device_state(devid);
+ if (dev_state == NULL)
+ return;
+
+ if (pasid < 0 || pasid >= dev_state->max_pasids)
+ goto out;
+
+ pasid_state = get_pasid_state(dev_state, pasid);
+ if (pasid_state == NULL)
+ goto out;
+ /*
+ * Drop reference taken here. We are safe because we still hold
+ * the reference taken in the amd_iommu_bind_pasid function.
+ */
+ put_pasid_state(pasid_state);
+
+ /* Clear the pasid state so that the pasid can be re-used */
+ clear_pasid_state(dev_state, pasid_state->pasid);
+
+ /*
+ * Call mmu_notifier_unregister to drop our reference
+ * to pasid_state->mm
+ */
+ mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+
+ put_pasid_state_wait(pasid_state); /* Reference taken in
+ amd_iommu_bind_pasid */
+ out:
+ /* Drop reference taken in this function */
+ put_device_state(dev_state);
+
+ /* Drop reference taken in amd_iommu_bind_pasid */
+ put_device_state(dev_state);
+ }
+ EXPORT_SYMBOL(amd_iommu_unbind_pasid);
+
+ int amd_iommu_init_device(struct pci_dev *pdev, int pasids)
+ {
+ struct device_state *dev_state;
+ struct iommu_group *group;
+ unsigned long flags;
+ int ret, tmp;
+ u16 devid;
+
+ might_sleep();
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ if (pasids <= 0 || pasids > (PASID_MASK + 1))
+ return -EINVAL;
+
+ devid = device_id(pdev);
+
+ dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL);
+ if (dev_state == NULL)
+ return -ENOMEM;
+
+ spin_lock_init(&dev_state->lock);
+ init_waitqueue_head(&dev_state->wq);
+ dev_state->pdev = pdev;
+ dev_state->devid = devid;
+
+ tmp = pasids;
+ for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9)
+ dev_state->pasid_levels += 1;
+
+ atomic_set(&dev_state->count, 1);
+ dev_state->max_pasids = pasids;
+
+ ret = -ENOMEM;
+ dev_state->states = (void *)get_zeroed_page(GFP_KERNEL);
+ if (dev_state->states == NULL)
+ goto out_free_dev_state;
+
+ dev_state->domain = iommu_domain_alloc(&pci_bus_type);
+ if (dev_state->domain == NULL)
+ goto out_free_states;
+
+ amd_iommu_domain_direct_map(dev_state->domain);
+
+ ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids);
+ if (ret)
+ goto out_free_domain;
+
+ group = iommu_group_get(&pdev->dev);
+ if (!group) {
+ ret = -EINVAL;
+ goto out_free_domain;
+ }
+
+ ret = iommu_attach_group(dev_state->domain, group);
+ if (ret != 0)
+ goto out_drop_group;
+
+ iommu_group_put(group);
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ if (__get_device_state(devid) != NULL) {
+ spin_unlock_irqrestore(&state_lock, flags);
+ ret = -EBUSY;
+ goto out_free_domain;
+ }
+
+ list_add_tail(&dev_state->list, &state_list);
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return 0;
+
+ out_drop_group:
+ iommu_group_put(group);
+
+ out_free_domain:
+ iommu_domain_free(dev_state->domain);
+
+ out_free_states:
+ free_page((unsigned long)dev_state->states);
+
+ out_free_dev_state:
+ kfree(dev_state);
+
+ return ret;
+ }
+ EXPORT_SYMBOL(amd_iommu_init_device);
+
+ void amd_iommu_free_device(struct pci_dev *pdev)
+ {
+ struct device_state *dev_state;
+ unsigned long flags;
+ u16 devid;
+
+ if (!amd_iommu_v2_supported())
+ return;
+
+ devid = device_id(pdev);
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ dev_state = __get_device_state(devid);
+ if (dev_state == NULL) {
+ spin_unlock_irqrestore(&state_lock, flags);
+ return;
+ }
+
+ list_del(&dev_state->list);
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ /* Get rid of any remaining pasid states */
+ free_pasid_states(dev_state);
+
+ put_device_state(dev_state);
+ /*
+ * Wait until the last reference is dropped before freeing
+ * the device state.
+ */
+ wait_event(dev_state->wq, !atomic_read(&dev_state->count));
+ free_device_state(dev_state);
+ }
+ EXPORT_SYMBOL(amd_iommu_free_device);
+
+ int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
+ amd_iommu_invalid_ppr_cb cb)
+ {
+ struct device_state *dev_state;
+ unsigned long flags;
+ u16 devid;
+ int ret;
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ devid = device_id(pdev);
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ ret = -EINVAL;
+ dev_state = __get_device_state(devid);
+ if (dev_state == NULL)
+ goto out_unlock;
+
+ dev_state->inv_ppr_cb = cb;
+
+ ret = 0;
+
+ out_unlock:
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return ret;
+ }
+ EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb);
+
+ int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
+ amd_iommu_invalidate_ctx cb)
+ {
+ struct device_state *dev_state;
+ unsigned long flags;
+ u16 devid;
+ int ret;
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ devid = device_id(pdev);
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ ret = -EINVAL;
+ dev_state = __get_device_state(devid);
+ if (dev_state == NULL)
+ goto out_unlock;
+
+ dev_state->inv_ctx_cb = cb;
+
+ ret = 0;
+
+ out_unlock:
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return ret;
+ }
+ EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb);
+
+ static int __init amd_iommu_v2_init(void)
+ {
+ int ret;
+
+ pr_info("AMD IOMMUv2 driver by Joerg Roedel <jroedel@suse.de>\n");
+
+ if (!amd_iommu_v2_supported()) {
+ pr_info("AMD IOMMUv2 functionality not available on this system\n");
+ /*
+ * Load anyway to provide the symbols to other modules
+ * which may use AMD IOMMUv2 optionally.
+ */
+ return 0;
+ }
+
+ spin_lock_init(&state_lock);
+
+ ret = -ENOMEM;
+ iommu_wq = alloc_workqueue("amd_iommu_v2", WQ_MEM_RECLAIM, 0);
+ if (iommu_wq == NULL)
+ goto out;
+
+ amd_iommu_register_ppr_notifier(&ppr_nb);
+
+ return 0;
+
+ out:
+ return ret;
+ }
+
+ static void __exit amd_iommu_v2_exit(void)
+ {
+ struct device_state *dev_state;
+ int i;
+
+ if (!amd_iommu_v2_supported())
+ return;
+
+ amd_iommu_unregister_ppr_notifier(&ppr_nb);
+
+ flush_workqueue(iommu_wq);
+
+ /*
+ * The loop below might call flush_workqueue(), so call
+ * destroy_workqueue() after it
+ */
+ for (i = 0; i < MAX_DEVICES; ++i) {
+ dev_state = get_device_state(i);
+
+ if (dev_state == NULL)
+ continue;
+
+ WARN_ON_ONCE(1);
+
+ put_device_state(dev_state);
+ amd_iommu_free_device(dev_state->pdev);
+ }
+
+ destroy_workqueue(iommu_wq);
+ }
+
+ module_init(amd_iommu_v2_init);
+ module_exit(amd_iommu_v2_exit);
--- /dev/null
- down_read(&svm->mm->mmap_sem);
+ // SPDX-License-Identifier: GPL-2.0-only
+ /*
+ * Copyright © 2015 Intel Corporation.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ */
+
+ #include <linux/intel-iommu.h>
+ #include <linux/mmu_notifier.h>
+ #include <linux/sched.h>
+ #include <linux/sched/mm.h>
+ #include <linux/slab.h>
+ #include <linux/intel-svm.h>
+ #include <linux/rculist.h>
+ #include <linux/pci.h>
+ #include <linux/pci-ats.h>
+ #include <linux/dmar.h>
+ #include <linux/interrupt.h>
+ #include <linux/mm_types.h>
+ #include <linux/ioasid.h>
+ #include <asm/page.h>
+
+ #include "intel-pasid.h"
+
+ static irqreturn_t prq_event_thread(int irq, void *d);
+ static void intel_svm_drain_prq(struct device *dev, int pasid);
+
+ #define PRQ_ORDER 0
+
+ int intel_svm_enable_prq(struct intel_iommu *iommu)
+ {
+ struct page *pages;
+ int irq, ret;
+
+ pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
+ if (!pages) {
+ pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
+ iommu->name);
+ return -ENOMEM;
+ }
+ iommu->prq = page_address(pages);
+
+ irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
+ if (irq <= 0) {
+ pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
+ iommu->name);
+ ret = -EINVAL;
+ err:
+ free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+ iommu->prq = NULL;
+ return ret;
+ }
+ iommu->pr_irq = irq;
+
+ snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
+
+ ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
+ iommu->prq_name, iommu);
+ if (ret) {
+ pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
+ iommu->name);
+ dmar_free_hwirq(irq);
+ iommu->pr_irq = 0;
+ goto err;
+ }
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
+
+ init_completion(&iommu->prq_complete);
+
+ return 0;
+ }
+
+ int intel_svm_finish_prq(struct intel_iommu *iommu)
+ {
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
+
+ if (iommu->pr_irq) {
+ free_irq(iommu->pr_irq, iommu);
+ dmar_free_hwirq(iommu->pr_irq);
+ iommu->pr_irq = 0;
+ }
+
+ free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+ iommu->prq = NULL;
+
+ return 0;
+ }
+
+ static inline bool intel_svm_capable(struct intel_iommu *iommu)
+ {
+ return iommu->flags & VTD_FLAG_SVM_CAPABLE;
+ }
+
+ void intel_svm_check(struct intel_iommu *iommu)
+ {
+ if (!pasid_supported(iommu))
+ return;
+
+ if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
+ !cap_fl1gp_support(iommu->cap)) {
+ pr_err("%s SVM disabled, incompatible 1GB page capability\n",
+ iommu->name);
+ return;
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+ !cap_5lp_support(iommu->cap)) {
+ pr_err("%s SVM disabled, incompatible paging mode\n",
+ iommu->name);
+ return;
+ }
+
+ iommu->flags |= VTD_FLAG_SVM_CAPABLE;
+ }
+
+ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev,
+ unsigned long address, unsigned long pages, int ih)
+ {
+ struct qi_desc desc;
+
+ if (pages == -1) {
+ desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
+ QI_EIOTLB_DID(sdev->did) |
+ QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
+ QI_EIOTLB_TYPE;
+ desc.qw1 = 0;
+ } else {
+ int mask = ilog2(__roundup_pow_of_two(pages));
+
+ desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
+ QI_EIOTLB_DID(sdev->did) |
+ QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
+ QI_EIOTLB_TYPE;
+ desc.qw1 = QI_EIOTLB_ADDR(address) |
+ QI_EIOTLB_IH(ih) |
+ QI_EIOTLB_AM(mask);
+ }
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+ qi_submit_sync(svm->iommu, &desc, 1, 0);
+
+ if (sdev->dev_iotlb) {
+ desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
+ QI_DEV_EIOTLB_SID(sdev->sid) |
+ QI_DEV_EIOTLB_QDEP(sdev->qdep) |
+ QI_DEIOTLB_TYPE;
+ if (pages == -1) {
+ desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
+ QI_DEV_EIOTLB_SIZE;
+ } else if (pages > 1) {
+ /* The least significant zero bit indicates the size. So,
+ * for example, an "address" value of 0x12345f000 will
+ * flush from 0x123440000 to 0x12347ffff (256KiB). */
+ unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
+ unsigned long mask = __rounddown_pow_of_two(address ^ last);
+
+ desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
+ (mask - 1)) | QI_DEV_EIOTLB_SIZE;
+ } else {
+ desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
+ }
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+ qi_submit_sync(svm->iommu, &desc, 1, 0);
+ }
+ }
+
+ static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
+ unsigned long pages, int ih)
+ {
+ struct intel_svm_dev *sdev;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdev, &svm->devs, list)
+ intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
+ rcu_read_unlock();
+ }
+
+ /* Pages have been freed at this point */
+ static void intel_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+ {
+ struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+
+ intel_flush_svm_range(svm, start,
+ (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
+ }
+
+ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
+ {
+ struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+ struct intel_svm_dev *sdev;
+
+ /* This might end up being called from exit_mmap(), *before* the page
+ * tables are cleared. And __mmu_notifier_release() will delete us from
+ * the list of notifiers so that our invalidate_range() callback doesn't
+ * get called when the page tables are cleared. So we need to protect
+ * against hardware accessing those page tables.
+ *
+ * We do it by clearing the entry in the PASID table and then flushing
+ * the IOTLB and the PASID table caches. This might upset hardware;
+ * perhaps we'll want to point the PASID to a dummy PGD (like the zero
+ * page) so that we end up taking a fault that the hardware really
+ * *has* to handle gracefully without affecting other processes.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdev, &svm->devs, list)
+ intel_pasid_tear_down_entry(svm->iommu, sdev->dev,
+ svm->pasid, true);
+ rcu_read_unlock();
+
+ }
+
+ static const struct mmu_notifier_ops intel_mmuops = {
+ .release = intel_mm_release,
+ .invalidate_range = intel_invalidate_range,
+ };
+
+ static DEFINE_MUTEX(pasid_mutex);
+ static LIST_HEAD(global_svm_list);
+
+ #define for_each_svm_dev(sdev, svm, d) \
+ list_for_each_entry((sdev), &(svm)->devs, list) \
+ if ((d) != (sdev)->dev) {} else
+
+ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
+ struct iommu_gpasid_bind_data *data)
+ {
+ struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+ struct dmar_domain *dmar_domain;
+ struct intel_svm_dev *sdev;
+ struct intel_svm *svm;
+ int ret = 0;
+
+ if (WARN_ON(!iommu) || !data)
+ return -EINVAL;
+
+ if (data->version != IOMMU_GPASID_BIND_VERSION_1 ||
+ data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
+ return -EINVAL;
+
+ if (!dev_is_pci(dev))
+ return -ENOTSUPP;
+
+ /* VT-d supports devices with full 20 bit PASIDs only */
+ if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
+ return -EINVAL;
+
+ /*
+ * We only check host PASID range, we have no knowledge to check
+ * guest PASID range.
+ */
+ if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
+ return -EINVAL;
+
+ dmar_domain = to_dmar_domain(domain);
+
+ mutex_lock(&pasid_mutex);
+ svm = ioasid_find(NULL, data->hpasid, NULL);
+ if (IS_ERR(svm)) {
+ ret = PTR_ERR(svm);
+ goto out;
+ }
+
+ if (svm) {
+ /*
+ * If we found svm for the PASID, there must be at
+ * least one device bond, otherwise svm should be freed.
+ */
+ if (WARN_ON(list_empty(&svm->devs))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for_each_svm_dev(sdev, svm, dev) {
+ /*
+ * For devices with aux domains, we should allow
+ * multiple bind calls with the same PASID and pdev.
+ */
+ if (iommu_dev_feature_enabled(dev,
+ IOMMU_DEV_FEAT_AUX)) {
+ sdev->users++;
+ } else {
+ dev_warn_ratelimited(dev,
+ "Already bound with PASID %u\n",
+ svm->pasid);
+ ret = -EBUSY;
+ }
+ goto out;
+ }
+ } else {
+ /* We come here when PASID has never been bond to a device. */
+ svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+ if (!svm) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* REVISIT: upper layer/VFIO can track host process that bind
+ * the PASID. ioasid_set = mm might be sufficient for vfio to
+ * check pasid VMM ownership. We can drop the following line
+ * once VFIO and IOASID set check is in place.
+ */
+ svm->mm = get_task_mm(current);
+ svm->pasid = data->hpasid;
+ if (data->flags & IOMMU_SVA_GPASID_VAL) {
+ svm->gpasid = data->gpasid;
+ svm->flags |= SVM_FLAG_GUEST_PASID;
+ }
+ ioasid_set_data(data->hpasid, svm);
+ INIT_LIST_HEAD_RCU(&svm->devs);
+ mmput(svm->mm);
+ }
+ sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+ if (!sdev) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sdev->dev = dev;
+
+ /* Only count users if device has aux domains */
+ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
+ sdev->users = 1;
+
+ /* Set up device context entry for PASID if not enabled already */
+ ret = intel_iommu_enable_pasid(iommu, sdev->dev);
+ if (ret) {
+ dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
+ kfree(sdev);
+ goto out;
+ }
+
+ /*
+ * PASID table is per device for better security. Therefore, for
+ * each bind of a new device even with an existing PASID, we need to
+ * call the nested mode setup function here.
+ */
+ spin_lock(&iommu->lock);
+ ret = intel_pasid_setup_nested(iommu, dev,
+ (pgd_t *)(uintptr_t)data->gpgd,
+ data->hpasid, &data->vtd, dmar_domain,
+ data->addr_width);
+ spin_unlock(&iommu->lock);
+ if (ret) {
+ dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
+ data->hpasid, ret);
+ /*
+ * PASID entry should be in cleared state if nested mode
+ * set up failed. So we only need to clear IOASID tracking
+ * data such that free call will succeed.
+ */
+ kfree(sdev);
+ goto out;
+ }
+
+ svm->flags |= SVM_FLAG_GUEST_MODE;
+
+ init_rcu_head(&sdev->rcu);
+ list_add_rcu(&sdev->list, &svm->devs);
+ out:
+ if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
+ ioasid_set_data(data->hpasid, NULL);
+ kfree(svm);
+ }
+
+ mutex_unlock(&pasid_mutex);
+ return ret;
+ }
+
+ int intel_svm_unbind_gpasid(struct device *dev, int pasid)
+ {
+ struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+ struct intel_svm_dev *sdev;
+ struct intel_svm *svm;
+ int ret = -EINVAL;
+
+ if (WARN_ON(!iommu))
+ return -EINVAL;
+
+ mutex_lock(&pasid_mutex);
+ svm = ioasid_find(NULL, pasid, NULL);
+ if (!svm) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (IS_ERR(svm)) {
+ ret = PTR_ERR(svm);
+ goto out;
+ }
+
+ for_each_svm_dev(sdev, svm, dev) {
+ ret = 0;
+ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
+ sdev->users--;
+ if (!sdev->users) {
+ list_del_rcu(&sdev->list);
+ intel_pasid_tear_down_entry(iommu, dev,
+ svm->pasid, false);
+ intel_svm_drain_prq(dev, svm->pasid);
+ kfree_rcu(sdev, rcu);
+
+ if (list_empty(&svm->devs)) {
+ /*
+ * We do not free the IOASID here in that
+ * IOMMU driver did not allocate it.
+ * Unlike native SVM, IOASID for guest use was
+ * allocated prior to the bind call.
+ * In any case, if the free call comes before
+ * the unbind, IOMMU driver will get notified
+ * and perform cleanup.
+ */
+ ioasid_set_data(pasid, NULL);
+ kfree(svm);
+ }
+ }
+ break;
+ }
+ out:
+ mutex_unlock(&pasid_mutex);
+ return ret;
+ }
+
+ /* Caller must hold pasid_mutex, mm reference */
+ static int
+ intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops,
+ struct mm_struct *mm, struct intel_svm_dev **sd)
+ {
+ struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+ struct device_domain_info *info;
+ struct intel_svm_dev *sdev;
+ struct intel_svm *svm = NULL;
+ int pasid_max;
+ int ret;
+
+ if (!iommu || dmar_disabled)
+ return -EINVAL;
+
+ if (!intel_svm_capable(iommu))
+ return -ENOTSUPP;
+
+ if (dev_is_pci(dev)) {
+ pasid_max = pci_max_pasids(to_pci_dev(dev));
+ if (pasid_max < 0)
+ return -EINVAL;
+ } else
+ pasid_max = 1 << 20;
+
+ /* Bind supervisor PASID shuld have mm = NULL */
+ if (flags & SVM_FLAG_SUPERVISOR_MODE) {
+ if (!ecap_srs(iommu->ecap) || mm) {
+ pr_err("Supervisor PASID with user provided mm.\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
+ struct intel_svm *t;
+
+ list_for_each_entry(t, &global_svm_list, list) {
+ if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
+ continue;
+
+ svm = t;
+ if (svm->pasid >= pasid_max) {
+ dev_warn(dev,
+ "Limited PASID width. Cannot use existing PASID %d\n",
+ svm->pasid);
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ /* Find the matching device in svm list */
+ for_each_svm_dev(sdev, svm, dev) {
+ if (sdev->ops != ops) {
+ ret = -EBUSY;
+ goto out;
+ }
+ sdev->users++;
+ goto success;
+ }
+
+ break;
+ }
+ }
+
+ sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+ if (!sdev) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sdev->dev = dev;
+
+ ret = intel_iommu_enable_pasid(iommu, dev);
+ if (ret) {
+ kfree(sdev);
+ goto out;
+ }
+
+ info = get_domain_info(dev);
+ sdev->did = FLPT_DEFAULT_DID;
+ sdev->sid = PCI_DEVID(info->bus, info->devfn);
+ if (info->ats_enabled) {
+ sdev->dev_iotlb = 1;
+ sdev->qdep = info->ats_qdep;
+ if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
+ sdev->qdep = 0;
+ }
+
+ /* Finish the setup now we know we're keeping it */
+ sdev->users = 1;
+ sdev->ops = ops;
+ init_rcu_head(&sdev->rcu);
+
+ if (!svm) {
+ svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+ if (!svm) {
+ ret = -ENOMEM;
+ kfree(sdev);
+ goto out;
+ }
+ svm->iommu = iommu;
+
+ if (pasid_max > intel_pasid_max_id)
+ pasid_max = intel_pasid_max_id;
+
+ /* Do not use PASID 0, reserved for RID to PASID */
+ svm->pasid = ioasid_alloc(NULL, PASID_MIN,
+ pasid_max - 1, svm);
+ if (svm->pasid == INVALID_IOASID) {
+ kfree(svm);
+ kfree(sdev);
+ ret = -ENOSPC;
+ goto out;
+ }
+ svm->notifier.ops = &intel_mmuops;
+ svm->mm = mm;
+ svm->flags = flags;
+ INIT_LIST_HEAD_RCU(&svm->devs);
+ INIT_LIST_HEAD(&svm->list);
+ ret = -ENOMEM;
+ if (mm) {
+ ret = mmu_notifier_register(&svm->notifier, mm);
+ if (ret) {
+ ioasid_free(svm->pasid);
+ kfree(svm);
+ kfree(sdev);
+ goto out;
+ }
+ }
+
+ spin_lock(&iommu->lock);
+ ret = intel_pasid_setup_first_level(iommu, dev,
+ mm ? mm->pgd : init_mm.pgd,
+ svm->pasid, FLPT_DEFAULT_DID,
+ (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
+ (cpu_feature_enabled(X86_FEATURE_LA57) ?
+ PASID_FLAG_FL5LP : 0));
+ spin_unlock(&iommu->lock);
+ if (ret) {
+ if (mm)
+ mmu_notifier_unregister(&svm->notifier, mm);
+ ioasid_free(svm->pasid);
+ kfree(svm);
+ kfree(sdev);
+ goto out;
+ }
+
+ list_add_tail(&svm->list, &global_svm_list);
+ } else {
+ /*
+ * Binding a new device with existing PASID, need to setup
+ * the PASID entry.
+ */
+ spin_lock(&iommu->lock);
+ ret = intel_pasid_setup_first_level(iommu, dev,
+ mm ? mm->pgd : init_mm.pgd,
+ svm->pasid, FLPT_DEFAULT_DID,
+ (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
+ (cpu_feature_enabled(X86_FEATURE_LA57) ?
+ PASID_FLAG_FL5LP : 0));
+ spin_unlock(&iommu->lock);
+ if (ret) {
+ kfree(sdev);
+ goto out;
+ }
+ }
+ list_add_rcu(&sdev->list, &svm->devs);
+ success:
+ sdev->pasid = svm->pasid;
+ sdev->sva.dev = dev;
+ if (sd)
+ *sd = sdev;
+ ret = 0;
+ out:
+ return ret;
+ }
+
+ /* Caller must hold pasid_mutex */
+ static int intel_svm_unbind_mm(struct device *dev, int pasid)
+ {
+ struct intel_svm_dev *sdev;
+ struct intel_iommu *iommu;
+ struct intel_svm *svm;
+ int ret = -EINVAL;
+
+ iommu = intel_svm_device_to_iommu(dev);
+ if (!iommu)
+ goto out;
+
+ svm = ioasid_find(NULL, pasid, NULL);
+ if (!svm)
+ goto out;
+
+ if (IS_ERR(svm)) {
+ ret = PTR_ERR(svm);
+ goto out;
+ }
+
+ for_each_svm_dev(sdev, svm, dev) {
+ ret = 0;
+ sdev->users--;
+ if (!sdev->users) {
+ list_del_rcu(&sdev->list);
+ /* Flush the PASID cache and IOTLB for this device.
+ * Note that we do depend on the hardware *not* using
+ * the PASID any more. Just as we depend on other
+ * devices never using PASIDs that they have no right
+ * to use. We have a *shared* PASID table, because it's
+ * large and has to be physically contiguous. So it's
+ * hard to be as defensive as we might like. */
+ intel_pasid_tear_down_entry(iommu, dev,
+ svm->pasid, false);
+ intel_svm_drain_prq(dev, svm->pasid);
+ kfree_rcu(sdev, rcu);
+
+ if (list_empty(&svm->devs)) {
+ ioasid_free(svm->pasid);
+ if (svm->mm)
+ mmu_notifier_unregister(&svm->notifier, svm->mm);
+ list_del(&svm->list);
+ /* We mandate that no page faults may be outstanding
+ * for the PASID when intel_svm_unbind_mm() is called.
+ * If that is not obeyed, subtle errors will happen.
+ * Let's make them less subtle... */
+ memset(svm, 0x6b, sizeof(*svm));
+ kfree(svm);
+ }
+ }
+ break;
+ }
+ out:
+
+ return ret;
+ }
+
+ /* Page request queue descriptor */
+ struct page_req_dsc {
+ union {
+ struct {
+ u64 type:8;
+ u64 pasid_present:1;
+ u64 priv_data_present:1;
+ u64 rsvd:6;
+ u64 rid:16;
+ u64 pasid:20;
+ u64 exe_req:1;
+ u64 pm_req:1;
+ u64 rsvd2:10;
+ };
+ u64 qw_0;
+ };
+ union {
+ struct {
+ u64 rd_req:1;
+ u64 wr_req:1;
+ u64 lpig:1;
+ u64 prg_index:9;
+ u64 addr:52;
+ };
+ u64 qw_1;
+ };
+ u64 priv_data[2];
+ };
+
+ #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
+
+ static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
+ {
+ unsigned long requested = 0;
+
+ if (req->exe_req)
+ requested |= VM_EXEC;
+
+ if (req->rd_req)
+ requested |= VM_READ;
+
+ if (req->wr_req)
+ requested |= VM_WRITE;
+
+ return (requested & ~vma->vm_flags) != 0;
+ }
+
+ static bool is_canonical_address(u64 addr)
+ {
+ int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+ long saddr = (long) addr;
+
+ return (((saddr << shift) >> shift) == saddr);
+ }
+
+ /**
+ * intel_svm_drain_prq - Drain page requests and responses for a pasid
+ * @dev: target device
+ * @pasid: pasid for draining
+ *
+ * Drain all pending page requests and responses related to @pasid in both
+ * software and hardware. This is supposed to be called after the device
+ * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
+ * and DevTLB have been invalidated.
+ *
+ * It waits until all pending page requests for @pasid in the page fault
+ * queue are completed by the prq handling thread. Then follow the steps
+ * described in VT-d spec CH7.10 to drain all page requests and page
+ * responses pending in the hardware.
+ */
+ static void intel_svm_drain_prq(struct device *dev, int pasid)
+ {
+ struct device_domain_info *info;
+ struct dmar_domain *domain;
+ struct intel_iommu *iommu;
+ struct qi_desc desc[3];
+ struct pci_dev *pdev;
+ int head, tail;
+ u16 sid, did;
+ int qdep;
+
+ info = get_domain_info(dev);
+ if (WARN_ON(!info || !dev_is_pci(dev)))
+ return;
+
+ if (!info->pri_enabled)
+ return;
+
+ iommu = info->iommu;
+ domain = info->domain;
+ pdev = to_pci_dev(dev);
+ sid = PCI_DEVID(info->bus, info->devfn);
+ did = domain->iommu_did[iommu->seq_id];
+ qdep = pci_ats_queue_depth(pdev);
+
+ /*
+ * Check and wait until all pending page requests in the queue are
+ * handled by the prq handling thread.
+ */
+ prq_retry:
+ reinit_completion(&iommu->prq_complete);
+ tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+ head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+ while (head != tail) {
+ struct page_req_dsc *req;
+
+ req = &iommu->prq[head / sizeof(*req)];
+ if (!req->pasid_present || req->pasid != pasid) {
+ head = (head + sizeof(*req)) & PRQ_RING_MASK;
+ continue;
+ }
+
+ wait_for_completion(&iommu->prq_complete);
+ goto prq_retry;
+ }
+
+ /*
+ * Perform steps described in VT-d spec CH7.10 to drain page
+ * requests and responses in hardware.
+ */
+ memset(desc, 0, sizeof(desc));
+ desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
+ QI_IWD_FENCE |
+ QI_IWD_TYPE;
+ desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
+ QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
+ QI_EIOTLB_TYPE;
+ desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
+ QI_DEV_EIOTLB_SID(sid) |
+ QI_DEV_EIOTLB_QDEP(qdep) |
+ QI_DEIOTLB_TYPE |
+ QI_DEV_IOTLB_PFSID(info->pfsid);
+ qi_retry:
+ reinit_completion(&iommu->prq_complete);
+ qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
+ if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
+ wait_for_completion(&iommu->prq_complete);
+ goto qi_retry;
+ }
+ }
+
+ static irqreturn_t prq_event_thread(int irq, void *d)
+ {
+ struct intel_iommu *iommu = d;
+ struct intel_svm *svm = NULL;
+ int head, tail, handled = 0;
+
+ /* Clear PPR bit before reading head/tail registers, to
+ * ensure that we get a new interrupt if needed. */
+ writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
+
+ tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+ head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+ while (head != tail) {
+ struct intel_svm_dev *sdev;
+ struct vm_area_struct *vma;
+ struct page_req_dsc *req;
+ struct qi_desc resp;
+ int result;
+ vm_fault_t ret;
+ u64 address;
+
+ handled = 1;
+
+ req = &iommu->prq[head / sizeof(*req)];
+
+ result = QI_RESP_FAILURE;
+ address = (u64)req->addr << VTD_PAGE_SHIFT;
+ if (!req->pasid_present) {
+ pr_err("%s: Page request without PASID: %08llx %08llx\n",
+ iommu->name, ((unsigned long long *)req)[0],
+ ((unsigned long long *)req)[1]);
+ goto no_pasid;
+ }
+
+ if (!svm || svm->pasid != req->pasid) {
+ rcu_read_lock();
+ svm = ioasid_find(NULL, req->pasid, NULL);
+ /* It *can't* go away, because the driver is not permitted
+ * to unbind the mm while any page faults are outstanding.
+ * So we only need RCU to protect the internal idr code. */
+ rcu_read_unlock();
+ if (IS_ERR_OR_NULL(svm)) {
+ pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
+ iommu->name, req->pasid, ((unsigned long long *)req)[0],
+ ((unsigned long long *)req)[1]);
+ goto no_pasid;
+ }
+ }
+
+ result = QI_RESP_INVALID;
+ /* Since we're using init_mm.pgd directly, we should never take
+ * any faults on kernel addresses. */
+ if (!svm->mm)
+ goto bad_req;
+
+ /* If address is not canonical, return invalid response */
+ if (!is_canonical_address(address))
+ goto bad_req;
+
+ /* If the mm is already defunct, don't handle faults. */
+ if (!mmget_not_zero(svm->mm))
+ goto bad_req;
+
- up_read(&svm->mm->mmap_sem);
++ mmap_read_lock(svm->mm);
+ vma = find_extend_vma(svm->mm, address);
+ if (!vma || address < vma->vm_start)
+ goto invalid;
+
+ if (access_error(vma, req))
+ goto invalid;
+
+ ret = handle_mm_fault(vma, address,
+ req->wr_req ? FAULT_FLAG_WRITE : 0);
+ if (ret & VM_FAULT_ERROR)
+ goto invalid;
+
+ result = QI_RESP_SUCCESS;
+ invalid:
++ mmap_read_unlock(svm->mm);
+ mmput(svm->mm);
+ bad_req:
+ /* Accounting for major/minor faults? */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdev, &svm->devs, list) {
+ if (sdev->sid == req->rid)
+ break;
+ }
+ /* Other devices can go away, but the drivers are not permitted
+ * to unbind while any page faults might be in flight. So it's
+ * OK to drop the 'lock' here now we have it. */
+ rcu_read_unlock();
+
+ if (WARN_ON(&sdev->list == &svm->devs))
+ sdev = NULL;
+
+ if (sdev && sdev->ops && sdev->ops->fault_cb) {
+ int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
+ (req->exe_req << 1) | (req->pm_req);
+ sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
+ req->priv_data, rwxp, result);
+ }
+ /* We get here in the error case where the PASID lookup failed,
+ and these can be NULL. Do not use them below this point! */
+ sdev = NULL;
+ svm = NULL;
+ no_pasid:
+ if (req->lpig || req->priv_data_present) {
+ /*
+ * Per VT-d spec. v3.0 ch7.7, system software must
+ * respond with page group response if private data
+ * is present (PDP) or last page in group (LPIG) bit
+ * is set. This is an additional VT-d feature beyond
+ * PCI ATS spec.
+ */
+ resp.qw0 = QI_PGRP_PASID(req->pasid) |
+ QI_PGRP_DID(req->rid) |
+ QI_PGRP_PASID_P(req->pasid_present) |
+ QI_PGRP_PDP(req->pasid_present) |
+ QI_PGRP_RESP_CODE(result) |
+ QI_PGRP_RESP_TYPE;
+ resp.qw1 = QI_PGRP_IDX(req->prg_index) |
+ QI_PGRP_LPIG(req->lpig);
+
+ if (req->priv_data_present)
+ memcpy(&resp.qw2, req->priv_data,
+ sizeof(req->priv_data));
+ resp.qw2 = 0;
+ resp.qw3 = 0;
+ qi_submit_sync(iommu, &resp, 1, 0);
+ }
+ head = (head + sizeof(*req)) & PRQ_RING_MASK;
+ }
+
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
+
+ /*
+ * Clear the page request overflow bit and wake up all threads that
+ * are waiting for the completion of this handling.
+ */
+ if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
+ writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
+
+ if (!completion_done(&iommu->prq_complete))
+ complete(&iommu->prq_complete);
+
+ return IRQ_RETVAL(handled);
+ }
+
+ #define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
+ struct iommu_sva *
+ intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+ {
+ struct iommu_sva *sva = ERR_PTR(-EINVAL);
+ struct intel_svm_dev *sdev = NULL;
+ int flags = 0;
+ int ret;
+
+ /*
+ * TODO: Consolidate with generic iommu-sva bind after it is merged.
+ * It will require shared SVM data structures, i.e. combine io_mm
+ * and intel_svm etc.
+ */
+ if (drvdata)
+ flags = *(int *)drvdata;
+ mutex_lock(&pasid_mutex);
+ ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
+ if (ret)
+ sva = ERR_PTR(ret);
+ else if (sdev)
+ sva = &sdev->sva;
+ else
+ WARN(!sdev, "SVM bind succeeded with no sdev!\n");
+
+ mutex_unlock(&pasid_mutex);
+
+ return sva;
+ }
+
+ void intel_svm_unbind(struct iommu_sva *sva)
+ {
+ struct intel_svm_dev *sdev;
+
+ mutex_lock(&pasid_mutex);
+ sdev = to_intel_svm_dev(sva);
+ intel_svm_unbind_mm(sdev->dev, sdev->pasid);
+ mutex_unlock(&pasid_mutex);
+ }
+
+ int intel_svm_get_pasid(struct iommu_sva *sva)
+ {
+ struct intel_svm_dev *sdev;
+ int pasid;
+
+ mutex_lock(&pasid_mutex);
+ sdev = to_intel_svm_dev(sva);
+ pasid = sdev->pasid;
+ mutex_unlock(&pasid_mutex);
+
+ return pasid;
+ }