L: iommu@lists.linux-foundation.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git
-F: drivers/iommu/dmar.c
-F: drivers/iommu/intel*.[ch]
+F: drivers/iommu/intel/
F: include/linux/intel-iommu.h
F: include/linux/intel-svm.h
obj-$(CONFIG_ARM_SMMU) += arm_smmu.o
arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-qcom.o
obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
-obj-$(CONFIG_DMAR_TABLE) += dmar.o
-obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o
-obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o
-obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel-iommu-debugfs.o
-obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o
+obj-$(CONFIG_DMAR_TABLE) += intel/dmar.o
+obj-$(CONFIG_INTEL_IOMMU) += intel/iommu.o intel/pasid.o
+obj-$(CONFIG_INTEL_IOMMU) += intel/trace.o
+obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel/debugfs.o
+obj-$(CONFIG_INTEL_IOMMU_SVM) += intel/svm.o
obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
-obj-$(CONFIG_IRQ_REMAP) += intel_irq_remapping.o irq_remapping.o
+obj-$(CONFIG_IRQ_REMAP) += intel/irq_remapping.o irq_remapping.o
obj-$(CONFIG_MTK_IOMMU) += mtk_iommu.o
obj-$(CONFIG_MTK_IOMMU_V1) += mtk_iommu_v1.o
obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Author: Ashok Raj <ashok.raj@intel.com>
- * Author: Shaohua Li <shaohua.li@intel.com>
- * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
- *
- * This file implements early detection/parsing of Remapping Devices
- * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI
- * tables.
- *
- * These routines are used by both DMA-remapping and Interrupt-remapping
- */
-
-#define pr_fmt(fmt) "DMAR: " fmt
-
-#include <linux/pci.h>
-#include <linux/dmar.h>
-#include <linux/iova.h>
-#include <linux/intel-iommu.h>
-#include <linux/timer.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/tboot.h>
-#include <linux/dmi.h>
-#include <linux/slab.h>
-#include <linux/iommu.h>
-#include <linux/numa.h>
-#include <linux/limits.h>
-#include <asm/irq_remapping.h>
-#include <asm/iommu_table.h>
-
-#include "irq_remapping.h"
-
-typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *);
-struct dmar_res_callback {
- dmar_res_handler_t cb[ACPI_DMAR_TYPE_RESERVED];
- void *arg[ACPI_DMAR_TYPE_RESERVED];
- bool ignore_unhandled;
- bool print_entry;
-};
-
-/*
- * Assumptions:
- * 1) The hotplug framework guarantees that a DMAR unit will be hot-added
- *    before the I/O devices managed by that unit.
- * 2) The hotplug framework guarantees that a DMAR unit will be hot-removed
- *    after the I/O devices managed by that unit.
- * 3) Hotplug events are rare.
- *
- * Locking rules for DMA and interrupt remapping related global data structures:
- * 1) Use dmar_global_lock in process context
- * 2) Use RCU in interrupt context
- */
-DECLARE_RWSEM(dmar_global_lock);
-LIST_HEAD(dmar_drhd_units);
-
-struct acpi_table_header * __initdata dmar_tbl;
-static int dmar_dev_scope_status = 1;
-static unsigned long dmar_seq_ids[BITS_TO_LONGS(DMAR_UNITS_SUPPORTED)];
-
-static int alloc_iommu(struct dmar_drhd_unit *drhd);
-static void free_iommu(struct intel_iommu *iommu);
-
-extern const struct iommu_ops intel_iommu_ops;
-
-static void dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
-{
- /*
-	 * Add the INCLUDE_ALL unit at the tail, so that scanning the list
-	 * finds it at the very end.
- */
- if (drhd->include_all)
- list_add_tail_rcu(&drhd->list, &dmar_drhd_units);
- else
- list_add_rcu(&drhd->list, &dmar_drhd_units);
-}
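
A minimal sketch of the locking rules documented above, for reference alongside
dmar_register_drhd_unit(): updaters hold dmar_global_lock for writing, while
lookups in atomic context walk dmar_drhd_units under RCU. The helper name
example_find_drhd_by_segment() is hypothetical and not part of this patch.

static struct dmar_drhd_unit *example_find_drhd_by_segment(u16 segment)
{
	struct dmar_drhd_unit *dmaru, *found = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(dmaru, &dmar_drhd_units, list)
		if (dmaru->segment == segment) {
			found = dmaru;
			break;
		}
	rcu_read_unlock();

	return found;
}
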
-
-void *dmar_alloc_dev_scope(void *start, void *end, int *cnt)
-{
- struct acpi_dmar_device_scope *scope;
-
- *cnt = 0;
- while (start < end) {
- scope = start;
- if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_NAMESPACE ||
- scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
- scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE)
- (*cnt)++;
- else if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_IOAPIC &&
- scope->entry_type != ACPI_DMAR_SCOPE_TYPE_HPET) {
- pr_warn("Unsupported device scope\n");
- }
- start += scope->length;
- }
- if (*cnt == 0)
- return NULL;
-
- return kcalloc(*cnt, sizeof(struct dmar_dev_scope), GFP_KERNEL);
-}
-
-void dmar_free_dev_scope(struct dmar_dev_scope **devices, int *cnt)
-{
- int i;
- struct device *tmp_dev;
-
- if (*devices && *cnt) {
- for_each_active_dev_scope(*devices, *cnt, i, tmp_dev)
- put_device(tmp_dev);
- kfree(*devices);
- }
-
- *devices = NULL;
- *cnt = 0;
-}
-
-/* Optimize out kzalloc()/kfree() for normal cases */
-static char dmar_pci_notify_info_buf[64];
-
-static struct dmar_pci_notify_info *
-dmar_alloc_pci_notify_info(struct pci_dev *dev, unsigned long event)
-{
- int level = 0;
- size_t size;
- struct pci_dev *tmp;
- struct dmar_pci_notify_info *info;
-
- BUG_ON(dev->is_virtfn);
-
- /*
- * Ignore devices that have a domain number higher than what can
- * be looked up in DMAR, e.g. VMD subdevices with domain 0x10000
- */
- if (pci_domain_nr(dev->bus) > U16_MAX)
- return NULL;
-
- /* Only generate path[] for device addition event */
- if (event == BUS_NOTIFY_ADD_DEVICE)
- for (tmp = dev; tmp; tmp = tmp->bus->self)
- level++;
-
- size = struct_size(info, path, level);
- if (size <= sizeof(dmar_pci_notify_info_buf)) {
- info = (struct dmar_pci_notify_info *)dmar_pci_notify_info_buf;
- } else {
- info = kzalloc(size, GFP_KERNEL);
- if (!info) {
- pr_warn("Out of memory when allocating notify_info "
- "for %s.\n", pci_name(dev));
- if (dmar_dev_scope_status == 0)
- dmar_dev_scope_status = -ENOMEM;
- return NULL;
- }
- }
-
- info->event = event;
- info->dev = dev;
- info->seg = pci_domain_nr(dev->bus);
- info->level = level;
- if (event == BUS_NOTIFY_ADD_DEVICE) {
- for (tmp = dev; tmp; tmp = tmp->bus->self) {
- level--;
- info->path[level].bus = tmp->bus->number;
- info->path[level].device = PCI_SLOT(tmp->devfn);
- info->path[level].function = PCI_FUNC(tmp->devfn);
- if (pci_is_root_bus(tmp->bus))
- info->bus = tmp->bus->number;
- }
- }
-
- return info;
-}
-
-static inline void dmar_free_pci_notify_info(struct dmar_pci_notify_info *info)
-{
- if ((void *)info != dmar_pci_notify_info_buf)
- kfree(info);
-}
-
-static bool dmar_match_pci_path(struct dmar_pci_notify_info *info, int bus,
- struct acpi_dmar_pci_path *path, int count)
-{
- int i;
-
- if (info->bus != bus)
- goto fallback;
- if (info->level != count)
- goto fallback;
-
- for (i = 0; i < count; i++) {
- if (path[i].device != info->path[i].device ||
- path[i].function != info->path[i].function)
- goto fallback;
- }
-
- return true;
-
-fallback:
-
- if (count != 1)
- return false;
-
- i = info->level - 1;
- if (bus == info->path[i].bus &&
- path[0].device == info->path[i].device &&
- path[0].function == info->path[i].function) {
- pr_info(FW_BUG "RMRR entry for device %02x:%02x.%x is broken - applying workaround\n",
- bus, path[0].device, path[0].function);
- return true;
- }
-
- return false;
-}
-
-/* Return: > 0 if match found, 0 if no match found, < 0 if error happens */
-int dmar_insert_dev_scope(struct dmar_pci_notify_info *info,
- void *start, void*end, u16 segment,
- struct dmar_dev_scope *devices,
- int devices_cnt)
-{
- int i, level;
- struct device *tmp, *dev = &info->dev->dev;
- struct acpi_dmar_device_scope *scope;
- struct acpi_dmar_pci_path *path;
-
- if (segment != info->seg)
- return 0;
-
- for (; start < end; start += scope->length) {
- scope = start;
- if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_ENDPOINT &&
- scope->entry_type != ACPI_DMAR_SCOPE_TYPE_BRIDGE)
- continue;
-
- path = (struct acpi_dmar_pci_path *)(scope + 1);
- level = (scope->length - sizeof(*scope)) / sizeof(*path);
- if (!dmar_match_pci_path(info, scope->bus, path, level))
- continue;
-
- /*
- * We expect devices with endpoint scope to have normal PCI
- * headers, and devices with bridge scope to have bridge PCI
- * headers. However PCI NTB devices may be listed in the
- * DMAR table with bridge scope, even though they have a
- * normal PCI header. NTB devices are identified by class
- * "BRIDGE_OTHER" (0680h) - we don't declare a socpe mismatch
- * for this special case.
- */
- if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT &&
- info->dev->hdr_type != PCI_HEADER_TYPE_NORMAL) ||
- (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE &&
- (info->dev->hdr_type == PCI_HEADER_TYPE_NORMAL &&
- info->dev->class >> 16 != PCI_BASE_CLASS_BRIDGE))) {
- pr_warn("Device scope type does not match for %s\n",
- pci_name(info->dev));
- return -EINVAL;
- }
-
- for_each_dev_scope(devices, devices_cnt, i, tmp)
- if (tmp == NULL) {
- devices[i].bus = info->dev->bus->number;
- devices[i].devfn = info->dev->devfn;
- rcu_assign_pointer(devices[i].dev,
- get_device(dev));
- return 1;
- }
- BUG_ON(i >= devices_cnt);
- }
-
- return 0;
-}
-
-int dmar_remove_dev_scope(struct dmar_pci_notify_info *info, u16 segment,
- struct dmar_dev_scope *devices, int count)
-{
- int index;
- struct device *tmp;
-
- if (info->seg != segment)
- return 0;
-
- for_each_active_dev_scope(devices, count, index, tmp)
- if (tmp == &info->dev->dev) {
- RCU_INIT_POINTER(devices[index].dev, NULL);
- synchronize_rcu();
- put_device(tmp);
- return 1;
- }
-
- return 0;
-}
-
-static int dmar_pci_bus_add_dev(struct dmar_pci_notify_info *info)
-{
- int ret = 0;
- struct dmar_drhd_unit *dmaru;
- struct acpi_dmar_hardware_unit *drhd;
-
- for_each_drhd_unit(dmaru) {
- if (dmaru->include_all)
- continue;
-
- drhd = container_of(dmaru->hdr,
- struct acpi_dmar_hardware_unit, header);
- ret = dmar_insert_dev_scope(info, (void *)(drhd + 1),
- ((void *)drhd) + drhd->header.length,
- dmaru->segment,
- dmaru->devices, dmaru->devices_cnt);
- if (ret)
- break;
- }
- if (ret >= 0)
- ret = dmar_iommu_notify_scope_dev(info);
- if (ret < 0 && dmar_dev_scope_status == 0)
- dmar_dev_scope_status = ret;
-
- return ret;
-}
-
-static void dmar_pci_bus_del_dev(struct dmar_pci_notify_info *info)
-{
- struct dmar_drhd_unit *dmaru;
-
- for_each_drhd_unit(dmaru)
- if (dmar_remove_dev_scope(info, dmaru->segment,
- dmaru->devices, dmaru->devices_cnt))
- break;
- dmar_iommu_notify_scope_dev(info);
-}
-
-static int dmar_pci_bus_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct pci_dev *pdev = to_pci_dev(data);
- struct dmar_pci_notify_info *info;
-
- /* Only care about add/remove events for physical functions.
- * For VFs we actually do the lookup based on the corresponding
- * PF in device_to_iommu() anyway. */
- if (pdev->is_virtfn)
- return NOTIFY_DONE;
- if (action != BUS_NOTIFY_ADD_DEVICE &&
- action != BUS_NOTIFY_REMOVED_DEVICE)
- return NOTIFY_DONE;
-
- info = dmar_alloc_pci_notify_info(pdev, action);
- if (!info)
- return NOTIFY_DONE;
-
- down_write(&dmar_global_lock);
- if (action == BUS_NOTIFY_ADD_DEVICE)
- dmar_pci_bus_add_dev(info);
- else if (action == BUS_NOTIFY_REMOVED_DEVICE)
- dmar_pci_bus_del_dev(info);
- up_write(&dmar_global_lock);
-
- dmar_free_pci_notify_info(info);
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block dmar_pci_bus_nb = {
- .notifier_call = dmar_pci_bus_notifier,
- .priority = INT_MIN,
-};
-
-static struct dmar_drhd_unit *
-dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd)
-{
- struct dmar_drhd_unit *dmaru;
-
- list_for_each_entry_rcu(dmaru, &dmar_drhd_units, list,
- dmar_rcu_check())
- if (dmaru->segment == drhd->segment &&
- dmaru->reg_base_addr == drhd->address)
- return dmaru;
-
- return NULL;
-}
-
-/**
- * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition
- * structure which uniquely represents one DMA remapping hardware unit
- * present in the platform
- */
-static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
-{
- struct acpi_dmar_hardware_unit *drhd;
- struct dmar_drhd_unit *dmaru;
- int ret;
-
- drhd = (struct acpi_dmar_hardware_unit *)header;
- dmaru = dmar_find_dmaru(drhd);
- if (dmaru)
- goto out;
-
- dmaru = kzalloc(sizeof(*dmaru) + header->length, GFP_KERNEL);
- if (!dmaru)
- return -ENOMEM;
-
- /*
- * If header is allocated from slab by ACPI _DSM method, we need to
- * copy the content because the memory buffer will be freed on return.
- */
- dmaru->hdr = (void *)(dmaru + 1);
- memcpy(dmaru->hdr, header, header->length);
- dmaru->reg_base_addr = drhd->address;
- dmaru->segment = drhd->segment;
- dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
- dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1),
- ((void *)drhd) + drhd->header.length,
- &dmaru->devices_cnt);
- if (dmaru->devices_cnt && dmaru->devices == NULL) {
- kfree(dmaru);
- return -ENOMEM;
- }
-
- ret = alloc_iommu(dmaru);
- if (ret) {
- dmar_free_dev_scope(&dmaru->devices,
- &dmaru->devices_cnt);
- kfree(dmaru);
- return ret;
- }
- dmar_register_drhd_unit(dmaru);
-
-out:
- if (arg)
- (*(int *)arg)++;
-
- return 0;
-}
-
-static void dmar_free_drhd(struct dmar_drhd_unit *dmaru)
-{
- if (dmaru->devices && dmaru->devices_cnt)
- dmar_free_dev_scope(&dmaru->devices, &dmaru->devices_cnt);
- if (dmaru->iommu)
- free_iommu(dmaru->iommu);
- kfree(dmaru);
-}
-
-static int __init dmar_parse_one_andd(struct acpi_dmar_header *header,
- void *arg)
-{
- struct acpi_dmar_andd *andd = (void *)header;
-
-	/*
-	 * Check for NUL termination within the designated length; the
-	 * device_name string starts after the 8 bytes of fixed ANDD fields.
-	 */
- if (strnlen(andd->device_name, header->length - 8) == header->length - 8) {
- pr_warn(FW_BUG
- "Your BIOS is broken; ANDD object name is not NUL-terminated\n"
- "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
- dmi_get_system_info(DMI_BIOS_VENDOR),
- dmi_get_system_info(DMI_BIOS_VERSION),
- dmi_get_system_info(DMI_PRODUCT_VERSION));
- add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
- return -EINVAL;
- }
- pr_info("ANDD device: %x name: %s\n", andd->device_number,
- andd->device_name);
-
- return 0;
-}
-
-#ifdef CONFIG_ACPI_NUMA
-static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg)
-{
- struct acpi_dmar_rhsa *rhsa;
- struct dmar_drhd_unit *drhd;
-
- rhsa = (struct acpi_dmar_rhsa *)header;
- for_each_drhd_unit(drhd) {
- if (drhd->reg_base_addr == rhsa->base_address) {
- int node = acpi_map_pxm_to_node(rhsa->proximity_domain);
-
- if (!node_online(node))
- node = NUMA_NO_NODE;
- drhd->iommu->node = node;
- return 0;
- }
- }
- pr_warn(FW_BUG
- "Your BIOS is broken; RHSA refers to non-existent DMAR unit at %llx\n"
- "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
- rhsa->base_address,
- dmi_get_system_info(DMI_BIOS_VENDOR),
- dmi_get_system_info(DMI_BIOS_VERSION),
- dmi_get_system_info(DMI_PRODUCT_VERSION));
- add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
-
- return 0;
-}
-#else
-#define dmar_parse_one_rhsa dmar_res_noop
-#endif
-
-static void
-dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
-{
- struct acpi_dmar_hardware_unit *drhd;
- struct acpi_dmar_reserved_memory *rmrr;
- struct acpi_dmar_atsr *atsr;
- struct acpi_dmar_rhsa *rhsa;
-
- switch (header->type) {
- case ACPI_DMAR_TYPE_HARDWARE_UNIT:
- drhd = container_of(header, struct acpi_dmar_hardware_unit,
- header);
- pr_info("DRHD base: %#016Lx flags: %#x\n",
- (unsigned long long)drhd->address, drhd->flags);
- break;
- case ACPI_DMAR_TYPE_RESERVED_MEMORY:
- rmrr = container_of(header, struct acpi_dmar_reserved_memory,
- header);
- pr_info("RMRR base: %#016Lx end: %#016Lx\n",
- (unsigned long long)rmrr->base_address,
- (unsigned long long)rmrr->end_address);
- break;
- case ACPI_DMAR_TYPE_ROOT_ATS:
- atsr = container_of(header, struct acpi_dmar_atsr, header);
- pr_info("ATSR flags: %#x\n", atsr->flags);
- break;
- case ACPI_DMAR_TYPE_HARDWARE_AFFINITY:
- rhsa = container_of(header, struct acpi_dmar_rhsa, header);
- pr_info("RHSA base: %#016Lx proximity domain: %#x\n",
- (unsigned long long)rhsa->base_address,
- rhsa->proximity_domain);
- break;
- case ACPI_DMAR_TYPE_NAMESPACE:
- /* We don't print this here because we need to sanity-check
- it first. So print it in dmar_parse_one_andd() instead. */
- break;
- }
-}
-
-/**
- * dmar_table_detect - checks to see if the platform supports DMAR devices
- */
-static int __init dmar_table_detect(void)
-{
- acpi_status status = AE_OK;
-
-	/* if we can find the DMAR table, then there are DMAR devices */
- status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
-
- if (ACPI_SUCCESS(status) && !dmar_tbl) {
- pr_warn("Unable to map DMAR\n");
- status = AE_NOT_FOUND;
- }
-
- return ACPI_SUCCESS(status) ? 0 : -ENOENT;
-}
-
-static int dmar_walk_remapping_entries(struct acpi_dmar_header *start,
- size_t len, struct dmar_res_callback *cb)
-{
- struct acpi_dmar_header *iter, *next;
- struct acpi_dmar_header *end = ((void *)start) + len;
-
- for (iter = start; iter < end; iter = next) {
- next = (void *)iter + iter->length;
- if (iter->length == 0) {
- /* Avoid looping forever on bad ACPI tables */
- pr_debug(FW_BUG "Invalid 0-length structure\n");
- break;
- } else if (next > end) {
- /* Avoid passing table end */
- pr_warn(FW_BUG "Record passes table end\n");
- return -EINVAL;
- }
-
- if (cb->print_entry)
- dmar_table_print_dmar_entry(iter);
-
- if (iter->type >= ACPI_DMAR_TYPE_RESERVED) {
- /* continue for forward compatibility */
- pr_debug("Unknown DMAR structure type %d\n",
- iter->type);
- } else if (cb->cb[iter->type]) {
- int ret;
-
- ret = cb->cb[iter->type](iter, cb->arg[iter->type]);
- if (ret)
- return ret;
- } else if (!cb->ignore_unhandled) {
- pr_warn("No handler for DMAR structure type %d\n",
- iter->type);
- return -EINVAL;
- }
- }
-
- return 0;
-}
-
-static inline int dmar_walk_dmar_table(struct acpi_table_dmar *dmar,
- struct dmar_res_callback *cb)
-{
- return dmar_walk_remapping_entries((void *)(dmar + 1),
- dmar->header.length - sizeof(*dmar), cb);
-}
-
-/**
- * parse_dmar_table - parses the DMA remapping reporting (DMAR) table
- */
-static int __init
-parse_dmar_table(void)
-{
- struct acpi_table_dmar *dmar;
- int drhd_count = 0;
- int ret;
- struct dmar_res_callback cb = {
- .print_entry = true,
- .ignore_unhandled = true,
- .arg[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &drhd_count,
- .cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_parse_one_drhd,
- .cb[ACPI_DMAR_TYPE_RESERVED_MEMORY] = &dmar_parse_one_rmrr,
- .cb[ACPI_DMAR_TYPE_ROOT_ATS] = &dmar_parse_one_atsr,
- .cb[ACPI_DMAR_TYPE_HARDWARE_AFFINITY] = &dmar_parse_one_rhsa,
- .cb[ACPI_DMAR_TYPE_NAMESPACE] = &dmar_parse_one_andd,
- };
-
- /*
-	 * Do it again; the earlier dmar_tbl mapping could have been done
-	 * with a fixed map.
- */
- dmar_table_detect();
-
- /*
-	 * ACPI tables may not be DMA protected by tboot, so use the DMAR
-	 * copy that SINIT saved in SinitMleData in the TXT heap (which is
-	 * DMA protected)
- */
- dmar_tbl = tboot_get_dmar_table(dmar_tbl);
-
- dmar = (struct acpi_table_dmar *)dmar_tbl;
- if (!dmar)
- return -ENODEV;
-
- if (dmar->width < PAGE_SHIFT - 1) {
- pr_warn("Invalid DMAR haw\n");
- return -EINVAL;
- }
-
- pr_info("Host address width %d\n", dmar->width + 1);
- ret = dmar_walk_dmar_table(dmar, &cb);
- if (ret == 0 && drhd_count == 0)
- pr_warn(FW_BUG "No DRHD structure found in DMAR table\n");
-
- return ret;
-}
-
-static int dmar_pci_device_match(struct dmar_dev_scope devices[],
- int cnt, struct pci_dev *dev)
-{
- int index;
- struct device *tmp;
-
- while (dev) {
- for_each_active_dev_scope(devices, cnt, index, tmp)
- if (dev_is_pci(tmp) && dev == to_pci_dev(tmp))
- return 1;
-
- /* Check our parent */
- dev = dev->bus->self;
- }
-
- return 0;
-}
-
-struct dmar_drhd_unit *
-dmar_find_matched_drhd_unit(struct pci_dev *dev)
-{
- struct dmar_drhd_unit *dmaru;
- struct acpi_dmar_hardware_unit *drhd;
-
- dev = pci_physfn(dev);
-
- rcu_read_lock();
- for_each_drhd_unit(dmaru) {
- drhd = container_of(dmaru->hdr,
- struct acpi_dmar_hardware_unit,
- header);
-
- if (dmaru->include_all &&
- drhd->segment == pci_domain_nr(dev->bus))
- goto out;
-
- if (dmar_pci_device_match(dmaru->devices,
- dmaru->devices_cnt, dev))
- goto out;
- }
- dmaru = NULL;
-out:
- rcu_read_unlock();
-
- return dmaru;
-}
-
-static void __init dmar_acpi_insert_dev_scope(u8 device_number,
- struct acpi_device *adev)
-{
- struct dmar_drhd_unit *dmaru;
- struct acpi_dmar_hardware_unit *drhd;
- struct acpi_dmar_device_scope *scope;
- struct device *tmp;
- int i;
- struct acpi_dmar_pci_path *path;
-
- for_each_drhd_unit(dmaru) {
- drhd = container_of(dmaru->hdr,
- struct acpi_dmar_hardware_unit,
- header);
-
- for (scope = (void *)(drhd + 1);
- (unsigned long)scope < ((unsigned long)drhd) + drhd->header.length;
- scope = ((void *)scope) + scope->length) {
- if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_NAMESPACE)
- continue;
- if (scope->enumeration_id != device_number)
- continue;
-
- path = (void *)(scope + 1);
- pr_info("ACPI device \"%s\" under DMAR at %llx as %02x:%02x.%d\n",
- dev_name(&adev->dev), dmaru->reg_base_addr,
- scope->bus, path->device, path->function);
- for_each_dev_scope(dmaru->devices, dmaru->devices_cnt, i, tmp)
- if (tmp == NULL) {
- dmaru->devices[i].bus = scope->bus;
- dmaru->devices[i].devfn = PCI_DEVFN(path->device,
- path->function);
- rcu_assign_pointer(dmaru->devices[i].dev,
- get_device(&adev->dev));
- return;
- }
- BUG_ON(i >= dmaru->devices_cnt);
- }
- }
- pr_warn("No IOMMU scope found for ANDD enumeration ID %d (%s)\n",
- device_number, dev_name(&adev->dev));
-}
-
-static int __init dmar_acpi_dev_scope_init(void)
-{
- struct acpi_dmar_andd *andd;
-
- if (dmar_tbl == NULL)
- return -ENODEV;
-
- for (andd = (void *)dmar_tbl + sizeof(struct acpi_table_dmar);
- ((unsigned long)andd) < ((unsigned long)dmar_tbl) + dmar_tbl->length;
- andd = ((void *)andd) + andd->header.length) {
- if (andd->header.type == ACPI_DMAR_TYPE_NAMESPACE) {
- acpi_handle h;
- struct acpi_device *adev;
-
- if (!ACPI_SUCCESS(acpi_get_handle(ACPI_ROOT_OBJECT,
- andd->device_name,
- &h))) {
- pr_err("Failed to find handle for ACPI object %s\n",
- andd->device_name);
- continue;
- }
- if (acpi_bus_get_device(h, &adev)) {
- pr_err("Failed to get device for ACPI object %s\n",
- andd->device_name);
- continue;
- }
- dmar_acpi_insert_dev_scope(andd->device_number, adev);
- }
- }
- return 0;
-}
-
-int __init dmar_dev_scope_init(void)
-{
- struct pci_dev *dev = NULL;
- struct dmar_pci_notify_info *info;
-
- if (dmar_dev_scope_status != 1)
- return dmar_dev_scope_status;
-
- if (list_empty(&dmar_drhd_units)) {
- dmar_dev_scope_status = -ENODEV;
- } else {
- dmar_dev_scope_status = 0;
-
- dmar_acpi_dev_scope_init();
-
- for_each_pci_dev(dev) {
- if (dev->is_virtfn)
- continue;
-
- info = dmar_alloc_pci_notify_info(dev,
- BUS_NOTIFY_ADD_DEVICE);
- if (!info) {
- return dmar_dev_scope_status;
- } else {
- dmar_pci_bus_add_dev(info);
- dmar_free_pci_notify_info(info);
- }
- }
- }
-
- return dmar_dev_scope_status;
-}
-
-void __init dmar_register_bus_notifier(void)
-{
- bus_register_notifier(&pci_bus_type, &dmar_pci_bus_nb);
-}
-
-
-int __init dmar_table_init(void)
-{
- static int dmar_table_initialized;
- int ret;
-
- if (dmar_table_initialized == 0) {
- ret = parse_dmar_table();
- if (ret < 0) {
- if (ret != -ENODEV)
- pr_info("Parse DMAR table failure.\n");
- } else if (list_empty(&dmar_drhd_units)) {
- pr_info("No DMAR devices found\n");
- ret = -ENODEV;
- }
-
- if (ret < 0)
- dmar_table_initialized = ret;
- else
- dmar_table_initialized = 1;
- }
-
- return dmar_table_initialized < 0 ? dmar_table_initialized : 0;
-}
-
-static void warn_invalid_dmar(u64 addr, const char *message)
-{
- pr_warn_once(FW_BUG
- "Your BIOS is broken; DMAR reported at address %llx%s!\n"
- "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
- addr, message,
- dmi_get_system_info(DMI_BIOS_VENDOR),
- dmi_get_system_info(DMI_BIOS_VERSION),
- dmi_get_system_info(DMI_PRODUCT_VERSION));
- add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
-}
-
-static int __ref
-dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
-{
- struct acpi_dmar_hardware_unit *drhd;
- void __iomem *addr;
- u64 cap, ecap;
-
- drhd = (void *)entry;
- if (!drhd->address) {
- warn_invalid_dmar(0, "");
- return -EINVAL;
- }
-
- if (arg)
- addr = ioremap(drhd->address, VTD_PAGE_SIZE);
- else
- addr = early_ioremap(drhd->address, VTD_PAGE_SIZE);
- if (!addr) {
- pr_warn("Can't validate DRHD address: %llx\n", drhd->address);
- return -EINVAL;
- }
-
- cap = dmar_readq(addr + DMAR_CAP_REG);
- ecap = dmar_readq(addr + DMAR_ECAP_REG);
-
- if (arg)
- iounmap(addr);
- else
- early_iounmap(addr, VTD_PAGE_SIZE);
-
- if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) {
- warn_invalid_dmar(drhd->address, " returns all ones");
- return -EINVAL;
- }
-
- return 0;
-}
-
-int __init detect_intel_iommu(void)
-{
- int ret;
- struct dmar_res_callback validate_drhd_cb = {
- .cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_validate_one_drhd,
- .ignore_unhandled = true,
- };
-
- down_write(&dmar_global_lock);
- ret = dmar_table_detect();
- if (!ret)
- ret = dmar_walk_dmar_table((struct acpi_table_dmar *)dmar_tbl,
- &validate_drhd_cb);
- if (!ret && !no_iommu && !iommu_detected && !dmar_disabled) {
- iommu_detected = 1;
- /* Make sure ACS will be enabled */
- pci_request_acs();
- }
-
-#ifdef CONFIG_X86
- if (!ret) {
- x86_init.iommu.iommu_init = intel_iommu_init;
- x86_platform.iommu_shutdown = intel_iommu_shutdown;
- }
-
-#endif
-
- if (dmar_tbl) {
- acpi_put_table(dmar_tbl);
- dmar_tbl = NULL;
- }
- up_write(&dmar_global_lock);
-
- return ret ? ret : 1;
-}
-
-static void unmap_iommu(struct intel_iommu *iommu)
-{
- iounmap(iommu->reg);
- release_mem_region(iommu->reg_phys, iommu->reg_size);
-}
-
-/**
- * map_iommu: map the iommu's registers
- * @iommu: the iommu to map
- * @phys_addr: the physical address of the base register
- *
- * Memory map the iommu's registers. Start with a single page, and
- * possibly expand if that turns out to be insufficient.
- */
-static int map_iommu(struct intel_iommu *iommu, u64 phys_addr)
-{
- int map_size, err=0;
-
- iommu->reg_phys = phys_addr;
- iommu->reg_size = VTD_PAGE_SIZE;
-
- if (!request_mem_region(iommu->reg_phys, iommu->reg_size, iommu->name)) {
- pr_err("Can't reserve memory\n");
- err = -EBUSY;
- goto out;
- }
-
- iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size);
- if (!iommu->reg) {
- pr_err("Can't map the region\n");
- err = -ENOMEM;
- goto release;
- }
-
- iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
- iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
-
- if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) {
- err = -EINVAL;
- warn_invalid_dmar(phys_addr, " returns all ones");
- goto unmap;
- }
- iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG);
-
- /* the registers might be more than one page */
- map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
- cap_max_fault_reg_offset(iommu->cap));
- map_size = VTD_PAGE_ALIGN(map_size);
- if (map_size > iommu->reg_size) {
- iounmap(iommu->reg);
- release_mem_region(iommu->reg_phys, iommu->reg_size);
- iommu->reg_size = map_size;
- if (!request_mem_region(iommu->reg_phys, iommu->reg_size,
- iommu->name)) {
- pr_err("Can't reserve memory\n");
- err = -EBUSY;
- goto out;
- }
- iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size);
- if (!iommu->reg) {
- pr_err("Can't map the region\n");
- err = -ENOMEM;
- goto release;
- }
- }
- err = 0;
- goto out;
-
-unmap:
- iounmap(iommu->reg);
-release:
- release_mem_region(iommu->reg_phys, iommu->reg_size);
-out:
- return err;
-}
-
-static int dmar_alloc_seq_id(struct intel_iommu *iommu)
-{
- iommu->seq_id = find_first_zero_bit(dmar_seq_ids,
- DMAR_UNITS_SUPPORTED);
- if (iommu->seq_id >= DMAR_UNITS_SUPPORTED) {
- iommu->seq_id = -1;
- } else {
- set_bit(iommu->seq_id, dmar_seq_ids);
- sprintf(iommu->name, "dmar%d", iommu->seq_id);
- }
-
- return iommu->seq_id;
-}
-
-static void dmar_free_seq_id(struct intel_iommu *iommu)
-{
- if (iommu->seq_id >= 0) {
- clear_bit(iommu->seq_id, dmar_seq_ids);
- iommu->seq_id = -1;
- }
-}
-
-static int alloc_iommu(struct dmar_drhd_unit *drhd)
-{
- struct intel_iommu *iommu;
- u32 ver, sts;
- int agaw = 0;
- int msagaw = 0;
- int err;
-
- if (!drhd->reg_base_addr) {
- warn_invalid_dmar(0, "");
- return -EINVAL;
- }
-
- iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
- if (!iommu)
- return -ENOMEM;
-
- if (dmar_alloc_seq_id(iommu) < 0) {
- pr_err("Failed to allocate seq_id\n");
- err = -ENOSPC;
- goto error;
- }
-
- err = map_iommu(iommu, drhd->reg_base_addr);
- if (err) {
- pr_err("Failed to map %s\n", iommu->name);
- goto error_free_seq_id;
- }
-
- err = -EINVAL;
- agaw = iommu_calculate_agaw(iommu);
- if (agaw < 0) {
- pr_err("Cannot get a valid agaw for iommu (seq_id = %d)\n",
- iommu->seq_id);
- goto err_unmap;
- }
- msagaw = iommu_calculate_max_sagaw(iommu);
- if (msagaw < 0) {
- pr_err("Cannot get a valid max agaw for iommu (seq_id = %d)\n",
- iommu->seq_id);
- goto err_unmap;
- }
- iommu->agaw = agaw;
- iommu->msagaw = msagaw;
- iommu->segment = drhd->segment;
-
- iommu->node = NUMA_NO_NODE;
-
- ver = readl(iommu->reg + DMAR_VER_REG);
- pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
- iommu->name,
- (unsigned long long)drhd->reg_base_addr,
- DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
- (unsigned long long)iommu->cap,
- (unsigned long long)iommu->ecap);
-
- /* Reflect status in gcmd */
- sts = readl(iommu->reg + DMAR_GSTS_REG);
- if (sts & DMA_GSTS_IRES)
- iommu->gcmd |= DMA_GCMD_IRE;
- if (sts & DMA_GSTS_TES)
- iommu->gcmd |= DMA_GCMD_TE;
- if (sts & DMA_GSTS_QIES)
- iommu->gcmd |= DMA_GCMD_QIE;
-
- raw_spin_lock_init(&iommu->register_lock);
-
- if (intel_iommu_enabled) {
- err = iommu_device_sysfs_add(&iommu->iommu, NULL,
- intel_iommu_groups,
- "%s", iommu->name);
- if (err)
- goto err_unmap;
-
- iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
-
- err = iommu_device_register(&iommu->iommu);
- if (err)
- goto err_unmap;
- }
-
- drhd->iommu = iommu;
-
- return 0;
-
-err_unmap:
- unmap_iommu(iommu);
-error_free_seq_id:
- dmar_free_seq_id(iommu);
-error:
- kfree(iommu);
- return err;
-}
-
-static void free_iommu(struct intel_iommu *iommu)
-{
- if (intel_iommu_enabled) {
- iommu_device_unregister(&iommu->iommu);
- iommu_device_sysfs_remove(&iommu->iommu);
- }
-
- if (iommu->irq) {
- if (iommu->pr_irq) {
- free_irq(iommu->pr_irq, iommu);
- dmar_free_hwirq(iommu->pr_irq);
- iommu->pr_irq = 0;
- }
- free_irq(iommu->irq, iommu);
- dmar_free_hwirq(iommu->irq);
- iommu->irq = 0;
- }
-
- if (iommu->qi) {
- free_page((unsigned long)iommu->qi->desc);
- kfree(iommu->qi->desc_status);
- kfree(iommu->qi);
- }
-
- if (iommu->reg)
- unmap_iommu(iommu);
-
- dmar_free_seq_id(iommu);
- kfree(iommu);
-}
-
-/*
- * Reclaim all the submitted descriptors which have completed their work.
- */
-static inline void reclaim_free_desc(struct q_inval *qi)
-{
- while (qi->desc_status[qi->free_tail] == QI_DONE ||
- qi->desc_status[qi->free_tail] == QI_ABORT) {
- qi->desc_status[qi->free_tail] = QI_FREE;
- qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
- qi->free_cnt++;
- }
-}
-
-static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
-{
- u32 fault;
- int head, tail;
- struct q_inval *qi = iommu->qi;
- int shift = qi_shift(iommu);
-
- if (qi->desc_status[wait_index] == QI_ABORT)
- return -EAGAIN;
-
- fault = readl(iommu->reg + DMAR_FSTS_REG);
-
- /*
- * If IQE happens, the head points to the descriptor associated
- * with the error. No new descriptors are fetched until the IQE
- * is cleared.
- */
- if (fault & DMA_FSTS_IQE) {
- head = readl(iommu->reg + DMAR_IQH_REG);
- if ((head >> shift) == index) {
- struct qi_desc *desc = qi->desc + head;
-
- /*
- * desc->qw2 and desc->qw3 are either reserved or
-			 * used by software as private data. We don't print
-			 * these two qwords for security considerations.
- */
- pr_err("VT-d detected invalid descriptor: qw0 = %llx, qw1 = %llx\n",
- (unsigned long long)desc->qw0,
- (unsigned long long)desc->qw1);
- memcpy(desc, qi->desc + (wait_index << shift),
- 1 << shift);
- writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG);
- return -EINVAL;
- }
- }
-
- /*
- * If ITE happens, all pending wait_desc commands are aborted.
- * No new descriptors are fetched until the ITE is cleared.
- */
- if (fault & DMA_FSTS_ITE) {
- head = readl(iommu->reg + DMAR_IQH_REG);
- head = ((head >> shift) - 1 + QI_LENGTH) % QI_LENGTH;
- head |= 1;
- tail = readl(iommu->reg + DMAR_IQT_REG);
- tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH;
-
- writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
-
- do {
- if (qi->desc_status[head] == QI_IN_USE)
- qi->desc_status[head] = QI_ABORT;
- head = (head - 2 + QI_LENGTH) % QI_LENGTH;
- } while (head != tail);
-
- if (qi->desc_status[wait_index] == QI_ABORT)
- return -EAGAIN;
- }
-
- if (fault & DMA_FSTS_ICE)
- writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
-
- return 0;
-}
-
-/*
- * Submit invalidation descriptors of all types to the queued invalidation
- * interface (QI). Multiple descriptors can be submitted at a time; a wait
- * descriptor is appended to each submission to ensure that hardware has
- * completed the invalidation before returning. Callers may include their
- * own wait descriptors in the submission, but those are not polled for
- * completion.
- */
-int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
- unsigned int count, unsigned long options)
-{
- struct q_inval *qi = iommu->qi;
- struct qi_desc wait_desc;
- int wait_index, index;
- unsigned long flags;
- int offset, shift;
- int rc, i;
-
- if (!qi)
- return 0;
-
-restart:
- rc = 0;
-
- raw_spin_lock_irqsave(&qi->q_lock, flags);
- /*
-	 * Check if we have enough empty slots in the queue to submit;
-	 * the calculation is based on:
- * # of desc + 1 wait desc + 1 space between head and tail
- */
- while (qi->free_cnt < count + 2) {
- raw_spin_unlock_irqrestore(&qi->q_lock, flags);
- cpu_relax();
- raw_spin_lock_irqsave(&qi->q_lock, flags);
- }
-
- index = qi->free_head;
- wait_index = (index + count) % QI_LENGTH;
- shift = qi_shift(iommu);
-
- for (i = 0; i < count; i++) {
- offset = ((index + i) % QI_LENGTH) << shift;
- memcpy(qi->desc + offset, &desc[i], 1 << shift);
- qi->desc_status[(index + i) % QI_LENGTH] = QI_IN_USE;
- }
- qi->desc_status[wait_index] = QI_IN_USE;
-
- wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
- QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
- if (options & QI_OPT_WAIT_DRAIN)
- wait_desc.qw0 |= QI_IWD_PRQ_DRAIN;
- wait_desc.qw1 = virt_to_phys(&qi->desc_status[wait_index]);
- wait_desc.qw2 = 0;
- wait_desc.qw3 = 0;
-
- offset = wait_index << shift;
- memcpy(qi->desc + offset, &wait_desc, 1 << shift);
-
- qi->free_head = (qi->free_head + count + 1) % QI_LENGTH;
- qi->free_cnt -= count + 1;
-
- /*
- * update the HW tail register indicating the presence of
- * new descriptors.
- */
- writel(qi->free_head << shift, iommu->reg + DMAR_IQT_REG);
-
- while (qi->desc_status[wait_index] != QI_DONE) {
- /*
-		 * We leave interrupts disabled to prevent the interrupt
-		 * context from queueing another command while one is already
-		 * submitted and waiting for completion on this CPU. This
-		 * avoids a deadlock where the interrupt context could wait
-		 * indefinitely for free slots in the queue.
- */
- rc = qi_check_fault(iommu, index, wait_index);
- if (rc)
- break;
-
- raw_spin_unlock(&qi->q_lock);
- cpu_relax();
- raw_spin_lock(&qi->q_lock);
- }
-
- for (i = 0; i < count; i++)
- qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE;
-
- reclaim_free_desc(qi);
- raw_spin_unlock_irqrestore(&qi->q_lock, flags);
-
- if (rc == -EAGAIN)
- goto restart;
-
- return rc;
-}
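
A minimal caller-side sketch of the batching interface above; the wrapper name
qi_flush_batch_drained() is hypothetical, while QI_OPT_WAIT_DRAIN is the option
consumed by the wait-descriptor setup a few lines up.

static int qi_flush_batch_drained(struct intel_iommu *iommu,
				  struct qi_desc *descs, unsigned int n)
{
	/*
	 * The wait descriptor is appended internally, so callers pass only
	 * the invalidation descriptors; QI_OPT_WAIT_DRAIN additionally asks
	 * for pending page requests to be drained before the wait completes.
	 */
	return qi_submit_sync(iommu, descs, n, QI_OPT_WAIT_DRAIN);
}
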
-
-/*
- * Flush the global interrupt entry cache.
- */
-void qi_global_iec(struct intel_iommu *iommu)
-{
- struct qi_desc desc;
-
- desc.qw0 = QI_IEC_TYPE;
- desc.qw1 = 0;
- desc.qw2 = 0;
- desc.qw3 = 0;
-
- /* should never fail */
- qi_submit_sync(iommu, &desc, 1, 0);
-}
-
-void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
- u64 type)
-{
- struct qi_desc desc;
-
- desc.qw0 = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did)
- | QI_CC_GRAN(type) | QI_CC_TYPE;
- desc.qw1 = 0;
- desc.qw2 = 0;
- desc.qw3 = 0;
-
- qi_submit_sync(iommu, &desc, 1, 0);
-}
-
-void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
- unsigned int size_order, u64 type)
-{
- u8 dw = 0, dr = 0;
-
- struct qi_desc desc;
- int ih = 0;
-
- if (cap_write_drain(iommu->cap))
- dw = 1;
-
- if (cap_read_drain(iommu->cap))
- dr = 1;
-
- desc.qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw)
- | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
- desc.qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
- | QI_IOTLB_AM(size_order);
- desc.qw2 = 0;
- desc.qw3 = 0;
-
- qi_submit_sync(iommu, &desc, 1, 0);
-}
-
-void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
- u16 qdep, u64 addr, unsigned mask)
-{
- struct qi_desc desc;
-
- if (mask) {
- addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
- desc.qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
- } else
- desc.qw1 = QI_DEV_IOTLB_ADDR(addr);
-
- if (qdep >= QI_DEV_IOTLB_MAX_INVS)
- qdep = 0;
-
- desc.qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
- QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid);
- desc.qw2 = 0;
- desc.qw3 = 0;
-
- qi_submit_sync(iommu, &desc, 1, 0);
-}
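
Worked numbers for the size encoding above, with a made-up address:

	/*
	 *   mask = 2, addr = 0x12340000
	 *   addr |= (1ULL << (12 + 2 - 1)) - 1   ->  addr = 0x12341fff
	 *   QI_DEV_IOTLB_SIZE set                ->  2^2 = 4 pages (16KB) invalidated
	 */
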
-
-/* PASID-based IOTLB invalidation */
-void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
- unsigned long npages, bool ih)
-{
- struct qi_desc desc = {.qw2 = 0, .qw3 = 0};
-
- /*
-	 * npages == -1 means a PASID-selective invalidation; otherwise a
-	 * positive value requests a Page-selective-within-PASID invalidation.
- * 0 is not a valid input.
- */
- if (WARN_ON(!npages)) {
- pr_err("Invalid input npages = %ld\n", npages);
- return;
- }
-
- if (npages == -1) {
- desc.qw0 = QI_EIOTLB_PASID(pasid) |
- QI_EIOTLB_DID(did) |
- QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
- QI_EIOTLB_TYPE;
- desc.qw1 = 0;
- } else {
- int mask = ilog2(__roundup_pow_of_two(npages));
- unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask));
-
-		if (WARN_ON_ONCE(!IS_ALIGNED(addr, align)))
- addr &= ~(align - 1);
-
- desc.qw0 = QI_EIOTLB_PASID(pasid) |
- QI_EIOTLB_DID(did) |
- QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
- QI_EIOTLB_TYPE;
- desc.qw1 = QI_EIOTLB_ADDR(addr) |
- QI_EIOTLB_IH(ih) |
- QI_EIOTLB_AM(mask);
- }
-
- qi_submit_sync(iommu, &desc, 1, 0);
-}
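
A worked example of the page-selective path above, again with made-up values:

	/*
	 *   npages = 9, addr = 0x12345000
	 *   mask   = ilog2(__roundup_pow_of_two(9)) = ilog2(16) = 4
	 *   align  = 1ULL << (12 + 4) = 0x10000 (64KB)
	 *   addr  &= ~(align - 1) after the alignment warning -> 0x12340000
	 *   QI_EIOTLB_AM(4)                       ->  16 pages invalidated
	 */
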
-
-/* PASID-based device IOTLB Invalidate */
-void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
- u32 pasid, u16 qdep, u64 addr,
- unsigned int size_order, u64 granu)
-{
- unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1);
- struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0};
-
- desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
- QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
- QI_DEV_IOTLB_PFSID(pfsid);
- desc.qw1 = QI_DEV_EIOTLB_GLOB(granu);
-
- /*
-	 * If the S bit is 0, we only flush a single page. If the S bit is
-	 * set, the least significant zero bit of the address indicates the
-	 * invalidation range (VT-d spec 6.5.2.6), e.g. a zero in address
-	 * bit 12 indicates 8KB and a zero in bit 13 indicates 16KB.
-	 * A size order of 0 means a single 4KB page.
-	 * Max Invs Pending (MIP) is set to 0 for now, until we have DIT in
-	 * ECAP.
- */
- desc.qw1 |= addr & ~mask;
- if (size_order)
- desc.qw1 |= QI_DEV_EIOTLB_SIZE;
-
- qi_submit_sync(iommu, &desc, 1, 0);
-}
-
-void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did,
- u64 granu, int pasid)
-{
- struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0};
-
- desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) |
- QI_PC_GRAN(granu) | QI_PC_TYPE;
- qi_submit_sync(iommu, &desc, 1, 0);
-}
-
-/*
- * Disable Queued Invalidation interface.
- */
-void dmar_disable_qi(struct intel_iommu *iommu)
-{
- unsigned long flags;
- u32 sts;
- cycles_t start_time = get_cycles();
-
- if (!ecap_qis(iommu->ecap))
- return;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flags);
-
- sts = readl(iommu->reg + DMAR_GSTS_REG);
- if (!(sts & DMA_GSTS_QIES))
- goto end;
-
- /*
-	 * Give the hardware a chance to complete the pending invalidation
-	 * requests.
- */
- while ((readl(iommu->reg + DMAR_IQT_REG) !=
- readl(iommu->reg + DMAR_IQH_REG)) &&
- (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time)))
- cpu_relax();
-
- iommu->gcmd &= ~DMA_GCMD_QIE;
- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
- !(sts & DMA_GSTS_QIES), sts);
-end:
- raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-/*
- * Enable queued invalidation.
- */
-static void __dmar_enable_qi(struct intel_iommu *iommu)
-{
- u32 sts;
- unsigned long flags;
- struct q_inval *qi = iommu->qi;
- u64 val = virt_to_phys(qi->desc);
-
- qi->free_head = qi->free_tail = 0;
- qi->free_cnt = QI_LENGTH;
-
- /*
- * Set DW=1 and QS=1 in IQA_REG when Scalable Mode capability
- * is present.
- */
- if (ecap_smts(iommu->ecap))
- val |= (1 << 11) | 1;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flags);
-
- /* write zero to the tail reg */
- writel(0, iommu->reg + DMAR_IQT_REG);
-
- dmar_writeq(iommu->reg + DMAR_IQA_REG, val);
-
- iommu->gcmd |= DMA_GCMD_QIE;
- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
- /* Make sure hardware complete it */
- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts);
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-/*
- * Enable the Queued Invalidation interface. This is required to support
- * interrupt remapping. It is also used by DMA remapping, which replaces
- * register-based IOTLB invalidation.
- */
-int dmar_enable_qi(struct intel_iommu *iommu)
-{
- struct q_inval *qi;
- struct page *desc_page;
-
- if (!ecap_qis(iommu->ecap))
- return -ENOENT;
-
- /*
-	 * queued invalidation is already set up and enabled.
- */
- if (iommu->qi)
- return 0;
-
- iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC);
- if (!iommu->qi)
- return -ENOMEM;
-
- qi = iommu->qi;
-
- /*
- * Need two pages to accommodate 256 descriptors of 256 bits each
- * if the remapping hardware supports scalable mode translation.
- */
- desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO,
- !!ecap_smts(iommu->ecap));
- if (!desc_page) {
- kfree(qi);
- iommu->qi = NULL;
- return -ENOMEM;
- }
-
- qi->desc = page_address(desc_page);
-
- qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC);
- if (!qi->desc_status) {
- free_page((unsigned long) qi->desc);
- kfree(qi);
- iommu->qi = NULL;
- return -ENOMEM;
- }
-
- raw_spin_lock_init(&qi->q_lock);
-
- __dmar_enable_qi(iommu);
-
- return 0;
-}
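
Queue sizing for reference, assuming QI_LENGTH is 256 and a per-slot size of
1 << qi_shift() bytes as defined in include/linux/intel-iommu.h:

	/*
	 *   legacy mode:    256 slots * 16 bytes = 4KB  ->  order 0
	 *   scalable mode:  256 slots * 32 bytes = 8KB  ->  order 1
	 *   hence the !!ecap_smts(iommu->ecap) order passed to
	 *   alloc_pages_node() above.
	 */
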
-
-/* iommu interrupt handling. Most stuff are MSI-like. */
-
-enum faulttype {
- DMA_REMAP,
- INTR_REMAP,
- UNKNOWN,
-};
-
-static const char *dma_remap_fault_reasons[] =
-{
- "Software",
- "Present bit in root entry is clear",
- "Present bit in context entry is clear",
- "Invalid context entry",
- "Access beyond MGAW",
- "PTE Write access is not set",
- "PTE Read access is not set",
- "Next page table ptr is invalid",
- "Root table address invalid",
- "Context table ptr is invalid",
- "non-zero reserved fields in RTP",
- "non-zero reserved fields in CTP",
- "non-zero reserved fields in PTE",
- "PCE for translation request specifies blocking",
-};
-
-static const char * const dma_remap_sm_fault_reasons[] = {
- "SM: Invalid Root Table Address",
- "SM: TTM 0 for request with PASID",
- "SM: TTM 0 for page group request",
- "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x33-0x37 */
- "SM: Error attempting to access Root Entry",
- "SM: Present bit in Root Entry is clear",
- "SM: Non-zero reserved field set in Root Entry",
- "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x3B-0x3F */
- "SM: Error attempting to access Context Entry",
- "SM: Present bit in Context Entry is clear",
- "SM: Non-zero reserved field set in the Context Entry",
- "SM: Invalid Context Entry",
- "SM: DTE field in Context Entry is clear",
- "SM: PASID Enable field in Context Entry is clear",
- "SM: PASID is larger than the max in Context Entry",
- "SM: PRE field in Context-Entry is clear",
- "SM: RID_PASID field error in Context-Entry",
- "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x49-0x4F */
- "SM: Error attempting to access the PASID Directory Entry",
- "SM: Present bit in Directory Entry is clear",
- "SM: Non-zero reserved field set in PASID Directory Entry",
- "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x53-0x57 */
- "SM: Error attempting to access PASID Table Entry",
- "SM: Present bit in PASID Table Entry is clear",
- "SM: Non-zero reserved field set in PASID Table Entry",
- "SM: Invalid Scalable-Mode PASID Table Entry",
- "SM: ERE field is clear in PASID Table Entry",
- "SM: SRE field is clear in PASID Table Entry",
- "Unknown", "Unknown",/* 0x5E-0x5F */
- "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x60-0x67 */
- "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x68-0x6F */
- "SM: Error attempting to access first-level paging entry",
- "SM: Present bit in first-level paging entry is clear",
- "SM: Non-zero reserved field set in first-level paging entry",
- "SM: Error attempting to access FL-PML4 entry",
- "SM: First-level entry address beyond MGAW in Nested translation",
- "SM: Read permission error in FL-PML4 entry in Nested translation",
- "SM: Read permission error in first-level paging entry in Nested translation",
- "SM: Write permission error in first-level paging entry in Nested translation",
- "SM: Error attempting to access second-level paging entry",
- "SM: Read/Write permission error in second-level paging entry",
- "SM: Non-zero reserved field set in second-level paging entry",
- "SM: Invalid second-level page table pointer",
- "SM: A/D bit update needed in second-level entry when set up in no snoop",
- "Unknown", "Unknown", "Unknown", /* 0x7D-0x7F */
- "SM: Address in first-level translation is not canonical",
- "SM: U/S set 0 for first-level translation with user privilege",
- "SM: No execute permission for request with PASID and ER=1",
- "SM: Address beyond the DMA hardware max",
- "SM: Second-level entry address beyond the max",
- "SM: No write permission for Write/AtomicOp request",
- "SM: No read permission for Read/AtomicOp request",
- "SM: Invalid address-interrupt address",
- "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x88-0x8F */
- "SM: A/D bit update needed in first-level entry when set up in no snoop",
-};
-
-static const char *irq_remap_fault_reasons[] =
-{
- "Detected reserved fields in the decoded interrupt-remapped request",
- "Interrupt index exceeded the interrupt-remapping table size",
- "Present field in the IRTE entry is clear",
- "Error accessing interrupt-remapping table pointed by IRTA_REG",
- "Detected reserved fields in the IRTE entry",
- "Blocked a compatibility format interrupt request",
- "Blocked an interrupt request due to source-id verification failure",
-};
-
-static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
-{
- if (fault_reason >= 0x20 && (fault_reason - 0x20 <
- ARRAY_SIZE(irq_remap_fault_reasons))) {
- *fault_type = INTR_REMAP;
- return irq_remap_fault_reasons[fault_reason - 0x20];
- } else if (fault_reason >= 0x30 && (fault_reason - 0x30 <
- ARRAY_SIZE(dma_remap_sm_fault_reasons))) {
- *fault_type = DMA_REMAP;
- return dma_remap_sm_fault_reasons[fault_reason - 0x30];
- } else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
- *fault_type = DMA_REMAP;
- return dma_remap_fault_reasons[fault_reason];
- } else {
- *fault_type = UNKNOWN;
- return "Unknown";
- }
-}
-
-
-static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq)
-{
- if (iommu->irq == irq)
- return DMAR_FECTL_REG;
- else if (iommu->pr_irq == irq)
- return DMAR_PECTL_REG;
- else
- BUG();
-}
-
-void dmar_msi_unmask(struct irq_data *data)
-{
- struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
- int reg = dmar_msi_reg(iommu, data->irq);
- unsigned long flag;
-
- /* unmask it */
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- writel(0, iommu->reg + reg);
-	/* Read a reg to force flush the posted write */
- readl(iommu->reg + reg);
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_mask(struct irq_data *data)
-{
- struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
- int reg = dmar_msi_reg(iommu, data->irq);
- unsigned long flag;
-
- /* mask it */
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- writel(DMA_FECTL_IM, iommu->reg + reg);
-	/* Read a reg to force flush the posted write */
- readl(iommu->reg + reg);
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_write(int irq, struct msi_msg *msg)
-{
- struct intel_iommu *iommu = irq_get_handler_data(irq);
- int reg = dmar_msi_reg(iommu, irq);
- unsigned long flag;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- writel(msg->data, iommu->reg + reg + 4);
- writel(msg->address_lo, iommu->reg + reg + 8);
- writel(msg->address_hi, iommu->reg + reg + 12);
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_read(int irq, struct msi_msg *msg)
-{
- struct intel_iommu *iommu = irq_get_handler_data(irq);
- int reg = dmar_msi_reg(iommu, irq);
- unsigned long flag;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- msg->data = readl(iommu->reg + reg + 4);
- msg->address_lo = readl(iommu->reg + reg + 8);
- msg->address_hi = readl(iommu->reg + reg + 12);
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
- u8 fault_reason, int pasid, u16 source_id,
- unsigned long long addr)
-{
- const char *reason;
- int fault_type;
-
- reason = dmar_get_fault_reason(fault_reason, &fault_type);
-
- if (fault_type == INTR_REMAP)
- pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index %llx [fault reason %02d] %s\n",
- source_id >> 8, PCI_SLOT(source_id & 0xFF),
- PCI_FUNC(source_id & 0xFF), addr >> 48,
- fault_reason, reason);
- else
- pr_err("[%s] Request device [%02x:%02x.%d] PASID %x fault addr %llx [fault reason %02d] %s\n",
- type ? "DMA Read" : "DMA Write",
- source_id >> 8, PCI_SLOT(source_id & 0xFF),
- PCI_FUNC(source_id & 0xFF), pasid, addr,
- fault_reason, reason);
- return 0;
-}
-
-#define PRIMARY_FAULT_REG_LEN (16)
-irqreturn_t dmar_fault(int irq, void *dev_id)
-{
- struct intel_iommu *iommu = dev_id;
- int reg, fault_index;
- u32 fault_status;
- unsigned long flag;
- static DEFINE_RATELIMIT_STATE(rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
-
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- fault_status = readl(iommu->reg + DMAR_FSTS_REG);
- if (fault_status && __ratelimit(&rs))
- pr_err("DRHD: handling fault status reg %x\n", fault_status);
-
- /* TBD: ignore advanced fault log currently */
- if (!(fault_status & DMA_FSTS_PPF))
- goto unlock_exit;
-
- fault_index = dma_fsts_fault_record_index(fault_status);
- reg = cap_fault_reg_offset(iommu->cap);
- while (1) {
- /* Disable printing, simply clear the fault when ratelimited */
- bool ratelimited = !__ratelimit(&rs);
- u8 fault_reason;
- u16 source_id;
- u64 guest_addr;
- int type, pasid;
- u32 data;
- bool pasid_present;
-
- /* highest 32 bits */
- data = readl(iommu->reg + reg +
- fault_index * PRIMARY_FAULT_REG_LEN + 12);
- if (!(data & DMA_FRCD_F))
- break;
-
- if (!ratelimited) {
- fault_reason = dma_frcd_fault_reason(data);
- type = dma_frcd_type(data);
-
- pasid = dma_frcd_pasid_value(data);
- data = readl(iommu->reg + reg +
- fault_index * PRIMARY_FAULT_REG_LEN + 8);
- source_id = dma_frcd_source_id(data);
-
- pasid_present = dma_frcd_pasid_present(data);
- guest_addr = dmar_readq(iommu->reg + reg +
- fault_index * PRIMARY_FAULT_REG_LEN);
- guest_addr = dma_frcd_page_addr(guest_addr);
- }
-
- /* clear the fault */
- writel(DMA_FRCD_F, iommu->reg + reg +
- fault_index * PRIMARY_FAULT_REG_LEN + 12);
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-
- if (!ratelimited)
- /* Using pasid -1 if pasid is not present */
- dmar_fault_do_one(iommu, type, fault_reason,
- pasid_present ? pasid : -1,
- source_id, guest_addr);
-
- fault_index++;
- if (fault_index >= cap_num_fault_regs(iommu->cap))
- fault_index = 0;
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- }
-
- writel(DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_PRO,
- iommu->reg + DMAR_FSTS_REG);
-
-unlock_exit:
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
- return IRQ_HANDLED;
-}
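
Layout of one fault recording register as this handler consumes it, summarising
the offsets used above rather than restating the spec:

	/*
	 *   +0:  64-bit word - faulting page address (dma_frcd_page_addr())
	 *   +8:  32-bit word - source-id and the PASID-present flag
	 *   +12: 32-bit word - F bit (DMA_FRCD_F), fault reason, type and
	 *        PASID value; writing DMA_FRCD_F back clears the record
	 *   each record is PRIMARY_FAULT_REG_LEN (16) bytes wide
	 */
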
-
-int dmar_set_interrupt(struct intel_iommu *iommu)
-{
- int irq, ret;
-
- /*
- * Check if the fault interrupt is already initialized.
- */
- if (iommu->irq)
- return 0;
-
- irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
- if (irq > 0) {
- iommu->irq = irq;
- } else {
- pr_err("No free IRQ vectors\n");
- return -EINVAL;
- }
-
- ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu);
- if (ret)
- pr_err("Can't request irq\n");
- return ret;
-}
-
-int __init enable_drhd_fault_handling(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
-
- /*
- * Enable fault control interrupt.
- */
- for_each_iommu(iommu, drhd) {
- u32 fault_status;
- int ret = dmar_set_interrupt(iommu);
-
- if (ret) {
- pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n",
- (unsigned long long)drhd->reg_base_addr, ret);
- return -1;
- }
-
- /*
- * Clear any previous faults.
- */
- dmar_fault(iommu->irq, iommu);
- fault_status = readl(iommu->reg + DMAR_FSTS_REG);
- writel(fault_status, iommu->reg + DMAR_FSTS_REG);
- }
-
- return 0;
-}
-
-/*
- * Re-enable Queued Invalidation interface.
- */
-int dmar_reenable_qi(struct intel_iommu *iommu)
-{
- if (!ecap_qis(iommu->ecap))
- return -ENOENT;
-
- if (!iommu->qi)
- return -ENOENT;
-
- /*
- * First disable queued invalidation.
- */
- dmar_disable_qi(iommu);
- /*
- * Then enable queued invalidation again. Since there is no pending
- * invalidation requests now, it's safe to re-enable queued
- * invalidation.
- */
- __dmar_enable_qi(iommu);
-
- return 0;
-}
-
-/*
- * Check interrupt remapping support in DMAR table description.
- */
-int __init dmar_ir_support(void)
-{
- struct acpi_table_dmar *dmar;
- dmar = (struct acpi_table_dmar *)dmar_tbl;
- if (!dmar)
- return 0;
- return dmar->flags & 0x1;
-}
-
-/* Check whether DMAR units are in use */
-static inline bool dmar_in_use(void)
-{
- return irq_remapping_enabled || intel_iommu_enabled;
-}
-
-static int __init dmar_free_unused_resources(void)
-{
- struct dmar_drhd_unit *dmaru, *dmaru_n;
-
- if (dmar_in_use())
- return 0;
-
- if (dmar_dev_scope_status != 1 && !list_empty(&dmar_drhd_units))
- bus_unregister_notifier(&pci_bus_type, &dmar_pci_bus_nb);
-
- down_write(&dmar_global_lock);
- list_for_each_entry_safe(dmaru, dmaru_n, &dmar_drhd_units, list) {
- list_del(&dmaru->list);
- dmar_free_drhd(dmaru);
- }
- up_write(&dmar_global_lock);
-
- return 0;
-}
-
-late_initcall(dmar_free_unused_resources);
-IOMMU_INIT_POST(detect_intel_iommu);
-
-/*
- * DMAR Hotplug Support
- * For more details, please refer to Intel(R) Virtualization Technology
- * for Directed I/O Architecture Specification, Rev 2.2, Section 8.8
- * "Remapping Hardware Unit Hot Plug".
- */
-static guid_t dmar_hp_guid =
- GUID_INIT(0xD8C1A3A6, 0xBE9B, 0x4C9B,
- 0x91, 0xBF, 0xC3, 0xCB, 0x81, 0xFC, 0x5D, 0xAF);
-
-/*
- * Currently there's only one revision and BIOS will not check the revision id,
- * so use 0 for safety.
- */
-#define DMAR_DSM_REV_ID 0
-#define DMAR_DSM_FUNC_DRHD 1
-#define DMAR_DSM_FUNC_ATSR 2
-#define DMAR_DSM_FUNC_RHSA 3
-
-static inline bool dmar_detect_dsm(acpi_handle handle, int func)
-{
- return acpi_check_dsm(handle, &dmar_hp_guid, DMAR_DSM_REV_ID, 1 << func);
-}
-
-static int dmar_walk_dsm_resource(acpi_handle handle, int func,
- dmar_res_handler_t handler, void *arg)
-{
- int ret = -ENODEV;
- union acpi_object *obj;
- struct acpi_dmar_header *start;
- struct dmar_res_callback callback;
- static int res_type[] = {
- [DMAR_DSM_FUNC_DRHD] = ACPI_DMAR_TYPE_HARDWARE_UNIT,
- [DMAR_DSM_FUNC_ATSR] = ACPI_DMAR_TYPE_ROOT_ATS,
- [DMAR_DSM_FUNC_RHSA] = ACPI_DMAR_TYPE_HARDWARE_AFFINITY,
- };
-
- if (!dmar_detect_dsm(handle, func))
- return 0;
-
- obj = acpi_evaluate_dsm_typed(handle, &dmar_hp_guid, DMAR_DSM_REV_ID,
- func, NULL, ACPI_TYPE_BUFFER);
- if (!obj)
- return -ENODEV;
-
- memset(&callback, 0, sizeof(callback));
- callback.cb[res_type[func]] = handler;
- callback.arg[res_type[func]] = arg;
- start = (struct acpi_dmar_header *)obj->buffer.pointer;
- ret = dmar_walk_remapping_entries(start, obj->buffer.length, &callback);
-
- ACPI_FREE(obj);
-
- return ret;
-}
-
-static int dmar_hp_add_drhd(struct acpi_dmar_header *header, void *arg)
-{
- int ret;
- struct dmar_drhd_unit *dmaru;
-
- dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
- if (!dmaru)
- return -ENODEV;
-
- ret = dmar_ir_hotplug(dmaru, true);
- if (ret == 0)
- ret = dmar_iommu_hotplug(dmaru, true);
-
- return ret;
-}
-
-static int dmar_hp_remove_drhd(struct acpi_dmar_header *header, void *arg)
-{
- int i, ret;
- struct device *dev;
- struct dmar_drhd_unit *dmaru;
-
- dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
- if (!dmaru)
- return 0;
-
- /*
- * All PCI devices managed by this unit should have been destroyed.
- */
- if (!dmaru->include_all && dmaru->devices && dmaru->devices_cnt) {
- for_each_active_dev_scope(dmaru->devices,
- dmaru->devices_cnt, i, dev)
- return -EBUSY;
- }
-
- ret = dmar_ir_hotplug(dmaru, false);
- if (ret == 0)
- ret = dmar_iommu_hotplug(dmaru, false);
-
- return ret;
-}
-
-static int dmar_hp_release_drhd(struct acpi_dmar_header *header, void *arg)
-{
- struct dmar_drhd_unit *dmaru;
-
- dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
- if (dmaru) {
- list_del_rcu(&dmaru->list);
- synchronize_rcu();
- dmar_free_drhd(dmaru);
- }
-
- return 0;
-}
-
-static int dmar_hotplug_insert(acpi_handle handle)
-{
- int ret;
- int drhd_count = 0;
-
- ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
- &dmar_validate_one_drhd, (void *)1);
- if (ret)
- goto out;
-
- ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
- &dmar_parse_one_drhd, (void *)&drhd_count);
- if (ret == 0 && drhd_count == 0) {
- pr_warn(FW_BUG "No DRHD structures in buffer returned by _DSM method\n");
- goto out;
- } else if (ret) {
- goto release_drhd;
- }
-
- ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_RHSA,
- &dmar_parse_one_rhsa, NULL);
- if (ret)
- goto release_drhd;
-
- ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
- &dmar_parse_one_atsr, NULL);
- if (ret)
- goto release_atsr;
-
- ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
- &dmar_hp_add_drhd, NULL);
- if (!ret)
- return 0;
-
- dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
- &dmar_hp_remove_drhd, NULL);
-release_atsr:
- dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
- &dmar_release_one_atsr, NULL);
-release_drhd:
- dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
- &dmar_hp_release_drhd, NULL);
-out:
- return ret;
-}
-
-static int dmar_hotplug_remove(acpi_handle handle)
-{
- int ret;
-
- ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
- &dmar_check_one_atsr, NULL);
- if (ret)
- return ret;
-
- ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
- &dmar_hp_remove_drhd, NULL);
- if (ret == 0) {
- WARN_ON(dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
- &dmar_release_one_atsr, NULL));
- WARN_ON(dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
- &dmar_hp_release_drhd, NULL));
- } else {
- dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
- &dmar_hp_add_drhd, NULL);
- }
-
- return ret;
-}
-
-static acpi_status dmar_get_dsm_handle(acpi_handle handle, u32 lvl,
- void *context, void **retval)
-{
- acpi_handle *phdl = retval;
-
- if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) {
- *phdl = handle;
- return AE_CTRL_TERMINATE;
- }
-
- return AE_OK;
-}
-
-static int dmar_device_hotplug(acpi_handle handle, bool insert)
-{
- int ret;
- acpi_handle tmp = NULL;
- acpi_status status;
-
- if (!dmar_in_use())
- return 0;
-
- if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) {
- tmp = handle;
- } else {
- status = acpi_walk_namespace(ACPI_TYPE_DEVICE, handle,
- ACPI_UINT32_MAX,
- dmar_get_dsm_handle,
- NULL, NULL, &tmp);
- if (ACPI_FAILURE(status)) {
- pr_warn("Failed to locate _DSM method.\n");
- return -ENXIO;
- }
- }
- if (tmp == NULL)
- return 0;
-
- down_write(&dmar_global_lock);
- if (insert)
- ret = dmar_hotplug_insert(tmp);
- else
- ret = dmar_hotplug_remove(tmp);
- up_write(&dmar_global_lock);
-
- return ret;
-}
-
-int dmar_device_add(acpi_handle handle)
-{
- return dmar_device_hotplug(handle, true);
-}
-
-int dmar_device_remove(acpi_handle handle)
-{
- return dmar_device_hotplug(handle, false);
-}
-
-/*
- * dmar_platform_optin - Is %DMA_CTRL_PLATFORM_OPT_IN_FLAG set in DMAR table
- *
- * Returns true if the platform has %DMA_CTRL_PLATFORM_OPT_IN_FLAG set in
- * the ACPI DMAR table. This means that the platform boot firmware has made
- * sure no device can issue DMA outside of RMRR regions.
- */
-bool dmar_platform_optin(void)
-{
- struct acpi_table_dmar *dmar;
- acpi_status status;
- bool ret;
-
- status = acpi_get_table(ACPI_SIG_DMAR, 0,
- (struct acpi_table_header **)&dmar);
- if (ACPI_FAILURE(status))
- return false;
-
- ret = !!(dmar->flags & DMAR_PLATFORM_OPT_IN);
- acpi_put_table((struct acpi_table_header *)dmar);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(dmar_platform_optin);
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright © 2018 Intel Corporation.
- *
- * Authors: Gayatri Kammela <gayatri.kammela@intel.com>
- * Sohil Mehta <sohil.mehta@intel.com>
- * Jacob Pan <jacob.jun.pan@linux.intel.com>
- * Lu Baolu <baolu.lu@linux.intel.com>
- */
-
-#include <linux/debugfs.h>
-#include <linux/dmar.h>
-#include <linux/intel-iommu.h>
-#include <linux/pci.h>
-
-#include <asm/irq_remapping.h>
-
-#include "intel-pasid.h"
-
-struct tbl_walk {
- u16 bus;
- u16 devfn;
- u32 pasid;
- struct root_entry *rt_entry;
- struct context_entry *ctx_entry;
- struct pasid_entry *pasid_tbl_entry;
-};
-
-struct iommu_regset {
- int offset;
- const char *regs;
-};
-
-#define IOMMU_REGSET_ENTRY(_reg_) \
- { DMAR_##_reg_##_REG, __stringify(_reg_) }
-
-static const struct iommu_regset iommu_regs_32[] = {
- IOMMU_REGSET_ENTRY(VER),
- IOMMU_REGSET_ENTRY(GCMD),
- IOMMU_REGSET_ENTRY(GSTS),
- IOMMU_REGSET_ENTRY(FSTS),
- IOMMU_REGSET_ENTRY(FECTL),
- IOMMU_REGSET_ENTRY(FEDATA),
- IOMMU_REGSET_ENTRY(FEADDR),
- IOMMU_REGSET_ENTRY(FEUADDR),
- IOMMU_REGSET_ENTRY(PMEN),
- IOMMU_REGSET_ENTRY(PLMBASE),
- IOMMU_REGSET_ENTRY(PLMLIMIT),
- IOMMU_REGSET_ENTRY(ICS),
- IOMMU_REGSET_ENTRY(PRS),
- IOMMU_REGSET_ENTRY(PECTL),
- IOMMU_REGSET_ENTRY(PEDATA),
- IOMMU_REGSET_ENTRY(PEADDR),
- IOMMU_REGSET_ENTRY(PEUADDR),
-};
-
-static const struct iommu_regset iommu_regs_64[] = {
- IOMMU_REGSET_ENTRY(CAP),
- IOMMU_REGSET_ENTRY(ECAP),
- IOMMU_REGSET_ENTRY(RTADDR),
- IOMMU_REGSET_ENTRY(CCMD),
- IOMMU_REGSET_ENTRY(AFLOG),
- IOMMU_REGSET_ENTRY(PHMBASE),
- IOMMU_REGSET_ENTRY(PHMLIMIT),
- IOMMU_REGSET_ENTRY(IQH),
- IOMMU_REGSET_ENTRY(IQT),
- IOMMU_REGSET_ENTRY(IQA),
- IOMMU_REGSET_ENTRY(IRTA),
- IOMMU_REGSET_ENTRY(PQH),
- IOMMU_REGSET_ENTRY(PQT),
- IOMMU_REGSET_ENTRY(PQA),
- IOMMU_REGSET_ENTRY(MTRRCAP),
- IOMMU_REGSET_ENTRY(MTRRDEF),
- IOMMU_REGSET_ENTRY(MTRR_FIX64K_00000),
- IOMMU_REGSET_ENTRY(MTRR_FIX16K_80000),
- IOMMU_REGSET_ENTRY(MTRR_FIX16K_A0000),
- IOMMU_REGSET_ENTRY(MTRR_FIX4K_C0000),
- IOMMU_REGSET_ENTRY(MTRR_FIX4K_C8000),
- IOMMU_REGSET_ENTRY(MTRR_FIX4K_D0000),
- IOMMU_REGSET_ENTRY(MTRR_FIX4K_D8000),
- IOMMU_REGSET_ENTRY(MTRR_FIX4K_E0000),
- IOMMU_REGSET_ENTRY(MTRR_FIX4K_E8000),
- IOMMU_REGSET_ENTRY(MTRR_FIX4K_F0000),
- IOMMU_REGSET_ENTRY(MTRR_FIX4K_F8000),
- IOMMU_REGSET_ENTRY(MTRR_PHYSBASE0),
- IOMMU_REGSET_ENTRY(MTRR_PHYSMASK0),
- IOMMU_REGSET_ENTRY(MTRR_PHYSBASE1),
- IOMMU_REGSET_ENTRY(MTRR_PHYSMASK1),
- IOMMU_REGSET_ENTRY(MTRR_PHYSBASE2),
- IOMMU_REGSET_ENTRY(MTRR_PHYSMASK2),
- IOMMU_REGSET_ENTRY(MTRR_PHYSBASE3),
- IOMMU_REGSET_ENTRY(MTRR_PHYSMASK3),
- IOMMU_REGSET_ENTRY(MTRR_PHYSBASE4),
- IOMMU_REGSET_ENTRY(MTRR_PHYSMASK4),
- IOMMU_REGSET_ENTRY(MTRR_PHYSBASE5),
- IOMMU_REGSET_ENTRY(MTRR_PHYSMASK5),
- IOMMU_REGSET_ENTRY(MTRR_PHYSBASE6),
- IOMMU_REGSET_ENTRY(MTRR_PHYSMASK6),
- IOMMU_REGSET_ENTRY(MTRR_PHYSBASE7),
- IOMMU_REGSET_ENTRY(MTRR_PHYSMASK7),
- IOMMU_REGSET_ENTRY(MTRR_PHYSBASE8),
- IOMMU_REGSET_ENTRY(MTRR_PHYSMASK8),
- IOMMU_REGSET_ENTRY(MTRR_PHYSBASE9),
- IOMMU_REGSET_ENTRY(MTRR_PHYSMASK9),
- IOMMU_REGSET_ENTRY(VCCAP),
- IOMMU_REGSET_ENTRY(VCMD),
- IOMMU_REGSET_ENTRY(VCRSP),
-};
-
-static int iommu_regset_show(struct seq_file *m, void *unused)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- unsigned long flag;
- int i, ret = 0;
- u64 value;
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (!drhd->reg_base_addr) {
- seq_puts(m, "IOMMU: Invalid base address\n");
- ret = -EINVAL;
- goto out;
- }
-
- seq_printf(m, "IOMMU: %s Register Base Address: %llx\n",
- iommu->name, drhd->reg_base_addr);
- seq_puts(m, "Name\t\t\tOffset\t\tContents\n");
- /*
-		 * Publish the contents of the 32-bit and 64-bit hardware
-		 * registers by adding each register's offset to the IOMMU's
-		 * register base (virtual address).
- */
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- for (i = 0 ; i < ARRAY_SIZE(iommu_regs_32); i++) {
- value = dmar_readl(iommu->reg + iommu_regs_32[i].offset);
- seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n",
- iommu_regs_32[i].regs, iommu_regs_32[i].offset,
- value);
- }
- for (i = 0 ; i < ARRAY_SIZE(iommu_regs_64); i++) {
- value = dmar_readq(iommu->reg + iommu_regs_64[i].offset);
- seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n",
- iommu_regs_64[i].regs, iommu_regs_64[i].offset,
- value);
- }
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
- seq_putc(m, '\n');
- }
-out:
- rcu_read_unlock();
-
- return ret;
-}
-DEFINE_SHOW_ATTRIBUTE(iommu_regset);
-
-static inline void print_tbl_walk(struct seq_file *m)
-{
- struct tbl_walk *tbl_wlk = m->private;
-
- seq_printf(m, "%02x:%02x.%x\t0x%016llx:0x%016llx\t0x%016llx:0x%016llx\t",
- tbl_wlk->bus, PCI_SLOT(tbl_wlk->devfn),
- PCI_FUNC(tbl_wlk->devfn), tbl_wlk->rt_entry->hi,
- tbl_wlk->rt_entry->lo, tbl_wlk->ctx_entry->hi,
- tbl_wlk->ctx_entry->lo);
-
- /*
-	 * A legacy mode DMAR doesn't support PASID, hence default it to -1,
-	 * indicating that it's invalid. Also, default all PASID-related
-	 * fields to 0.
- */
- if (!tbl_wlk->pasid_tbl_entry)
- seq_printf(m, "%-6d\t0x%016llx:0x%016llx:0x%016llx\n", -1,
- (u64)0, (u64)0, (u64)0);
- else
- seq_printf(m, "%-6d\t0x%016llx:0x%016llx:0x%016llx\n",
- tbl_wlk->pasid, tbl_wlk->pasid_tbl_entry->val[2],
- tbl_wlk->pasid_tbl_entry->val[1],
- tbl_wlk->pasid_tbl_entry->val[0]);
-}
-
-static void pasid_tbl_walk(struct seq_file *m, struct pasid_entry *tbl_entry,
- u16 dir_idx)
-{
- struct tbl_walk *tbl_wlk = m->private;
- u8 tbl_idx;
-
- for (tbl_idx = 0; tbl_idx < PASID_TBL_ENTRIES; tbl_idx++) {
- if (pasid_pte_is_present(tbl_entry)) {
- tbl_wlk->pasid_tbl_entry = tbl_entry;
- tbl_wlk->pasid = (dir_idx << PASID_PDE_SHIFT) + tbl_idx;
- print_tbl_walk(m);
- }
-
- tbl_entry++;
- }
-}
-
-static void pasid_dir_walk(struct seq_file *m, u64 pasid_dir_ptr,
- u16 pasid_dir_size)
-{
- struct pasid_dir_entry *dir_entry = phys_to_virt(pasid_dir_ptr);
- struct pasid_entry *pasid_tbl;
- u16 dir_idx;
-
- for (dir_idx = 0; dir_idx < pasid_dir_size; dir_idx++) {
- pasid_tbl = get_pasid_table_from_pde(dir_entry);
- if (pasid_tbl)
- pasid_tbl_walk(m, pasid_tbl, dir_idx);
-
- dir_entry++;
- }
-}
-
-static void ctx_tbl_walk(struct seq_file *m, struct intel_iommu *iommu, u16 bus)
-{
- struct context_entry *context;
- u16 devfn, pasid_dir_size;
- u64 pasid_dir_ptr;
-
- for (devfn = 0; devfn < 256; devfn++) {
- struct tbl_walk tbl_wlk = {0};
-
-		/*
-		 * A scalable mode root entry points to an upper and a lower
-		 * scalable mode context table. Each scalable mode context
-		 * table has 128 context entries, whereas a legacy mode
-		 * context table has 256. So in scalable mode, the context
-		 * entries for the first 128 devices are in the lower context
-		 * table, while the last 128 devices are in the upper one.
-		 * When devfn > 127, iommu_context_addr() automatically refers
-		 * to the upper scalable mode context table, so the caller
-		 * doesn't have to worry about the difference between scalable
-		 * mode and non-scalable mode.
-		 */
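-		/*
-		 * For example (illustration only): devfn 0x06 is served from
-		 * the lower scalable mode context table, while devfn 0x83
-		 * comes from the upper one; the lookup below is identical in
-		 * both cases.
-		 */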
- context = iommu_context_addr(iommu, bus, devfn, 0);
- if (!context)
- return;
-
- if (!context_present(context))
- continue;
-
- tbl_wlk.bus = bus;
- tbl_wlk.devfn = devfn;
- tbl_wlk.rt_entry = &iommu->root_entry[bus];
- tbl_wlk.ctx_entry = context;
- m->private = &tbl_wlk;
-
- if (dmar_readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) {
- pasid_dir_ptr = context->lo & VTD_PAGE_MASK;
- pasid_dir_size = get_pasid_dir_size(context);
- pasid_dir_walk(m, pasid_dir_ptr, pasid_dir_size);
- continue;
- }
-
- print_tbl_walk(m);
- }
-}
-
-static void root_tbl_walk(struct seq_file *m, struct intel_iommu *iommu)
-{
- unsigned long flags;
- u16 bus;
-
- spin_lock_irqsave(&iommu->lock, flags);
- seq_printf(m, "IOMMU %s: Root Table Address: 0x%llx\n", iommu->name,
- (u64)virt_to_phys(iommu->root_entry));
- seq_puts(m, "B.D.F\tRoot_entry\t\t\t\tContext_entry\t\t\t\tPASID\tPASID_table_entry\n");
-
- /*
- * No need to check if the root entry is present or not because
- * iommu_context_addr() performs the same check before returning
-	 * the context entry.
- */
- for (bus = 0; bus < 256; bus++)
- ctx_tbl_walk(m, iommu, bus);
-
- spin_unlock_irqrestore(&iommu->lock, flags);
-}
-
-static int dmar_translation_struct_show(struct seq_file *m, void *unused)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- u32 sts;
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- sts = dmar_readl(iommu->reg + DMAR_GSTS_REG);
- if (!(sts & DMA_GSTS_TES)) {
- seq_printf(m, "DMA Remapping is not enabled on %s\n",
- iommu->name);
- continue;
- }
- root_tbl_walk(m, iommu);
- seq_putc(m, '\n');
- }
- rcu_read_unlock();
-
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(dmar_translation_struct);
-
-static inline unsigned long level_to_directory_size(int level)
-{
- return BIT_ULL(VTD_PAGE_SHIFT + VTD_STRIDE_SHIFT * (level - 1));
-}
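-
-/*
- * Worked example (illustration only): with VTD_PAGE_SHIFT == 12 and
- * VTD_STRIDE_SHIFT == 9, one entry covers 4KiB at level 1, 2MiB at
- * level 2, 1GiB at level 3, 512GiB at level 4 and 256TiB at level 5.
- */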
-
-static inline void
-dump_page_info(struct seq_file *m, unsigned long iova, u64 *path)
-{
- seq_printf(m, "0x%013lx |\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\n",
- iova >> VTD_PAGE_SHIFT, path[5], path[4],
- path[3], path[2], path[1]);
-}
-
-static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde,
- int level, unsigned long start,
- u64 *path)
-{
- int i;
-
- if (level > 5 || level < 1)
- return;
-
- for (i = 0; i < BIT_ULL(VTD_STRIDE_SHIFT);
- i++, pde++, start += level_to_directory_size(level)) {
- if (!dma_pte_present(pde))
- continue;
-
- path[level] = pde->val;
- if (dma_pte_superpage(pde) || level == 1)
- dump_page_info(m, start, path);
- else
- pgtable_walk_level(m, phys_to_virt(dma_pte_addr(pde)),
- level - 1, start, path);
- path[level] = 0;
- }
-}
-
-static int show_device_domain_translation(struct device *dev, void *data)
-{
- struct dmar_domain *domain = find_domain(dev);
- struct seq_file *m = data;
- u64 path[6] = { 0 };
-
- if (!domain)
- return 0;
-
- seq_printf(m, "Device %s with pasid %d @0x%llx\n",
- dev_name(dev), domain->default_pasid,
- (u64)virt_to_phys(domain->pgd));
- seq_puts(m, "IOVA_PFN\t\tPML5E\t\t\tPML4E\t\t\tPDPE\t\t\tPDE\t\t\tPTE\n");
-
- pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path);
- seq_putc(m, '\n');
-
- return 0;
-}
-
-static int domain_translation_struct_show(struct seq_file *m, void *unused)
-{
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- ret = bus_for_each_dev(&pci_bus_type, NULL, m,
- show_device_domain_translation);
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
- return ret;
-}
-DEFINE_SHOW_ATTRIBUTE(domain_translation_struct);
-
-static void invalidation_queue_entry_show(struct seq_file *m,
- struct intel_iommu *iommu)
-{
- int index, shift = qi_shift(iommu);
- struct qi_desc *desc;
- int offset;
-
- if (ecap_smts(iommu->ecap))
- seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tqw2\t\t\tqw3\t\t\tstatus\n");
- else
- seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tstatus\n");
-
- for (index = 0; index < QI_LENGTH; index++) {
- offset = index << shift;
- desc = iommu->qi->desc + offset;
- if (ecap_smts(iommu->ecap))
- seq_printf(m, "%5d\t%016llx\t%016llx\t%016llx\t%016llx\t%016x\n",
- index, desc->qw0, desc->qw1,
- desc->qw2, desc->qw3,
- iommu->qi->desc_status[index]);
- else
- seq_printf(m, "%5d\t%016llx\t%016llx\t%016x\n",
- index, desc->qw0, desc->qw1,
- iommu->qi->desc_status[index]);
- }
-}
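-
-/*
- * Note (illustration only): "index << shift" above turns a descriptor
- * index into a byte offset; qi_shift() is expected to be 4 for the
- * legacy 128-bit (two-qword) descriptors and 5 for the 256-bit
- * (four-qword) scalable mode descriptors, which is also why the qw2/qw3
- * columns are only printed when ecap_smts() is set.
- */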
-
-static int invalidation_queue_show(struct seq_file *m, void *unused)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- unsigned long flags;
- struct q_inval *qi;
- int shift;
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- qi = iommu->qi;
- shift = qi_shift(iommu);
-
- if (!qi || !ecap_qis(iommu->ecap))
- continue;
-
- seq_printf(m, "Invalidation queue on IOMMU: %s\n", iommu->name);
-
- raw_spin_lock_irqsave(&qi->q_lock, flags);
- seq_printf(m, " Base: 0x%llx\tHead: %lld\tTail: %lld\n",
- (u64)virt_to_phys(qi->desc),
- dmar_readq(iommu->reg + DMAR_IQH_REG) >> shift,
- dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift);
- invalidation_queue_entry_show(m, iommu);
- raw_spin_unlock_irqrestore(&qi->q_lock, flags);
- seq_putc(m, '\n');
- }
- rcu_read_unlock();
-
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(invalidation_queue);
-
-#ifdef CONFIG_IRQ_REMAP
-static void ir_tbl_remap_entry_show(struct seq_file *m,
- struct intel_iommu *iommu)
-{
- struct irte *ri_entry;
- unsigned long flags;
- int idx;
-
- seq_puts(m, " Entry SrcID DstID Vct IRTE_high\t\tIRTE_low\n");
-
- raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
- for (idx = 0; idx < INTR_REMAP_TABLE_ENTRIES; idx++) {
- ri_entry = &iommu->ir_table->base[idx];
- if (!ri_entry->present || ri_entry->p_pst)
- continue;
-
- seq_printf(m, " %-5d %02x:%02x.%01x %08x %02x %016llx\t%016llx\n",
- idx, PCI_BUS_NUM(ri_entry->sid),
- PCI_SLOT(ri_entry->sid), PCI_FUNC(ri_entry->sid),
- ri_entry->dest_id, ri_entry->vector,
- ri_entry->high, ri_entry->low);
- }
- raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-}
-
-static void ir_tbl_posted_entry_show(struct seq_file *m,
- struct intel_iommu *iommu)
-{
- struct irte *pi_entry;
- unsigned long flags;
- int idx;
-
- seq_puts(m, " Entry SrcID PDA_high PDA_low Vct IRTE_high\t\tIRTE_low\n");
-
- raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
- for (idx = 0; idx < INTR_REMAP_TABLE_ENTRIES; idx++) {
- pi_entry = &iommu->ir_table->base[idx];
- if (!pi_entry->present || !pi_entry->p_pst)
- continue;
-
- seq_printf(m, " %-5d %02x:%02x.%01x %08x %08x %02x %016llx\t%016llx\n",
- idx, PCI_BUS_NUM(pi_entry->sid),
- PCI_SLOT(pi_entry->sid), PCI_FUNC(pi_entry->sid),
- pi_entry->pda_h, pi_entry->pda_l << 6,
- pi_entry->vector, pi_entry->high,
- pi_entry->low);
- }
- raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-}
-
-/*
- * For active IOMMUs, go through the interrupt remapping
- * table and print valid entries in a table format for
- * Remapped and Posted Interrupts.
- */
-static int ir_translation_struct_show(struct seq_file *m, void *unused)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- u64 irta;
- u32 sts;
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (!ecap_ir_support(iommu->ecap))
- continue;
-
- seq_printf(m, "Remapped Interrupt supported on IOMMU: %s\n",
- iommu->name);
-
- sts = dmar_readl(iommu->reg + DMAR_GSTS_REG);
- if (iommu->ir_table && (sts & DMA_GSTS_IRES)) {
- irta = virt_to_phys(iommu->ir_table->base);
- seq_printf(m, " IR table address:%llx\n", irta);
- ir_tbl_remap_entry_show(m, iommu);
- } else {
- seq_puts(m, "Interrupt Remapping is not enabled\n");
- }
- seq_putc(m, '\n');
- }
-
- seq_puts(m, "****\n\n");
-
- for_each_active_iommu(iommu, drhd) {
- if (!cap_pi_support(iommu->cap))
- continue;
-
- seq_printf(m, "Posted Interrupt supported on IOMMU: %s\n",
- iommu->name);
-
- if (iommu->ir_table) {
- irta = virt_to_phys(iommu->ir_table->base);
- seq_printf(m, " IR table address:%llx\n", irta);
- ir_tbl_posted_entry_show(m, iommu);
- } else {
- seq_puts(m, "Interrupt Remapping is not enabled\n");
- }
- seq_putc(m, '\n');
- }
- rcu_read_unlock();
-
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(ir_translation_struct);
-#endif
-
-void __init intel_iommu_debugfs_init(void)
-{
- struct dentry *intel_iommu_debug = debugfs_create_dir("intel",
- iommu_debugfs_dir);
-
- debugfs_create_file("iommu_regset", 0444, intel_iommu_debug, NULL,
- &iommu_regset_fops);
- debugfs_create_file("dmar_translation_struct", 0444, intel_iommu_debug,
- NULL, &dmar_translation_struct_fops);
- debugfs_create_file("domain_translation_struct", 0444,
- intel_iommu_debug, NULL,
- &domain_translation_struct_fops);
- debugfs_create_file("invalidation_queue", 0444, intel_iommu_debug,
- NULL, &invalidation_queue_fops);
-#ifdef CONFIG_IRQ_REMAP
- debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug,
- NULL, &ir_translation_struct_fops);
-#endif
-}
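-
-/*
- * Usage note (illustration only): assuming debugfs is mounted at
- * /sys/kernel/debug and the IOMMU core has already created
- * iommu_debugfs_dir below it, the files above appear as e.g.
- * /sys/kernel/debug/iommu/intel/iommu_regset and can simply be read
- * with cat.
- */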
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright © 2006-2014 Intel Corporation.
- *
- * Authors: David Woodhouse <dwmw2@infradead.org>,
- * Ashok Raj <ashok.raj@intel.com>,
- * Shaohua Li <shaohua.li@intel.com>,
- * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
- * Fenghua Yu <fenghua.yu@intel.com>
- * Joerg Roedel <jroedel@suse.de>
- */
-
-#define pr_fmt(fmt) "DMAR: " fmt
-#define dev_fmt(fmt) pr_fmt(fmt)
-
-#include <linux/init.h>
-#include <linux/bitmap.h>
-#include <linux/debugfs.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/dmar.h>
-#include <linux/dma-mapping.h>
-#include <linux/mempool.h>
-#include <linux/memory.h>
-#include <linux/cpu.h>
-#include <linux/timer.h>
-#include <linux/io.h>
-#include <linux/iova.h>
-#include <linux/iommu.h>
-#include <linux/intel-iommu.h>
-#include <linux/syscore_ops.h>
-#include <linux/tboot.h>
-#include <linux/dmi.h>
-#include <linux/pci-ats.h>
-#include <linux/memblock.h>
-#include <linux/dma-contiguous.h>
-#include <linux/dma-direct.h>
-#include <linux/crash_dump.h>
-#include <linux/numa.h>
-#include <linux/swiotlb.h>
-#include <asm/irq_remapping.h>
-#include <asm/cacheflush.h>
-#include <asm/iommu.h>
-#include <trace/events/intel_iommu.h>
-
-#include "irq_remapping.h"
-#include "intel-pasid.h"
-
-#define ROOT_SIZE VTD_PAGE_SIZE
-#define CONTEXT_SIZE VTD_PAGE_SIZE
-
-#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
-#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
-#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
-#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
-
-#define IOAPIC_RANGE_START (0xfee00000)
-#define IOAPIC_RANGE_END (0xfeefffff)
-#define IOVA_START_ADDR (0x1000)
-
-#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
-
-#define MAX_AGAW_WIDTH 64
-#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
-
-#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
-#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
-
-/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
- to match. That way, we can use 'unsigned long' for PFNs with impunity. */
-#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
- __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
-#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
-
-/* IO virtual address start page frame number */
-#define IOVA_START_PFN (1)
-
-#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
-
-/* page table handling */
-#define LEVEL_STRIDE (9)
-#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
-
-/*
- * This bitmap is used to advertise the page sizes our hardware supports
- * to the IOMMU core, which will then use this information to split
- * physically contiguous memory regions it is mapping into page sizes
- * that we support.
- *
- * Traditionally the IOMMU core just handed us the mappings directly,
- * after making sure the size is an order of a 4KiB page and that the
- * mapping has natural alignment.
- *
- * To retain this behavior, we currently advertise that we support
- * all page sizes that are an order of 4KiB.
- *
- * If at some point we'd like to utilize the IOMMU core's new behavior,
- * we could change this to advertise the real page sizes we support.
- */
-#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
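-/*
- * Illustration: ~0xFFFUL sets every bit from bit 12 upwards, i.e. it
- * advertises every power-of-two size of at least 4KiB (4KiB, 8KiB,
- * 16KiB, ...).
- */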
-
-static inline int agaw_to_level(int agaw)
-{
- return agaw + 2;
-}
-
-static inline int agaw_to_width(int agaw)
-{
- return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
-}
-
-static inline int width_to_agaw(int width)
-{
- return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
-}
-
-static inline unsigned int level_to_offset_bits(int level)
-{
- return (level - 1) * LEVEL_STRIDE;
-}
-
-static inline int pfn_level_offset(unsigned long pfn, int level)
-{
- return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
-}
-
-static inline unsigned long level_mask(int level)
-{
- return -1UL << level_to_offset_bits(level);
-}
-
-static inline unsigned long level_size(int level)
-{
- return 1UL << level_to_offset_bits(level);
-}
-
-static inline unsigned long align_to_level(unsigned long pfn, int level)
-{
- return (pfn + level_size(level) - 1) & level_mask(level);
-}
-
-static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
-{
- return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
-}
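-
-/*
- * Worked example (illustration only): agaw 2 gives agaw_to_level() == 4
- * and agaw_to_width() == 48, i.e. a 4-level table covering a 48-bit
- * address space, while agaw 3 gives 5 levels and 57 bits.
- * width_to_agaw(48) and width_to_agaw(57) map back to 2 and 3, and
- * pfn_level_offset() extracts bits [(level - 1) * 9, level * 9) of the
- * pfn as the index into that level's table.
- */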
-
-/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
- are never going to work. */
-static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
-{
- return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
-}
-
-static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
-{
- return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
-}
-static inline unsigned long page_to_dma_pfn(struct page *pg)
-{
- return mm_to_dma_pfn(page_to_pfn(pg));
-}
-static inline unsigned long virt_to_dma_pfn(void *p)
-{
- return page_to_dma_pfn(virt_to_page(p));
-}
-
-/* global iommu list, set NULL for ignored DMAR units */
-static struct intel_iommu **g_iommus;
-
-static void __init check_tylersburg_isoch(void);
-static int rwbf_quirk;
-
-/*
- * Set to 1 to panic the kernel if VT-d can't be successfully enabled
- * (used when the kernel is launched with TXT).
- */
-static int force_on = 0;
-int intel_iommu_tboot_noforce;
-static int no_platform_optin;
-
-#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
-
-/*
- * Take a root_entry and return the Lower Context Table Pointer (LCTP)
- * if marked present.
- */
-static phys_addr_t root_entry_lctp(struct root_entry *re)
-{
- if (!(re->lo & 1))
- return 0;
-
- return re->lo & VTD_PAGE_MASK;
-}
-
-/*
- * Take a root_entry and return the Upper Context Table Pointer (UCTP)
- * if marked present.
- */
-static phys_addr_t root_entry_uctp(struct root_entry *re)
-{
- if (!(re->hi & 1))
- return 0;
-
- return re->hi & VTD_PAGE_MASK;
-}
-
-static inline void context_clear_pasid_enable(struct context_entry *context)
-{
- context->lo &= ~(1ULL << 11);
-}
-
-static inline bool context_pasid_enabled(struct context_entry *context)
-{
- return !!(context->lo & (1ULL << 11));
-}
-
-static inline void context_set_copied(struct context_entry *context)
-{
- context->hi |= (1ull << 3);
-}
-
-static inline bool context_copied(struct context_entry *context)
-{
- return !!(context->hi & (1ULL << 3));
-}
-
-static inline bool __context_present(struct context_entry *context)
-{
- return (context->lo & 1);
-}
-
-bool context_present(struct context_entry *context)
-{
- return context_pasid_enabled(context) ?
- __context_present(context) :
- __context_present(context) && !context_copied(context);
-}
-
-static inline void context_set_present(struct context_entry *context)
-{
- context->lo |= 1;
-}
-
-static inline void context_set_fault_enable(struct context_entry *context)
-{
- context->lo &= (((u64)-1) << 2) | 1;
-}
-
-static inline void context_set_translation_type(struct context_entry *context,
- unsigned long value)
-{
- context->lo &= (((u64)-1) << 4) | 3;
- context->lo |= (value & 3) << 2;
-}
-
-static inline void context_set_address_root(struct context_entry *context,
- unsigned long value)
-{
- context->lo &= ~VTD_PAGE_MASK;
- context->lo |= value & VTD_PAGE_MASK;
-}
-
-static inline void context_set_address_width(struct context_entry *context,
- unsigned long value)
-{
- context->hi |= value & 7;
-}
-
-static inline void context_set_domain_id(struct context_entry *context,
- unsigned long value)
-{
- context->hi |= (value & ((1 << 16) - 1)) << 8;
-}
-
-static inline int context_domain_id(struct context_entry *c)
-{
- return((c->hi >> 8) & 0xffff);
-}
-
-static inline void context_clear_entry(struct context_entry *context)
-{
- context->lo = 0;
- context->hi = 0;
-}
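-
-/*
- * Summary of the legacy mode context entry layout manipulated above
- * (for illustration): the low qword holds the present bit (bit 0), the
- * fault processing disable bit (bit 1, cleared by
- * context_set_fault_enable()), the translation type (bits 3:2) and the
- * page-table root address (bits 63:12); the high qword holds the
- * address width (bits 2:0) and the domain id (bits 23:8).
- */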
-
-/*
- * This domain is a static identity mapping domain.
- *	1. This domain creates a static 1:1 mapping to all usable memory.
- *	2. It maps to each iommu if successful.
- *	3. Each iommu maps to this domain if successful.
- */
-static struct dmar_domain *si_domain;
-static int hw_pass_through = 1;
-
-#define for_each_domain_iommu(idx, domain) \
- for (idx = 0; idx < g_num_of_iommus; idx++) \
- if (domain->iommu_refcnt[idx])
-
-struct dmar_rmrr_unit {
- struct list_head list; /* list of rmrr units */
- struct acpi_dmar_header *hdr; /* ACPI header */
- u64 base_address; /* reserved base address*/
- u64 end_address; /* reserved end address */
- struct dmar_dev_scope *devices; /* target devices */
- int devices_cnt; /* target device count */
-};
-
-struct dmar_atsr_unit {
- struct list_head list; /* list of ATSR units */
- struct acpi_dmar_header *hdr; /* ACPI header */
- struct dmar_dev_scope *devices; /* target devices */
- int devices_cnt; /* target device count */
- u8 include_all:1; /* include all ports */
-};
-
-static LIST_HEAD(dmar_atsr_units);
-static LIST_HEAD(dmar_rmrr_units);
-
-#define for_each_rmrr_units(rmrr) \
- list_for_each_entry(rmrr, &dmar_rmrr_units, list)
-
-/* number of intel_iommus in the system, used to size the g_iommus array */
-static int g_num_of_iommus;
-
-static void domain_exit(struct dmar_domain *domain);
-static void domain_remove_dev_info(struct dmar_domain *domain);
-static void dmar_remove_one_dev_info(struct device *dev);
-static void __dmar_remove_one_dev_info(struct device_domain_info *info);
-static int intel_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev);
-static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova);
-
-#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
-int dmar_disabled = 0;
-#else
-int dmar_disabled = 1;
-#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
-
-#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
-int intel_iommu_sm = 1;
-#else
-int intel_iommu_sm;
-#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
-
-int intel_iommu_enabled = 0;
-EXPORT_SYMBOL_GPL(intel_iommu_enabled);
-
-static int dmar_map_gfx = 1;
-static int dmar_forcedac;
-static int intel_iommu_strict;
-static int intel_iommu_superpage = 1;
-static int iommu_identity_mapping;
-static int intel_no_bounce;
-
-#define IDENTMAP_GFX 2
-#define IDENTMAP_AZALIA 4
-
-int intel_iommu_gfx_mapped;
-EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
-
-#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
-#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
-struct device_domain_info *get_domain_info(struct device *dev)
-{
- struct device_domain_info *info;
-
- if (!dev)
- return NULL;
-
- info = dev->archdata.iommu;
- if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
- info == DEFER_DEVICE_DOMAIN_INFO))
- return NULL;
-
- return info;
-}
-
-DEFINE_SPINLOCK(device_domain_lock);
-static LIST_HEAD(device_domain_list);
-
-#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
- to_pci_dev(d)->untrusted)
-
-/*
- * Iterate over elements in device_domain_list and call the specified
- * callback @fn against each element.
- */
-int for_each_device_domain(int (*fn)(struct device_domain_info *info,
- void *data), void *data)
-{
- int ret = 0;
- unsigned long flags;
- struct device_domain_info *info;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- list_for_each_entry(info, &device_domain_list, global) {
- ret = fn(info, data);
- if (ret) {
- spin_unlock_irqrestore(&device_domain_lock, flags);
- return ret;
- }
- }
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
- return 0;
-}
-
-const struct iommu_ops intel_iommu_ops;
-
-static bool translation_pre_enabled(struct intel_iommu *iommu)
-{
- return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
-}
-
-static void clear_translation_pre_enabled(struct intel_iommu *iommu)
-{
- iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
-}
-
-static void init_translation_status(struct intel_iommu *iommu)
-{
- u32 gsts;
-
- gsts = readl(iommu->reg + DMAR_GSTS_REG);
- if (gsts & DMA_GSTS_TES)
- iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
-}
-
-static int __init intel_iommu_setup(char *str)
-{
- if (!str)
- return -EINVAL;
- while (*str) {
- if (!strncmp(str, "on", 2)) {
- dmar_disabled = 0;
- pr_info("IOMMU enabled\n");
- } else if (!strncmp(str, "off", 3)) {
- dmar_disabled = 1;
- no_platform_optin = 1;
- pr_info("IOMMU disabled\n");
- } else if (!strncmp(str, "igfx_off", 8)) {
- dmar_map_gfx = 0;
- pr_info("Disable GFX device mapping\n");
- } else if (!strncmp(str, "forcedac", 8)) {
- pr_info("Forcing DAC for PCI devices\n");
- dmar_forcedac = 1;
- } else if (!strncmp(str, "strict", 6)) {
- pr_info("Disable batched IOTLB flush\n");
- intel_iommu_strict = 1;
- } else if (!strncmp(str, "sp_off", 6)) {
- pr_info("Disable supported super page\n");
- intel_iommu_superpage = 0;
- } else if (!strncmp(str, "sm_on", 5)) {
- pr_info("Intel-IOMMU: scalable mode supported\n");
- intel_iommu_sm = 1;
- } else if (!strncmp(str, "tboot_noforce", 13)) {
- pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
- intel_iommu_tboot_noforce = 1;
- } else if (!strncmp(str, "nobounce", 8)) {
- pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
- intel_no_bounce = 1;
- }
-
- str += strcspn(str, ",");
- while (*str == ',')
- str++;
- }
- return 0;
-}
-__setup("intel_iommu=", intel_iommu_setup);
-
-static struct kmem_cache *iommu_domain_cache;
-static struct kmem_cache *iommu_devinfo_cache;
-
-static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
-{
- struct dmar_domain **domains;
- int idx = did >> 8;
-
- domains = iommu->domains[idx];
- if (!domains)
- return NULL;
-
- return domains[did & 0xff];
-}
-
-static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
- struct dmar_domain *domain)
-{
- struct dmar_domain **domains;
- int idx = did >> 8;
-
- if (!iommu->domains[idx]) {
- size_t size = 256 * sizeof(struct dmar_domain *);
- iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
- }
-
- domains = iommu->domains[idx];
- if (WARN_ON(!domains))
- return;
- else
- domains[did & 0xff] = domain;
-}
-
-void *alloc_pgtable_page(int node)
-{
- struct page *page;
- void *vaddr = NULL;
-
- page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
- if (page)
- vaddr = page_address(page);
- return vaddr;
-}
-
-void free_pgtable_page(void *vaddr)
-{
- free_page((unsigned long)vaddr);
-}
-
-static inline void *alloc_domain_mem(void)
-{
- return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
-}
-
-static void free_domain_mem(void *vaddr)
-{
- kmem_cache_free(iommu_domain_cache, vaddr);
-}
-
-static inline void * alloc_devinfo_mem(void)
-{
- return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
-}
-
-static inline void free_devinfo_mem(void *vaddr)
-{
- kmem_cache_free(iommu_devinfo_cache, vaddr);
-}
-
-static inline int domain_type_is_si(struct dmar_domain *domain)
-{
- return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
-}
-
-static inline bool domain_use_first_level(struct dmar_domain *domain)
-{
- return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
-}
-
-static inline int domain_pfn_supported(struct dmar_domain *domain,
- unsigned long pfn)
-{
- int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-
- return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
-}
-
-static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
-{
- unsigned long sagaw;
- int agaw = -1;
-
- sagaw = cap_sagaw(iommu->cap);
- for (agaw = width_to_agaw(max_gaw);
- agaw >= 0; agaw--) {
- if (test_bit(agaw, &sagaw))
- break;
- }
-
- return agaw;
-}
-
-/*
- * Calculate max SAGAW for each iommu.
- */
-int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
-{
- return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
-}
-
-/*
- * Calculate the agaw for each iommu.
- * "SAGAW" may differ across iommus, so use a default agaw and fall back
- * to a smaller supported agaw for iommus that don't support the default.
- */
-int iommu_calculate_agaw(struct intel_iommu *iommu)
-{
- return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
-}
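-
-/*
- * Worked example (illustration only): with the default 57-bit width,
- * __iommu_calculate_agaw() starts at width_to_agaw(57) == 3 and walks
- * down until it finds a bit set in cap_sagaw(); hardware that only
- * supports 4-level tables (sagaw bit 2) therefore ends up with agaw 2,
- * i.e. a 48-bit address width.
- */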
-
-/* This function only returns a single iommu in a domain */
-struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
-{
- int iommu_id;
-
- /* si_domain and vm domain should not get here. */
- if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
- return NULL;
-
- for_each_domain_iommu(iommu_id, domain)
- break;
-
- if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
- return NULL;
-
- return g_iommus[iommu_id];
-}
-
-static void domain_update_iommu_coherency(struct dmar_domain *domain)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- bool found = false;
- int i;
-
- domain->iommu_coherency = 1;
-
- for_each_domain_iommu(i, domain) {
- found = true;
- if (!ecap_coherent(g_iommus[i]->ecap)) {
- domain->iommu_coherency = 0;
- break;
- }
- }
- if (found)
- return;
-
- /* No hardware attached; use lowest common denominator */
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (!ecap_coherent(iommu->ecap)) {
- domain->iommu_coherency = 0;
- break;
- }
- }
- rcu_read_unlock();
-}
-
-static int domain_update_iommu_snooping(struct intel_iommu *skip)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- int ret = 1;
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (iommu != skip) {
- if (!ecap_sc_support(iommu->ecap)) {
- ret = 0;
- break;
- }
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static int domain_update_iommu_superpage(struct dmar_domain *domain,
- struct intel_iommu *skip)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- int mask = 0x3;
-
- if (!intel_iommu_superpage) {
- return 0;
- }
-
- /* set iommu_superpage to the smallest common denominator */
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (iommu != skip) {
- if (domain && domain_use_first_level(domain)) {
- if (!cap_fl1gp_support(iommu->cap))
- mask = 0x1;
- } else {
- mask &= cap_super_page_val(iommu->cap);
- }
-
- if (!mask)
- break;
- }
- }
- rcu_read_unlock();
-
- return fls(mask);
-}
-
-/* Some capabilities may be different across iommus */
-static void domain_update_iommu_cap(struct dmar_domain *domain)
-{
- domain_update_iommu_coherency(domain);
- domain->iommu_snooping = domain_update_iommu_snooping(NULL);
- domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
-}
-
-struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
- u8 devfn, int alloc)
-{
- struct root_entry *root = &iommu->root_entry[bus];
- struct context_entry *context;
- u64 *entry;
-
- entry = &root->lo;
- if (sm_supported(iommu)) {
- if (devfn >= 0x80) {
- devfn -= 0x80;
- entry = &root->hi;
- }
- devfn *= 2;
- }
- if (*entry & 1)
- context = phys_to_virt(*entry & VTD_PAGE_MASK);
- else {
- unsigned long phy_addr;
- if (!alloc)
- return NULL;
-
- context = alloc_pgtable_page(iommu->node);
- if (!context)
- return NULL;
-
- __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
- phy_addr = virt_to_phys((void *)context);
- *entry = phy_addr | 1;
- __iommu_flush_cache(iommu, entry, sizeof(*entry));
- }
- return &context[devfn];
-}
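-
-/*
- * Worked example (illustration only): in scalable mode, devfn 0x83 is
- * looked up through root->hi and the index becomes (0x83 - 0x80) * 2 = 6,
- * since a scalable mode context entry occupies two struct context_entry
- * slots and each half of the root entry therefore covers 128 devices.
- */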
-
-static int iommu_dummy(struct device *dev)
-{
- return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
-}
-
-static bool attach_deferred(struct device *dev)
-{
- return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
-}
-
-/**
- * is_downstream_to_pci_bridge - test if a device belongs to the PCI
- * sub-hierarchy of a candidate PCI-PCI bridge
- * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
- * @bridge: the candidate PCI-PCI bridge
- *
- * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
- */
-static bool
-is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
-{
- struct pci_dev *pdev, *pbridge;
-
- if (!dev_is_pci(dev) || !dev_is_pci(bridge))
- return false;
-
- pdev = to_pci_dev(dev);
- pbridge = to_pci_dev(bridge);
-
- if (pbridge->subordinate &&
- pbridge->subordinate->number <= pdev->bus->number &&
- pbridge->subordinate->busn_res.end >= pdev->bus->number)
- return true;
-
- return false;
-}
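-
-/*
- * Example (illustration only): for a bridge whose subordinate bus range
- * is [0x05, 0x0a], a device on bus 0x07 is downstream of it and this
- * returns true, while a device on bus 0x03 is not.
- */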
-
-static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
-{
- struct dmar_drhd_unit *drhd = NULL;
- struct intel_iommu *iommu;
- struct device *tmp;
- struct pci_dev *pdev = NULL;
- u16 segment = 0;
- int i;
-
- if (iommu_dummy(dev))
- return NULL;
-
- if (dev_is_pci(dev)) {
- struct pci_dev *pf_pdev;
-
- pdev = pci_real_dma_dev(to_pci_dev(dev));
-
- /* VFs aren't listed in scope tables; we need to look up
- * the PF instead to find the IOMMU. */
- pf_pdev = pci_physfn(pdev);
- dev = &pf_pdev->dev;
- segment = pci_domain_nr(pdev->bus);
- } else if (has_acpi_companion(dev))
- dev = &ACPI_COMPANION(dev)->dev;
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (pdev && segment != drhd->segment)
- continue;
-
- for_each_active_dev_scope(drhd->devices,
- drhd->devices_cnt, i, tmp) {
- if (tmp == dev) {
- /* For a VF use its original BDF# not that of the PF
- * which we used for the IOMMU lookup. Strictly speaking
- * we could do this for all PCI devices; we only need to
- * get the BDF# from the scope table for ACPI matches. */
- if (pdev && pdev->is_virtfn)
- goto got_pdev;
-
- *bus = drhd->devices[i].bus;
- *devfn = drhd->devices[i].devfn;
- goto out;
- }
-
- if (is_downstream_to_pci_bridge(dev, tmp))
- goto got_pdev;
- }
-
- if (pdev && drhd->include_all) {
- got_pdev:
- *bus = pdev->bus->number;
- *devfn = pdev->devfn;
- goto out;
- }
- }
- iommu = NULL;
- out:
- rcu_read_unlock();
-
- return iommu;
-}
-
-static void domain_flush_cache(struct dmar_domain *domain,
- void *addr, int size)
-{
- if (!domain->iommu_coherency)
- clflush_cache_range(addr, size);
-}
-
-static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
-{
- struct context_entry *context;
- int ret = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&iommu->lock, flags);
- context = iommu_context_addr(iommu, bus, devfn, 0);
- if (context)
- ret = context_present(context);
- spin_unlock_irqrestore(&iommu->lock, flags);
- return ret;
-}
-
-static void free_context_table(struct intel_iommu *iommu)
-{
- int i;
- unsigned long flags;
- struct context_entry *context;
-
- spin_lock_irqsave(&iommu->lock, flags);
- if (!iommu->root_entry) {
- goto out;
- }
- for (i = 0; i < ROOT_ENTRY_NR; i++) {
- context = iommu_context_addr(iommu, i, 0, 0);
- if (context)
- free_pgtable_page(context);
-
- if (!sm_supported(iommu))
- continue;
-
- context = iommu_context_addr(iommu, i, 0x80, 0);
- if (context)
- free_pgtable_page(context);
-
- }
- free_pgtable_page(iommu->root_entry);
- iommu->root_entry = NULL;
-out:
- spin_unlock_irqrestore(&iommu->lock, flags);
-}
-
-static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
- unsigned long pfn, int *target_level)
-{
- struct dma_pte *parent, *pte;
- int level = agaw_to_level(domain->agaw);
- int offset;
-
- BUG_ON(!domain->pgd);
-
- if (!domain_pfn_supported(domain, pfn))
- /* Address beyond IOMMU's addressing capabilities. */
- return NULL;
-
- parent = domain->pgd;
-
- while (1) {
- void *tmp_page;
-
- offset = pfn_level_offset(pfn, level);
- pte = &parent[offset];
- if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
- break;
- if (level == *target_level)
- break;
-
- if (!dma_pte_present(pte)) {
- uint64_t pteval;
-
- tmp_page = alloc_pgtable_page(domain->nid);
-
- if (!tmp_page)
- return NULL;
-
- domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
- pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
- if (domain_use_first_level(domain))
- pteval |= DMA_FL_PTE_XD;
- if (cmpxchg64(&pte->val, 0ULL, pteval))
- /* Someone else set it while we were thinking; use theirs. */
- free_pgtable_page(tmp_page);
- else
- domain_flush_cache(domain, pte, sizeof(*pte));
- }
- if (level == 1)
- break;
-
- parent = phys_to_virt(dma_pte_addr(pte));
- level--;
- }
-
- if (!*target_level)
- *target_level = level;
-
- return pte;
-}
-
-/* return address's pte at specific level */
-static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
- unsigned long pfn,
- int level, int *large_page)
-{
- struct dma_pte *parent, *pte;
- int total = agaw_to_level(domain->agaw);
- int offset;
-
- parent = domain->pgd;
- while (level <= total) {
- offset = pfn_level_offset(pfn, total);
- pte = &parent[offset];
- if (level == total)
- return pte;
-
- if (!dma_pte_present(pte)) {
- *large_page = total;
- break;
- }
-
- if (dma_pte_superpage(pte)) {
- *large_page = total;
- return pte;
- }
-
- parent = phys_to_virt(dma_pte_addr(pte));
- total--;
- }
- return NULL;
-}
-
-/* clear last level pte, a tlb flush should follow */
-static void dma_pte_clear_range(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn)
-{
- unsigned int large_page;
- struct dma_pte *first_pte, *pte;
-
- BUG_ON(!domain_pfn_supported(domain, start_pfn));
- BUG_ON(!domain_pfn_supported(domain, last_pfn));
- BUG_ON(start_pfn > last_pfn);
-
- /* we don't need lock here; nobody else touches the iova range */
- do {
- large_page = 1;
- first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
- if (!pte) {
- start_pfn = align_to_level(start_pfn + 1, large_page + 1);
- continue;
- }
- do {
- dma_clear_pte(pte);
- start_pfn += lvl_to_nr_pages(large_page);
- pte++;
- } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
-
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
-
- } while (start_pfn && start_pfn <= last_pfn);
-}
-
-static void dma_pte_free_level(struct dmar_domain *domain, int level,
- int retain_level, struct dma_pte *pte,
- unsigned long pfn, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn;
- struct dma_pte *level_pte;
-
- if (!dma_pte_present(pte) || dma_pte_superpage(pte))
- goto next;
-
- level_pfn = pfn & level_mask(level);
- level_pte = phys_to_virt(dma_pte_addr(pte));
-
- if (level > 2) {
- dma_pte_free_level(domain, level - 1, retain_level,
- level_pte, level_pfn, start_pfn,
- last_pfn);
- }
-
- /*
- * Free the page table if we're below the level we want to
- * retain and the range covers the entire table.
- */
- if (level < retain_level && !(start_pfn > level_pfn ||
- last_pfn < level_pfn + level_size(level) - 1)) {
- dma_clear_pte(pte);
- domain_flush_cache(domain, pte, sizeof(*pte));
- free_pgtable_page(level_pte);
- }
-next:
- pfn += level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-}
-
-/*
- * clear last level (leaf) ptes and free page table pages below the
- * level we wish to keep intact.
- */
-static void dma_pte_free_pagetable(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn,
- int retain_level)
-{
- BUG_ON(!domain_pfn_supported(domain, start_pfn));
- BUG_ON(!domain_pfn_supported(domain, last_pfn));
- BUG_ON(start_pfn > last_pfn);
-
- dma_pte_clear_range(domain, start_pfn, last_pfn);
-
- /* We don't need lock here; nobody else touches the iova range */
- dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
- domain->pgd, 0, start_pfn, last_pfn);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- free_pgtable_page(domain->pgd);
- domain->pgd = NULL;
- }
-}
-
-/* When a page at a given level is being unlinked from its parent, we don't
- need to *modify* it at all. All we need to do is make a list of all the
- pages which can be freed just as soon as we've flushed the IOTLB and we
- know the hardware page-walk will no longer touch them.
- The 'pte' argument is the *parent* PTE, pointing to the page that is to
- be freed. */
-static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
- int level, struct dma_pte *pte,
- struct page *freelist)
-{
- struct page *pg;
-
- pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
- pg->freelist = freelist;
- freelist = pg;
-
- if (level == 1)
- return freelist;
-
- pte = page_address(pg);
- do {
- if (dma_pte_present(pte) && !dma_pte_superpage(pte))
- freelist = dma_pte_list_pagetables(domain, level - 1,
- pte, freelist);
- pte++;
- } while (!first_pte_in_page(pte));
-
- return freelist;
-}
-
-static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
- struct dma_pte *pte, unsigned long pfn,
- unsigned long start_pfn,
- unsigned long last_pfn,
- struct page *freelist)
-{
- struct dma_pte *first_pte = NULL, *last_pte = NULL;
-
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn;
-
- if (!dma_pte_present(pte))
- goto next;
-
- level_pfn = pfn & level_mask(level);
-
- /* If range covers entire pagetable, free it */
- if (start_pfn <= level_pfn &&
- last_pfn >= level_pfn + level_size(level) - 1) {
-			/* These subordinate page tables are going away entirely. Don't
- bother to clear them; we're just going to *free* them. */
- if (level > 1 && !dma_pte_superpage(pte))
- freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
-
- dma_clear_pte(pte);
- if (!first_pte)
- first_pte = pte;
- last_pte = pte;
- } else if (level > 1) {
- /* Recurse down into a level that isn't *entirely* obsolete */
- freelist = dma_pte_clear_level(domain, level - 1,
- phys_to_virt(dma_pte_addr(pte)),
- level_pfn, start_pfn, last_pfn,
- freelist);
- }
-next:
- pfn += level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-
- if (first_pte)
- domain_flush_cache(domain, first_pte,
- (void *)++last_pte - (void *)first_pte);
-
- return freelist;
-}
-
-/* We can't just free the pages because the IOMMU may still be walking
- the page tables, and may have cached the intermediate levels. The
- pages can only be freed after the IOTLB flush has been done. */
-static struct page *domain_unmap(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn)
-{
- struct page *freelist;
-
- BUG_ON(!domain_pfn_supported(domain, start_pfn));
- BUG_ON(!domain_pfn_supported(domain, last_pfn));
- BUG_ON(start_pfn > last_pfn);
-
- /* we don't need lock here; nobody else touches the iova range */
- freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
- domain->pgd, 0, start_pfn, last_pfn, NULL);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- struct page *pgd_page = virt_to_page(domain->pgd);
- pgd_page->freelist = freelist;
- freelist = pgd_page;
-
- domain->pgd = NULL;
- }
-
- return freelist;
-}
-
-static void dma_free_pagelist(struct page *freelist)
-{
- struct page *pg;
-
- while ((pg = freelist)) {
- freelist = pg->freelist;
- free_pgtable_page(page_address(pg));
- }
-}
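-
-/*
- * Typical calling pattern (minimal sketch, not a verbatim copy of any
- * caller in this file):
- *
- *	freelist = domain_unmap(domain, start_pfn, last_pfn);
- *	for_each_domain_iommu(idx, domain)
- *		iommu_flush_iotlb_psi(g_iommus[idx], domain, start_pfn,
- *				      last_pfn - start_pfn + 1, 0, 0);
- *	dma_free_pagelist(freelist);
- *
- * i.e. the page-table pages are only handed back to the allocator after
- * every IOMMU attached to the domain has flushed its IOTLB.
- */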
-
-static void iova_entry_free(unsigned long data)
-{
- struct page *freelist = (struct page *)data;
-
- dma_free_pagelist(freelist);
-}
-
-/* iommu handling */
-static int iommu_alloc_root_entry(struct intel_iommu *iommu)
-{
- struct root_entry *root;
- unsigned long flags;
-
- root = (struct root_entry *)alloc_pgtable_page(iommu->node);
- if (!root) {
- pr_err("Allocating root entry for %s failed\n",
- iommu->name);
- return -ENOMEM;
- }
-
- __iommu_flush_cache(iommu, root, ROOT_SIZE);
-
- spin_lock_irqsave(&iommu->lock, flags);
- iommu->root_entry = root;
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- return 0;
-}
-
-static void iommu_set_root_entry(struct intel_iommu *iommu)
-{
- u64 addr;
- u32 sts;
- unsigned long flag;
-
- addr = virt_to_phys(iommu->root_entry);
- if (sm_supported(iommu))
- addr |= DMA_RTADDR_SMT;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
-
- writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
-
- /* Make sure hardware complete it */
- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
- readl, (sts & DMA_GSTS_RTPS), sts);
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void iommu_flush_write_buffer(struct intel_iommu *iommu)
-{
- u32 val;
- unsigned long flag;
-
- if (!rwbf_quirk && !cap_rwbf(iommu->cap))
- return;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
-
- /* Make sure hardware complete it */
- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
- readl, (!(val & DMA_GSTS_WBFS)), val);
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-/* return value determines if we need a write buffer flush */
-static void __iommu_flush_context(struct intel_iommu *iommu,
- u16 did, u16 source_id, u8 function_mask,
- u64 type)
-{
- u64 val = 0;
- unsigned long flag;
-
- switch (type) {
- case DMA_CCMD_GLOBAL_INVL:
- val = DMA_CCMD_GLOBAL_INVL;
- break;
- case DMA_CCMD_DOMAIN_INVL:
- val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
- break;
- case DMA_CCMD_DEVICE_INVL:
- val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
- | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
- break;
- default:
- BUG();
- }
- val |= DMA_CCMD_ICC;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
-
- /* Make sure hardware complete it */
- IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
- dmar_readq, (!(val & DMA_CCMD_ICC)), val);
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-/* return value determines if we need a write buffer flush */
-static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
- u64 addr, unsigned int size_order, u64 type)
-{
- int tlb_offset = ecap_iotlb_offset(iommu->ecap);
- u64 val = 0, val_iva = 0;
- unsigned long flag;
-
- switch (type) {
- case DMA_TLB_GLOBAL_FLUSH:
-		/* global flush doesn't need to set IVA_REG */
- val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
- break;
- case DMA_TLB_DSI_FLUSH:
- val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
- break;
- case DMA_TLB_PSI_FLUSH:
- val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
- /* IH bit is passed in as part of address */
- val_iva = size_order | addr;
- break;
- default:
- BUG();
- }
- /* Note: set drain read/write */
-#if 0
- /*
-	 * This is probably meant to be extra safe. It looks like we can
- * ignore it without any impact.
- */
- if (cap_read_drain(iommu->cap))
- val |= DMA_TLB_READ_DRAIN;
-#endif
- if (cap_write_drain(iommu->cap))
- val |= DMA_TLB_WRITE_DRAIN;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- /* Note: Only uses first TLB reg currently */
- if (val_iva)
- dmar_writeq(iommu->reg + tlb_offset, val_iva);
- dmar_writeq(iommu->reg + tlb_offset + 8, val);
-
- /* Make sure hardware complete it */
- IOMMU_WAIT_OP(iommu, tlb_offset + 8,
- dmar_readq, (!(val & DMA_TLB_IVT)), val);
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-
- /* check IOTLB invalidation granularity */
- if (DMA_TLB_IAIG(val) == 0)
- pr_err("Flush IOTLB failed\n");
- if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
- pr_debug("TLB flush request %Lx, actual %Lx\n",
- (unsigned long long)DMA_TLB_IIRG(type),
- (unsigned long long)DMA_TLB_IAIG(val));
-}
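-
-/*
- * Example (illustration only): a page-selective flush of 16 pages at a
- * suitably aligned address is issued as __iommu_flush_iotlb(iommu, did,
- * addr, 4, DMA_TLB_PSI_FLUSH); size_order is the log2 of the number of
- * pages and is OR'ed into the IVA register value together with the
- * address (with the IH hint already folded into addr).
- */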
-
-static struct device_domain_info *
-iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
- u8 bus, u8 devfn)
-{
- struct device_domain_info *info;
-
- assert_spin_locked(&device_domain_lock);
-
- if (!iommu->qi)
- return NULL;
-
- list_for_each_entry(info, &domain->devices, link)
- if (info->iommu == iommu && info->bus == bus &&
- info->devfn == devfn) {
- if (info->ats_supported && info->dev)
- return info;
- break;
- }
-
- return NULL;
-}
-
-static void domain_update_iotlb(struct dmar_domain *domain)
-{
- struct device_domain_info *info;
- bool has_iotlb_device = false;
-
- assert_spin_locked(&device_domain_lock);
-
- list_for_each_entry(info, &domain->devices, link) {
- struct pci_dev *pdev;
-
- if (!info->dev || !dev_is_pci(info->dev))
- continue;
-
- pdev = to_pci_dev(info->dev);
- if (pdev->ats_enabled) {
- has_iotlb_device = true;
- break;
- }
- }
-
- domain->has_iotlb_device = has_iotlb_device;
-}
-
-static void iommu_enable_dev_iotlb(struct device_domain_info *info)
-{
- struct pci_dev *pdev;
-
- assert_spin_locked(&device_domain_lock);
-
- if (!info || !dev_is_pci(info->dev))
- return;
-
- pdev = to_pci_dev(info->dev);
-	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
- * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
- * queue depth at PF level. If DIT is not set, PFSID will be treated as
- * reserved, which should be set to 0.
- */
- if (!ecap_dit(info->iommu->ecap))
- info->pfsid = 0;
- else {
- struct pci_dev *pf_pdev;
-
- /* pdev will be returned if device is not a vf */
- pf_pdev = pci_physfn(pdev);
- info->pfsid = pci_dev_id(pf_pdev);
- }
-
-#ifdef CONFIG_INTEL_IOMMU_SVM
- /* The PCIe spec, in its wisdom, declares that the behaviour of
- the device if you enable PASID support after ATS support is
- undefined. So always enable PASID support on devices which
- have it, even if we can't yet know if we're ever going to
- use it. */
- if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
- info->pasid_enabled = 1;
-
- if (info->pri_supported &&
- (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
- !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
- info->pri_enabled = 1;
-#endif
- if (info->ats_supported && pci_ats_page_aligned(pdev) &&
- !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
- info->ats_enabled = 1;
- domain_update_iotlb(info->domain);
- info->ats_qdep = pci_ats_queue_depth(pdev);
- }
-}
-
-static void iommu_disable_dev_iotlb(struct device_domain_info *info)
-{
- struct pci_dev *pdev;
-
- assert_spin_locked(&device_domain_lock);
-
- if (!dev_is_pci(info->dev))
- return;
-
- pdev = to_pci_dev(info->dev);
-
- if (info->ats_enabled) {
- pci_disable_ats(pdev);
- info->ats_enabled = 0;
- domain_update_iotlb(info->domain);
- }
-#ifdef CONFIG_INTEL_IOMMU_SVM
- if (info->pri_enabled) {
- pci_disable_pri(pdev);
- info->pri_enabled = 0;
- }
- if (info->pasid_enabled) {
- pci_disable_pasid(pdev);
- info->pasid_enabled = 0;
- }
-#endif
-}
-
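-/*
- * Issue a device-IOTLB (ATS) invalidation for the given address range to
- * every ATS-enabled device attached to the domain.
- */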
-static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
- u64 addr, unsigned mask)
-{
- u16 sid, qdep;
- unsigned long flags;
- struct device_domain_info *info;
-
- if (!domain->has_iotlb_device)
- return;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- list_for_each_entry(info, &domain->devices, link) {
- if (!info->ats_enabled)
- continue;
-
- sid = info->bus << 8 | info->devfn;
- qdep = info->ats_qdep;
- qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
- qdep, addr, mask);
- }
- spin_unlock_irqrestore(&device_domain_lock, flags);
-}
-
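-/*
- * PASID-based IOTLB invalidation for domains using first-level translation:
- * flush the domain's default PASID (if set) and, when devices are attached,
- * the RID2PASID entry used for requests without PASID.
- */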
-static void domain_flush_piotlb(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- u64 addr, unsigned long npages, bool ih)
-{
- u16 did = domain->iommu_did[iommu->seq_id];
-
- if (domain->default_pasid)
- qi_flush_piotlb(iommu, did, domain->default_pasid,
- addr, npages, ih);
-
- if (!list_empty(&domain->devices))
- qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
-}
-
-static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- unsigned long pfn, unsigned int pages,
- int ih, int map)
-{
- unsigned int mask = ilog2(__roundup_pow_of_two(pages));
- uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
- u16 did = domain->iommu_did[iommu->seq_id];
-
- BUG_ON(pages == 0);
-
- if (ih)
- ih = 1 << 6;
-
- if (domain_use_first_level(domain)) {
- domain_flush_piotlb(iommu, domain, addr, pages, ih);
- } else {
- /*
-		 * Fall back to domain-selective flush if there is no PSI
-		 * support or the size is too big. PSI requires the page size
-		 * to be 2^x and the base address to be naturally aligned to
-		 * that size.
- */
- if (!cap_pgsel_inv(iommu->cap) ||
- mask > cap_max_amask_val(iommu->cap))
- iommu->flush.flush_iotlb(iommu, did, 0, 0,
- DMA_TLB_DSI_FLUSH);
- else
- iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
- DMA_TLB_PSI_FLUSH);
- }
-
- /*
-	 * In caching mode, changes of pages from non-present to present
-	 * require a flush. However, the device IOTLB doesn't need to be
-	 * flushed in this case.
- */
- if (!cap_caching_mode(iommu->cap) || !map)
- iommu_flush_dev_iotlb(domain, addr, mask);
-}
-
-/* Notification for newly created mappings */
-static inline void __mapping_notify_one(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- unsigned long pfn, unsigned int pages)
-{
- /*
- * It's a non-present to present mapping. Only flush if caching mode
- * and second level.
- */
- if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
- iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
- else
- iommu_flush_write_buffer(iommu);
-}
-
-static void iommu_flush_iova(struct iova_domain *iovad)
-{
- struct dmar_domain *domain;
- int idx;
-
- domain = container_of(iovad, struct dmar_domain, iovad);
-
- for_each_domain_iommu(idx, domain) {
- struct intel_iommu *iommu = g_iommus[idx];
- u16 did = domain->iommu_did[iommu->seq_id];
-
- if (domain_use_first_level(domain))
- domain_flush_piotlb(iommu, domain, 0, -1, 0);
- else
- iommu->flush.flush_iotlb(iommu, did, 0, 0,
- DMA_TLB_DSI_FLUSH);
-
- if (!cap_caching_mode(iommu->cap))
- iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
- 0, MAX_AGAW_PFN_WIDTH);
- }
-}
-
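-/*
- * Clear the Enable Protected Memory bit and wait for the protected region
- * status to clear, so the protected low/high memory regions no longer block
- * DMA.
- */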
-static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
-{
- u32 pmen;
- unsigned long flags;
-
- if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
- return;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flags);
- pmen = readl(iommu->reg + DMAR_PMEN_REG);
- pmen &= ~DMA_PMEN_EPM;
- writel(pmen, iommu->reg + DMAR_PMEN_REG);
-
- /* wait for the protected region status bit to clear */
- IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
- readl, !(pmen & DMA_PMEN_PRS), pmen);
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-static void iommu_enable_translation(struct intel_iommu *iommu)
-{
- u32 sts;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flags);
- iommu->gcmd |= DMA_GCMD_TE;
- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	/* Make sure hardware completes it */
- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
- readl, (sts & DMA_GSTS_TES), sts);
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-static void iommu_disable_translation(struct intel_iommu *iommu)
-{
- u32 sts;
- unsigned long flag;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- iommu->gcmd &= ~DMA_GCMD_TE;
- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	/* Make sure hardware completes it */
- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
- readl, (!(sts & DMA_GSTS_TES)), sts);
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
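-/*
- * Allocate the per-IOMMU domain-id bitmap and the two-level domain pointer
- * array, and reserve the domain ids that must not be handed out to normal
- * domains.
- */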
-static int iommu_init_domains(struct intel_iommu *iommu)
-{
- u32 ndomains, nlongs;
- size_t size;
-
- ndomains = cap_ndoms(iommu->cap);
- pr_debug("%s: Number of Domains supported <%d>\n",
- iommu->name, ndomains);
- nlongs = BITS_TO_LONGS(ndomains);
-
- spin_lock_init(&iommu->lock);
-
- iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
- if (!iommu->domain_ids) {
- pr_err("%s: Allocating domain id array failed\n",
- iommu->name);
- return -ENOMEM;
- }
-
- size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
- iommu->domains = kzalloc(size, GFP_KERNEL);
-
- if (iommu->domains) {
- size = 256 * sizeof(struct dmar_domain *);
- iommu->domains[0] = kzalloc(size, GFP_KERNEL);
- }
-
- if (!iommu->domains || !iommu->domains[0]) {
- pr_err("%s: Allocating domain array failed\n",
- iommu->name);
- kfree(iommu->domain_ids);
- kfree(iommu->domains);
- iommu->domain_ids = NULL;
- iommu->domains = NULL;
- return -ENOMEM;
- }
-
- /*
- * If Caching mode is set, then invalid translations are tagged
- * with domain-id 0, hence we need to pre-allocate it. We also
- * use domain-id 0 as a marker for non-allocated domain-id, so
- * make sure it is not used for a real domain.
- */
- set_bit(0, iommu->domain_ids);
-
- /*
-	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
- * entry for first-level or pass-through translation modes should
- * be programmed with a domain id different from those used for
- * second-level or nested translation. We reserve a domain id for
- * this purpose.
- */
- if (sm_supported(iommu))
- set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
-
- return 0;
-}
-
-static void disable_dmar_iommu(struct intel_iommu *iommu)
-{
- struct device_domain_info *info, *tmp;
- unsigned long flags;
-
- if (!iommu->domains || !iommu->domain_ids)
- return;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
- if (info->iommu != iommu)
- continue;
-
- if (!info->dev || !info->domain)
- continue;
-
- __dmar_remove_one_dev_info(info);
- }
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
- if (iommu->gcmd & DMA_GCMD_TE)
- iommu_disable_translation(iommu);
-}
-
-static void free_dmar_iommu(struct intel_iommu *iommu)
-{
- if ((iommu->domains) && (iommu->domain_ids)) {
- int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
- int i;
-
- for (i = 0; i < elems; i++)
- kfree(iommu->domains[i]);
- kfree(iommu->domains);
- kfree(iommu->domain_ids);
- iommu->domains = NULL;
- iommu->domain_ids = NULL;
- }
-
- g_iommus[iommu->seq_id] = NULL;
-
- /* free context mapping */
- free_context_table(iommu);
-
-#ifdef CONFIG_INTEL_IOMMU_SVM
- if (pasid_supported(iommu)) {
- if (ecap_prs(iommu->ecap))
- intel_svm_finish_prq(iommu);
- }
- if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
- ioasid_unregister_allocator(&iommu->pasid_allocator);
-
-#endif
-}
-
-/*
- * Check and return whether first level is used by default for
- * DMA translation.
- */
-static bool first_level_by_default(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- static int first_level_support = -1;
-
- if (likely(first_level_support != -1))
- return first_level_support;
-
- first_level_support = 1;
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
- first_level_support = 0;
- break;
- }
- }
- rcu_read_unlock();
-
- return first_level_support;
-}
-
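-/*
- * Allocate and minimally initialize a dmar_domain. First-level translation
- * is selected here when all active IOMMUs support it.
- */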
-static struct dmar_domain *alloc_domain(int flags)
-{
- struct dmar_domain *domain;
-
- domain = alloc_domain_mem();
- if (!domain)
- return NULL;
-
- memset(domain, 0, sizeof(*domain));
- domain->nid = NUMA_NO_NODE;
- domain->flags = flags;
- if (first_level_by_default())
- domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
- domain->has_iotlb_device = false;
- INIT_LIST_HEAD(&domain->devices);
-
- return domain;
-}
-
-/* Must be called with iommu->lock */
-static int domain_attach_iommu(struct dmar_domain *domain,
- struct intel_iommu *iommu)
-{
- unsigned long ndomains;
- int num;
-
- assert_spin_locked(&device_domain_lock);
- assert_spin_locked(&iommu->lock);
-
- domain->iommu_refcnt[iommu->seq_id] += 1;
- domain->iommu_count += 1;
- if (domain->iommu_refcnt[iommu->seq_id] == 1) {
- ndomains = cap_ndoms(iommu->cap);
- num = find_first_zero_bit(iommu->domain_ids, ndomains);
-
- if (num >= ndomains) {
- pr_err("%s: No free domain ids\n", iommu->name);
- domain->iommu_refcnt[iommu->seq_id] -= 1;
- domain->iommu_count -= 1;
- return -ENOSPC;
- }
-
- set_bit(num, iommu->domain_ids);
- set_iommu_domain(iommu, num, domain);
-
- domain->iommu_did[iommu->seq_id] = num;
- domain->nid = iommu->node;
-
- domain_update_iommu_cap(domain);
- }
-
- return 0;
-}
-
-static int domain_detach_iommu(struct dmar_domain *domain,
- struct intel_iommu *iommu)
-{
- int num, count;
-
- assert_spin_locked(&device_domain_lock);
- assert_spin_locked(&iommu->lock);
-
- domain->iommu_refcnt[iommu->seq_id] -= 1;
- count = --domain->iommu_count;
- if (domain->iommu_refcnt[iommu->seq_id] == 0) {
- num = domain->iommu_did[iommu->seq_id];
- clear_bit(num, iommu->domain_ids);
- set_iommu_domain(iommu, num, NULL);
-
- domain_update_iommu_cap(domain);
- domain->iommu_did[iommu->seq_id] = 0;
- }
-
- return count;
-}
-
-static struct iova_domain reserved_iova_list;
-static struct lock_class_key reserved_rbtree_key;
-
-static int dmar_init_reserved_ranges(void)
-{
- struct pci_dev *pdev = NULL;
- struct iova *iova;
- int i;
-
- init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
-
- lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
- &reserved_rbtree_key);
-
- /* IOAPIC ranges shouldn't be accessed by DMA */
- iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
- IOVA_PFN(IOAPIC_RANGE_END));
- if (!iova) {
- pr_err("Reserve IOAPIC range failed\n");
- return -ENODEV;
- }
-
- /* Reserve all PCI MMIO to avoid peer-to-peer access */
- for_each_pci_dev(pdev) {
- struct resource *r;
-
- for (i = 0; i < PCI_NUM_RESOURCES; i++) {
- r = &pdev->resource[i];
- if (!r->flags || !(r->flags & IORESOURCE_MEM))
- continue;
- iova = reserve_iova(&reserved_iova_list,
- IOVA_PFN(r->start),
- IOVA_PFN(r->end));
- if (!iova) {
- pci_err(pdev, "Reserve iova for %pR failed\n", r);
- return -ENODEV;
- }
- }
- }
- return 0;
-}
-
-static inline int guestwidth_to_adjustwidth(int gaw)
-{
- int agaw;
- int r = (gaw - 12) % 9;
-
- if (r == 0)
- agaw = gaw;
- else
- agaw = gaw + 9 - r;
- if (agaw > 64)
- agaw = 64;
- return agaw;
-}
-
-static void domain_exit(struct dmar_domain *domain)
-{
-
- /* Remove associated devices and clear attached or cached domains */
- domain_remove_dev_info(domain);
-
- /* destroy iovas */
- if (domain->domain.type == IOMMU_DOMAIN_DMA)
- put_iova_domain(&domain->iovad);
-
- if (domain->pgd) {
- struct page *freelist;
-
- freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
- dma_free_pagelist(freelist);
- }
-
- free_domain_mem(domain);
-}
-
-/*
- * Get the PASID directory size for scalable mode context entry.
- * Value of X in the PDTS field of a scalable mode context entry
- * indicates PASID directory with 2^(X + 7) entries.
- */
-static inline unsigned long context_get_sm_pds(struct pasid_table *table)
-{
- int pds, max_pde;
-
- max_pde = table->max_pasid >> PASID_PDE_SHIFT;
- pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
- if (pds < 7)
- return 0;
-
- return pds - 7;
-}
-
-/*
- * Set the RID_PASID field of a scalable mode context entry. The
- * IOMMU hardware will use the PASID value set in this field for
- * DMA translations of DMA requests without PASID.
- */
-static inline void
-context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
-{
- context->hi |= pasid & ((1 << 20) - 1);
- context->hi |= (1 << 20);
-}
-
-/*
- * Set the DTE(Device-TLB Enable) field of a scalable mode context
- * entry.
- */
-static inline void context_set_sm_dte(struct context_entry *context)
-{
- context->lo |= (1 << 2);
-}
-
-/*
- * Set the PRE(Page Request Enable) field of a scalable mode context
- * entry.
- */
-static inline void context_set_sm_pre(struct context_entry *context)
-{
- context->lo |= (1 << 4);
-}
-
-/* Convert value to context PASID directory size field coding. */
-#define context_pdts(pds) (((pds) & 0x7) << 9)
-
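-/*
- * Install a context entry for (bus, devfn) on @iommu so that the device
- * translates through @domain. Handles both legacy and scalable mode
- * context-entry formats and performs the required context/IOTLB flushes.
- */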
-static int domain_context_mapping_one(struct dmar_domain *domain,
- struct intel_iommu *iommu,
- struct pasid_table *table,
- u8 bus, u8 devfn)
-{
- u16 did = domain->iommu_did[iommu->seq_id];
- int translation = CONTEXT_TT_MULTI_LEVEL;
- struct device_domain_info *info = NULL;
- struct context_entry *context;
- unsigned long flags;
- int ret;
-
- WARN_ON(did == 0);
-
- if (hw_pass_through && domain_type_is_si(domain))
- translation = CONTEXT_TT_PASS_THROUGH;
-
- pr_debug("Set context mapping for %02x:%02x.%d\n",
- bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
-
- BUG_ON(!domain->pgd);
-
- spin_lock_irqsave(&device_domain_lock, flags);
- spin_lock(&iommu->lock);
-
- ret = -ENOMEM;
- context = iommu_context_addr(iommu, bus, devfn, 1);
- if (!context)
- goto out_unlock;
-
- ret = 0;
- if (context_present(context))
- goto out_unlock;
-
- /*
-	 * In kdump kernels, old valid entries may be cached due to
-	 * in-flight DMA and the copied page tables, and nothing unmaps
-	 * them, so we need an explicit cache flush for the newly mapped
-	 * device. At this point the device is expected to have completed
-	 * its reset during driver probe, so no in-flight DMA exists and
-	 * we don't need to worry about it afterwards.
- */
- if (context_copied(context)) {
- u16 did_old = context_domain_id(context);
-
- if (did_old < cap_ndoms(iommu->cap)) {
- iommu->flush.flush_context(iommu, did_old,
- (((u16)bus) << 8) | devfn,
- DMA_CCMD_MASK_NOBIT,
- DMA_CCMD_DEVICE_INVL);
- iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
- DMA_TLB_DSI_FLUSH);
- }
- }
-
- context_clear_entry(context);
-
- if (sm_supported(iommu)) {
- unsigned long pds;
-
- WARN_ON(!table);
-
- /* Setup the PASID DIR pointer: */
- pds = context_get_sm_pds(table);
- context->lo = (u64)virt_to_phys(table->table) |
- context_pdts(pds);
-
- /* Setup the RID_PASID field: */
- context_set_sm_rid2pasid(context, PASID_RID2PASID);
-
- /*
- * Setup the Device-TLB enable bit and Page request
- * Enable bit:
- */
- info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
- if (info && info->ats_supported)
- context_set_sm_dte(context);
- if (info && info->pri_supported)
- context_set_sm_pre(context);
- } else {
- struct dma_pte *pgd = domain->pgd;
- int agaw;
-
- context_set_domain_id(context, did);
-
- if (translation != CONTEXT_TT_PASS_THROUGH) {
- /*
-			 * Skip top levels of page tables for an IOMMU that
-			 * has a smaller agaw than the domain's default.
-			 * Unnecessary for PT mode.
- */
- for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
- ret = -ENOMEM;
- pgd = phys_to_virt(dma_pte_addr(pgd));
- if (!dma_pte_present(pgd))
- goto out_unlock;
- }
-
- info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
- if (info && info->ats_supported)
- translation = CONTEXT_TT_DEV_IOTLB;
- else
- translation = CONTEXT_TT_MULTI_LEVEL;
-
- context_set_address_root(context, virt_to_phys(pgd));
- context_set_address_width(context, agaw);
- } else {
- /*
- * In pass through mode, AW must be programmed to
- * indicate the largest AGAW value supported by
- * hardware. And ASR is ignored by hardware.
- */
- context_set_address_width(context, iommu->msagaw);
- }
-
- context_set_translation_type(context, translation);
- }
-
- context_set_fault_enable(context);
- context_set_present(context);
- domain_flush_cache(domain, context, sizeof(*context));
-
- /*
-	 * It's a non-present to present mapping. If the hardware doesn't
-	 * cache non-present entries, we only need to flush the write-buffer.
-	 * If it _does_ cache non-present entries, then it does so in the
-	 * special domain #0, which we have to flush:
- */
- if (cap_caching_mode(iommu->cap)) {
- iommu->flush.flush_context(iommu, 0,
- (((u16)bus) << 8) | devfn,
- DMA_CCMD_MASK_NOBIT,
- DMA_CCMD_DEVICE_INVL);
- iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
- } else {
- iommu_flush_write_buffer(iommu);
- }
- iommu_enable_dev_iotlb(info);
-
- ret = 0;
-
-out_unlock:
- spin_unlock(&iommu->lock);
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
- return ret;
-}
-
-struct domain_context_mapping_data {
- struct dmar_domain *domain;
- struct intel_iommu *iommu;
- struct pasid_table *table;
-};
-
-static int domain_context_mapping_cb(struct pci_dev *pdev,
- u16 alias, void *opaque)
-{
- struct domain_context_mapping_data *data = opaque;
-
- return domain_context_mapping_one(data->domain, data->iommu,
- data->table, PCI_BUS_NUM(alias),
- alias & 0xff);
-}
-
-static int
-domain_context_mapping(struct dmar_domain *domain, struct device *dev)
-{
- struct domain_context_mapping_data data;
- struct pasid_table *table;
- struct intel_iommu *iommu;
- u8 bus, devfn;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return -ENODEV;
-
- table = intel_pasid_get_table(dev);
-
- if (!dev_is_pci(dev))
- return domain_context_mapping_one(domain, iommu, table,
- bus, devfn);
-
- data.domain = domain;
- data.iommu = iommu;
- data.table = table;
-
- return pci_for_each_dma_alias(to_pci_dev(dev),
- &domain_context_mapping_cb, &data);
-}
-
-static int domain_context_mapped_cb(struct pci_dev *pdev,
- u16 alias, void *opaque)
-{
- struct intel_iommu *iommu = opaque;
-
- return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
-}
-
-static int domain_context_mapped(struct device *dev)
-{
- struct intel_iommu *iommu;
- u8 bus, devfn;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return -ENODEV;
-
- if (!dev_is_pci(dev))
- return device_context_mapped(iommu, bus, devfn);
-
- return !pci_for_each_dma_alias(to_pci_dev(dev),
- domain_context_mapped_cb, iommu);
-}
-
-/* Returns a number of VTD pages, but aligned to MM page size */
-static inline unsigned long aligned_nrpages(unsigned long host_addr,
- size_t size)
-{
- host_addr &= ~PAGE_MASK;
- return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
-}
-
-/* Return largest possible superpage level for a given mapping */
-static inline int hardware_largepage_caps(struct dmar_domain *domain,
- unsigned long iov_pfn,
- unsigned long phy_pfn,
- unsigned long pages)
-{
- int support, level = 1;
- unsigned long pfnmerge;
-
- support = domain->iommu_superpage;
-
- /* To use a large page, the virtual *and* physical addresses
- must be aligned to 2MiB/1GiB/etc. Lower bits set in either
- of them will mean we have to use smaller pages. So just
- merge them and check both at once. */
- pfnmerge = iov_pfn | phy_pfn;
-
- while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
- pages >>= VTD_STRIDE_SHIFT;
- if (!pages)
- break;
- pfnmerge >>= VTD_STRIDE_SHIFT;
- level++;
- support--;
- }
- return level;
-}
-
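-/*
- * Fill in the page-table entries mapping nr_pages starting at iov_pfn,
- * either from a scatterlist or from a contiguous physical range, using
- * superpages where the hardware support and the alignment allow it.
- */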
-static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
- struct scatterlist *sg, unsigned long phys_pfn,
- unsigned long nr_pages, int prot)
-{
- struct dma_pte *first_pte = NULL, *pte = NULL;
- phys_addr_t uninitialized_var(pteval);
- unsigned long sg_res = 0;
- unsigned int largepage_lvl = 0;
- unsigned long lvl_pages = 0;
- u64 attr;
-
- BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
-
- if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
- return -EINVAL;
-
- attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
- if (domain_use_first_level(domain))
- attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
-
- if (!sg) {
- sg_res = nr_pages;
- pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
- }
-
- while (nr_pages > 0) {
- uint64_t tmp;
-
- if (!sg_res) {
- unsigned int pgoff = sg->offset & ~PAGE_MASK;
-
- sg_res = aligned_nrpages(sg->offset, sg->length);
- sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
- sg->dma_length = sg->length;
- pteval = (sg_phys(sg) - pgoff) | attr;
- phys_pfn = pteval >> VTD_PAGE_SHIFT;
- }
-
- if (!pte) {
- largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
-
- first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
- if (!pte)
- return -ENOMEM;
-			/* It is a large page */
- if (largepage_lvl > 1) {
- unsigned long nr_superpages, end_pfn;
-
- pteval |= DMA_PTE_LARGE_PAGE;
- lvl_pages = lvl_to_nr_pages(largepage_lvl);
-
- nr_superpages = sg_res / lvl_pages;
- end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
-
- /*
- * Ensure that old small page tables are
- * removed to make room for superpage(s).
- * We're adding new large pages, so make sure
- * we don't remove their parent tables.
- */
- dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
- largepage_lvl + 1);
- } else {
- pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
- }
-
- }
-		/* We don't need a lock here; nobody else
-		 * touches the iova range.
- */
- tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
- if (tmp) {
- static int dumps = 5;
- pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
- iov_pfn, tmp, (unsigned long long)pteval);
- if (dumps) {
- dumps--;
- debug_dma_dump_mappings(NULL);
- }
- WARN_ON(1);
- }
-
- lvl_pages = lvl_to_nr_pages(largepage_lvl);
-
- BUG_ON(nr_pages < lvl_pages);
- BUG_ON(sg_res < lvl_pages);
-
- nr_pages -= lvl_pages;
- iov_pfn += lvl_pages;
- phys_pfn += lvl_pages;
- pteval += lvl_pages * VTD_PAGE_SIZE;
- sg_res -= lvl_pages;
-
- /* If the next PTE would be the first in a new page, then we
- need to flush the cache on the entries we've just written.
- And then we'll need to recalculate 'pte', so clear it and
- let it get set again in the if (!pte) block above.
-
- If we're done (!nr_pages) we need to flush the cache too.
-
- Also if we've been setting superpages, we may need to
- recalculate 'pte' and switch back to smaller pages for the
- end of the mapping, if the trailing size is not enough to
- use another superpage (i.e. sg_res < lvl_pages). */
- pte++;
- if (!nr_pages || first_pte_in_page(pte) ||
- (largepage_lvl > 1 && sg_res < lvl_pages)) {
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
- pte = NULL;
- }
-
- if (!sg_res && nr_pages)
- sg = sg_next(sg);
- }
- return 0;
-}
-
-static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
- struct scatterlist *sg, unsigned long phys_pfn,
- unsigned long nr_pages, int prot)
-{
- int iommu_id, ret;
- struct intel_iommu *iommu;
-
- /* Do the real mapping first */
- ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
- if (ret)
- return ret;
-
- for_each_domain_iommu(iommu_id, domain) {
- iommu = g_iommus[iommu_id];
- __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
- }
-
- return 0;
-}
-
-static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
- struct scatterlist *sg, unsigned long nr_pages,
- int prot)
-{
- return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
-}
-
-static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phys_pfn, unsigned long nr_pages,
- int prot)
-{
- return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
-}
-
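-/*
- * Tear down the context entry for (bus, devfn) and flush the context and
- * IOTLB caches for the domain id it was using.
- */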
-static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
-{
- unsigned long flags;
- struct context_entry *context;
- u16 did_old;
-
- if (!iommu)
- return;
-
- spin_lock_irqsave(&iommu->lock, flags);
- context = iommu_context_addr(iommu, bus, devfn, 0);
- if (!context) {
- spin_unlock_irqrestore(&iommu->lock, flags);
- return;
- }
- did_old = context_domain_id(context);
- context_clear_entry(context);
- __iommu_flush_cache(iommu, context, sizeof(*context));
- spin_unlock_irqrestore(&iommu->lock, flags);
- iommu->flush.flush_context(iommu,
- did_old,
- (((u16)bus) << 8) | devfn,
- DMA_CCMD_MASK_NOBIT,
- DMA_CCMD_DEVICE_INVL);
- iommu->flush.flush_iotlb(iommu,
- did_old,
- 0,
- 0,
- DMA_TLB_DSI_FLUSH);
-}
-
-static inline void unlink_domain_info(struct device_domain_info *info)
-{
- assert_spin_locked(&device_domain_lock);
- list_del(&info->link);
- list_del(&info->global);
- if (info->dev)
- info->dev->archdata.iommu = NULL;
-}
-
-static void domain_remove_dev_info(struct dmar_domain *domain)
-{
- struct device_domain_info *info, *tmp;
- unsigned long flags;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- list_for_each_entry_safe(info, tmp, &domain->devices, link)
- __dmar_remove_one_dev_info(info);
- spin_unlock_irqrestore(&device_domain_lock, flags);
-}
-
-struct dmar_domain *find_domain(struct device *dev)
-{
- struct device_domain_info *info;
-
- if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
- return NULL;
-
- /* No lock here, assumes no domain exit in normal case */
- info = get_domain_info(dev);
- if (likely(info))
- return info->domain;
-
- return NULL;
-}
-
-static void do_deferred_attach(struct device *dev)
-{
- struct iommu_domain *domain;
-
- dev->archdata.iommu = NULL;
- domain = iommu_get_domain_for_dev(dev);
- if (domain)
- intel_iommu_attach_device(domain, dev);
-}
-
-static inline struct device_domain_info *
-dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
-{
- struct device_domain_info *info;
-
- list_for_each_entry(info, &device_domain_list, global)
- if (info->segment == segment && info->bus == bus &&
- info->devfn == devfn)
- return info;
-
- return NULL;
-}
-
-static int domain_setup_first_level(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- struct device *dev,
- int pasid)
-{
- int flags = PASID_FLAG_SUPERVISOR_MODE;
- struct dma_pte *pgd = domain->pgd;
- int agaw, level;
-
- /*
-	 * Skip top levels of page tables for an IOMMU that has a smaller
-	 * agaw than the domain's default. Unnecessary for PT mode.
- */
- for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
- pgd = phys_to_virt(dma_pte_addr(pgd));
- if (!dma_pte_present(pgd))
- return -ENOMEM;
- }
-
- level = agaw_to_level(agaw);
- if (level != 4 && level != 5)
- return -EINVAL;
-
- flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
-
- return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
- domain->iommu_did[iommu->seq_id],
- flags);
-}
-
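-/*
- * True if @dev is a PCI device whose DMA requests are actually issued by a
- * different "real DMA" device, i.e. pci_real_dma_dev() returns something
- * other than the device itself.
- */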
-static bool dev_is_real_dma_subdevice(struct device *dev)
-{
- return dev && dev_is_pci(dev) &&
- pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
-}
-
-static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
- int bus, int devfn,
- struct device *dev,
- struct dmar_domain *domain)
-{
- struct dmar_domain *found = NULL;
- struct device_domain_info *info;
- unsigned long flags;
- int ret;
-
- info = alloc_devinfo_mem();
- if (!info)
- return NULL;
-
- if (!dev_is_real_dma_subdevice(dev)) {
- info->bus = bus;
- info->devfn = devfn;
- info->segment = iommu->segment;
- } else {
- struct pci_dev *pdev = to_pci_dev(dev);
-
- info->bus = pdev->bus->number;
- info->devfn = pdev->devfn;
- info->segment = pci_domain_nr(pdev->bus);
- }
-
- info->ats_supported = info->pasid_supported = info->pri_supported = 0;
- info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
- info->ats_qdep = 0;
- info->dev = dev;
- info->domain = domain;
- info->iommu = iommu;
- info->pasid_table = NULL;
- info->auxd_enabled = 0;
- INIT_LIST_HEAD(&info->auxiliary_domains);
-
- if (dev && dev_is_pci(dev)) {
- struct pci_dev *pdev = to_pci_dev(info->dev);
-
- if (ecap_dev_iotlb_support(iommu->ecap) &&
- pci_ats_supported(pdev) &&
- dmar_find_matched_atsr_unit(pdev))
- info->ats_supported = 1;
-
- if (sm_supported(iommu)) {
- if (pasid_supported(iommu)) {
- int features = pci_pasid_features(pdev);
- if (features >= 0)
- info->pasid_supported = features | 1;
- }
-
- if (info->ats_supported && ecap_prs(iommu->ecap) &&
- pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
- info->pri_supported = 1;
- }
- }
-
- spin_lock_irqsave(&device_domain_lock, flags);
- if (dev)
- found = find_domain(dev);
-
- if (!found) {
- struct device_domain_info *info2;
- info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
- info->devfn);
- if (info2) {
- found = info2->domain;
- info2->dev = dev;
- }
- }
-
- if (found) {
- spin_unlock_irqrestore(&device_domain_lock, flags);
- free_devinfo_mem(info);
- /* Caller must free the original domain */
- return found;
- }
-
- spin_lock(&iommu->lock);
- ret = domain_attach_iommu(domain, iommu);
- spin_unlock(&iommu->lock);
-
- if (ret) {
- spin_unlock_irqrestore(&device_domain_lock, flags);
- free_devinfo_mem(info);
- return NULL;
- }
-
- list_add(&info->link, &domain->devices);
- list_add(&info->global, &device_domain_list);
- if (dev)
- dev->archdata.iommu = info;
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
- /* PASID table is mandatory for a PCI device in scalable mode. */
- if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
- ret = intel_pasid_alloc_table(dev);
- if (ret) {
- dev_err(dev, "PASID table allocation failed\n");
- dmar_remove_one_dev_info(dev);
- return NULL;
- }
-
- /* Setup the PASID entry for requests without PASID: */
- spin_lock(&iommu->lock);
- if (hw_pass_through && domain_type_is_si(domain))
- ret = intel_pasid_setup_pass_through(iommu, domain,
- dev, PASID_RID2PASID);
- else if (domain_use_first_level(domain))
- ret = domain_setup_first_level(iommu, domain, dev,
- PASID_RID2PASID);
- else
- ret = intel_pasid_setup_second_level(iommu, domain,
- dev, PASID_RID2PASID);
- spin_unlock(&iommu->lock);
- if (ret) {
- dev_err(dev, "Setup RID2PASID failed\n");
- dmar_remove_one_dev_info(dev);
- return NULL;
- }
- }
-
- if (dev && domain_context_mapping(domain, dev)) {
- dev_err(dev, "Domain context map failed\n");
- dmar_remove_one_dev_info(dev);
- return NULL;
- }
-
- return domain;
-}
-
-static int iommu_domain_identity_map(struct dmar_domain *domain,
- unsigned long first_vpfn,
- unsigned long last_vpfn)
-{
- /*
-	 * The RMRR range might overlap with the physical memory range;
-	 * clear it first.
- */
- dma_pte_clear_range(domain, first_vpfn, last_vpfn);
-
- return __domain_mapping(domain, first_vpfn, NULL,
- first_vpfn, last_vpfn - first_vpfn + 1,
- DMA_PTE_READ|DMA_PTE_WRITE);
-}
-
-static int md_domain_init(struct dmar_domain *domain, int guest_width);
-
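-/*
- * Set up the static identity (si) domain and, unless hardware pass-through
- * is used, identity-map all usable physical memory and the RMRR regions
- * into it.
- */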
-static int __init si_domain_init(int hw)
-{
- struct dmar_rmrr_unit *rmrr;
- struct device *dev;
- int i, nid, ret;
-
- si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
- if (!si_domain)
- return -EFAULT;
-
- if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
- domain_exit(si_domain);
- return -EFAULT;
- }
-
- if (hw)
- return 0;
-
- for_each_online_node(nid) {
- unsigned long start_pfn, end_pfn;
- int i;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- ret = iommu_domain_identity_map(si_domain,
- mm_to_dma_pfn(start_pfn),
- mm_to_dma_pfn(end_pfn));
- if (ret)
- return ret;
- }
- }
-
- /*
- * Identity map the RMRRs so that devices with RMRRs could also use
- * the si_domain.
- */
- for_each_rmrr_units(rmrr) {
- for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
- i, dev) {
- unsigned long long start = rmrr->base_address;
- unsigned long long end = rmrr->end_address;
-
- if (WARN_ON(end < start ||
- end >> agaw_to_width(si_domain->agaw)))
- continue;
-
- ret = iommu_domain_identity_map(si_domain, start, end);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
-{
- struct dmar_domain *ndomain;
- struct intel_iommu *iommu;
- u8 bus, devfn;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return -ENODEV;
-
- ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
- if (ndomain != domain)
- return -EBUSY;
-
- return 0;
-}
-
-static bool device_has_rmrr(struct device *dev)
-{
- struct dmar_rmrr_unit *rmrr;
- struct device *tmp;
- int i;
-
- rcu_read_lock();
- for_each_rmrr_units(rmrr) {
- /*
- * Return TRUE if this RMRR contains the device that
- * is passed in.
- */
- for_each_active_dev_scope(rmrr->devices,
- rmrr->devices_cnt, i, tmp)
- if (tmp == dev ||
- is_downstream_to_pci_bridge(dev, tmp)) {
- rcu_read_unlock();
- return true;
- }
- }
- rcu_read_unlock();
- return false;
-}
-
-/**
- * device_rmrr_is_relaxable - Test whether the RMRR of this device
- * is relaxable (i.e. is allowed to be left unenforced under some conditions)
- * @dev: device handle
- *
- * We assume that PCI USB devices with RMRRs have them largely
- * for historical reasons and that the RMRR space is not actively used post
- * boot. This exclusion may change if vendors begin to abuse it.
- *
- * The same exception is made for graphics devices, with the requirement that
- * any use of the RMRR regions will be torn down before assigning the device
- * to a guest.
- *
- * Return: true if the RMRR is relaxable, false otherwise
- */
-static bool device_rmrr_is_relaxable(struct device *dev)
-{
- struct pci_dev *pdev;
-
- if (!dev_is_pci(dev))
- return false;
-
- pdev = to_pci_dev(dev);
- if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
- return true;
- else
- return false;
-}
-
-/*
- * There are a couple cases where we need to restrict the functionality of
- * devices associated with RMRRs. The first is when evaluating a device for
- * identity mapping because problems exist when devices are moved in and out
- * of domains and their respective RMRR information is lost. This means that
- * a device with associated RMRRs will never be in a "passthrough" domain.
- * The second is use of the device through the IOMMU API. This interface
- * expects to have full control of the IOVA space for the device. We cannot
- * satisfy both the requirement that RMRR access is maintained and have an
- * unencumbered IOVA space. We also have no ability to quiesce the device's
- * use of the RMRR space or even inform the IOMMU API user of the restriction.
- * We therefore prevent devices associated with an RMRR from participating in
- * the IOMMU API, which eliminates them from device assignment.
- *
- * In both cases, devices which have relaxable RMRRs are not concerned by this
- * restriction. See device_rmrr_is_relaxable comment.
- */
-static bool device_is_rmrr_locked(struct device *dev)
-{
- if (!device_has_rmrr(dev))
- return false;
-
- if (device_rmrr_is_relaxable(dev))
- return false;
-
- return true;
-}
-
-/*
- * Return the required default domain type for a specific device.
- *
- * @dev: the device in query
- *
- * Returns:
- * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
- * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
- * - 0: both identity and dynamic domains work for this device
- */
-static int device_def_domain_type(struct device *dev)
-{
- if (dev_is_pci(dev)) {
- struct pci_dev *pdev = to_pci_dev(dev);
-
- /*
- * Prevent any device marked as untrusted from getting
- * placed into the statically identity mapping domain.
- */
- if (pdev->untrusted)
- return IOMMU_DOMAIN_DMA;
-
- if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
- return IOMMU_DOMAIN_IDENTITY;
-
- if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
- return IOMMU_DOMAIN_IDENTITY;
- }
-
- return 0;
-}
-
-static void intel_iommu_init_qi(struct intel_iommu *iommu)
-{
- /*
-	 * Start from a sane iommu hardware state.
-	 * If queued invalidation was already initialized by us
-	 * (for example, while enabling interrupt-remapping) then
-	 * things are already rolling from a sane state.
- */
- if (!iommu->qi) {
- /*
- * Clear any previous faults.
- */
- dmar_fault(-1, iommu);
- /*
- * Disable queued invalidation if supported and already enabled
- * before OS handover.
- */
- dmar_disable_qi(iommu);
- }
-
- if (dmar_enable_qi(iommu)) {
- /*
- * Queued Invalidate not enabled, use Register Based Invalidate
- */
- iommu->flush.flush_context = __iommu_flush_context;
- iommu->flush.flush_iotlb = __iommu_flush_iotlb;
- pr_info("%s: Using Register based invalidation\n",
- iommu->name);
- } else {
- iommu->flush.flush_context = qi_flush_context;
- iommu->flush.flush_iotlb = qi_flush_iotlb;
- pr_info("%s: Using Queued invalidation\n", iommu->name);
- }
-}
-
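-/*
- * Copy one bus worth of context entries from the previous kernel's tables
- * (kdump case), marking each copied entry and reserving its domain id.
- */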
-static int copy_context_table(struct intel_iommu *iommu,
- struct root_entry *old_re,
- struct context_entry **tbl,
- int bus, bool ext)
-{
- int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
- struct context_entry *new_ce = NULL, ce;
- struct context_entry *old_ce = NULL;
- struct root_entry re;
- phys_addr_t old_ce_phys;
-
- tbl_idx = ext ? bus * 2 : bus;
- memcpy(&re, old_re, sizeof(re));
-
- for (devfn = 0; devfn < 256; devfn++) {
- /* First calculate the correct index */
- idx = (ext ? devfn * 2 : devfn) % 256;
-
- if (idx == 0) {
- /* First save what we may have and clean up */
- if (new_ce) {
- tbl[tbl_idx] = new_ce;
- __iommu_flush_cache(iommu, new_ce,
- VTD_PAGE_SIZE);
- pos = 1;
- }
-
- if (old_ce)
- memunmap(old_ce);
-
- ret = 0;
- if (devfn < 0x80)
- old_ce_phys = root_entry_lctp(&re);
- else
- old_ce_phys = root_entry_uctp(&re);
-
- if (!old_ce_phys) {
- if (ext && devfn == 0) {
- /* No LCTP, try UCTP */
- devfn = 0x7f;
- continue;
- } else {
- goto out;
- }
- }
-
- ret = -ENOMEM;
- old_ce = memremap(old_ce_phys, PAGE_SIZE,
- MEMREMAP_WB);
- if (!old_ce)
- goto out;
-
- new_ce = alloc_pgtable_page(iommu->node);
- if (!new_ce)
- goto out_unmap;
-
- ret = 0;
- }
-
- /* Now copy the context entry */
- memcpy(&ce, old_ce + idx, sizeof(ce));
-
- if (!__context_present(&ce))
- continue;
-
- did = context_domain_id(&ce);
- if (did >= 0 && did < cap_ndoms(iommu->cap))
- set_bit(did, iommu->domain_ids);
-
- /*
- * We need a marker for copied context entries. This
- * marker needs to work for the old format as well as
- * for extended context entries.
- *
- * Bit 67 of the context entry is used. In the old
- * format this bit is available to software, in the
- * extended format it is the PGE bit, but PGE is ignored
- * by HW if PASIDs are disabled (and thus still
- * available).
- *
- * So disable PASIDs first and then mark the entry
- * copied. This means that we don't copy PASID
- * translations from the old kernel, but this is fine as
- * faults there are not fatal.
- */
- context_clear_pasid_enable(&ce);
- context_set_copied(&ce);
-
- new_ce[idx] = ce;
- }
-
- tbl[tbl_idx + pos] = new_ce;
-
- __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
-
-out_unmap:
- memunmap(old_ce);
-
-out:
- return ret;
-}
-
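-/*
- * Copy the root and context tables left programmed by the previous kernel
- * so that translations set up before this kernel took over keep working
- * and in-flight DMA does not fault.
- */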
-static int copy_translation_tables(struct intel_iommu *iommu)
-{
- struct context_entry **ctxt_tbls;
- struct root_entry *old_rt;
- phys_addr_t old_rt_phys;
- int ctxt_table_entries;
- unsigned long flags;
- u64 rtaddr_reg;
- int bus, ret;
- bool new_ext, ext;
-
- rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
- ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
- new_ext = !!ecap_ecs(iommu->ecap);
-
- /*
- * The RTT bit can only be changed when translation is disabled,
-	 * but disabling translation would open a window for data
-	 * corruption. So bail out and don't copy anything if we would
- * have to change the bit.
- */
- if (new_ext != ext)
- return -EINVAL;
-
- old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
- if (!old_rt_phys)
- return -EINVAL;
-
- old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
- if (!old_rt)
- return -ENOMEM;
-
- /* This is too big for the stack - allocate it from slab */
- ctxt_table_entries = ext ? 512 : 256;
- ret = -ENOMEM;
- ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
- if (!ctxt_tbls)
- goto out_unmap;
-
- for (bus = 0; bus < 256; bus++) {
- ret = copy_context_table(iommu, &old_rt[bus],
- ctxt_tbls, bus, ext);
- if (ret) {
- pr_err("%s: Failed to copy context table for bus %d\n",
- iommu->name, bus);
- continue;
- }
- }
-
- spin_lock_irqsave(&iommu->lock, flags);
-
- /* Context tables are copied, now write them to the root_entry table */
- for (bus = 0; bus < 256; bus++) {
- int idx = ext ? bus * 2 : bus;
- u64 val;
-
- if (ctxt_tbls[idx]) {
- val = virt_to_phys(ctxt_tbls[idx]) | 1;
- iommu->root_entry[bus].lo = val;
- }
-
- if (!ext || !ctxt_tbls[idx + 1])
- continue;
-
- val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
- iommu->root_entry[bus].hi = val;
- }
-
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- kfree(ctxt_tbls);
-
- __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
-
- ret = 0;
-
-out_unmap:
- memunmap(old_rt);
-
- return ret;
-}
-
-#ifdef CONFIG_INTEL_IOMMU_SVM
-static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
-{
- struct intel_iommu *iommu = data;
- ioasid_t ioasid;
-
- if (!iommu)
- return INVALID_IOASID;
- /*
-	 * The VT-d virtual command interface always uses the full 20-bit
-	 * PASID range. The host can partition the guest PASID range based
-	 * on policy, but that is out of the guest's control.
- */
- if (min < PASID_MIN || max > intel_pasid_max_id)
- return INVALID_IOASID;
-
- if (vcmd_alloc_pasid(iommu, &ioasid))
- return INVALID_IOASID;
-
- return ioasid;
-}
-
-static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
-{
- struct intel_iommu *iommu = data;
-
- if (!iommu)
- return;
- /*
-	 * Sanity checking of the ioasid owner is done at the upper layer,
-	 * e.g. VFIO. We can only free the PASID when all the devices are
-	 * unbound.
- */
- if (ioasid_find(NULL, ioasid, NULL)) {
- pr_alert("Cannot free active IOASID %d\n", ioasid);
- return;
- }
- vcmd_free_pasid(iommu, ioasid);
-}
-
-static void register_pasid_allocator(struct intel_iommu *iommu)
-{
- /*
-	 * If we are running in the host, there is no need for a custom
-	 * allocator, since PASIDs are allocated from the host system-wide.
- */
- if (!cap_caching_mode(iommu->cap))
- return;
-
- if (!sm_supported(iommu)) {
- pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
- return;
- }
-
- /*
-	 * Register a custom PASID allocator if we are running in a guest;
-	 * guest PASIDs must be obtained via the virtual command interface.
-	 * There can be multiple vIOMMUs in each guest but only one allocator
-	 * is active. All vIOMMU allocators will eventually call the same
-	 * host allocator.
- */
- if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
- return;
-
- pr_info("Register custom PASID allocator\n");
- iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
- iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
- iommu->pasid_allocator.pdata = (void *)iommu;
- if (ioasid_register_allocator(&iommu->pasid_allocator)) {
- pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
- /*
- * Disable scalable mode on this IOMMU if there
- * is no custom allocator. Mixing SM capable vIOMMU
- * and non-SM vIOMMU are not supported.
- */
- intel_iommu_sm = 0;
- }
-}
-#endif
-
-static int __init init_dmars(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- int ret;
-
- /*
- * for each drhd
- * allocate root
- * initialize and program root entry to not present
- * endfor
- */
- for_each_drhd_unit(drhd) {
- /*
-		 * No lock needed, as this is only incremented in the
-		 * single-threaded kernel __init code path; all other
-		 * accesses are read-only.
- */
- if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
- g_num_of_iommus++;
- continue;
- }
- pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
- }
-
- /* Preallocate enough resources for IOMMU hot-addition */
- if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
- g_num_of_iommus = DMAR_UNITS_SUPPORTED;
-
- g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
- GFP_KERNEL);
- if (!g_iommus) {
- pr_err("Allocating global iommu array failed\n");
- ret = -ENOMEM;
- goto error;
- }
-
- for_each_iommu(iommu, drhd) {
- if (drhd->ignored) {
- iommu_disable_translation(iommu);
- continue;
- }
-
- /*
-		 * Find the max pasid size of all IOMMUs in the system.
-		 * We need to ensure the system pasid table is no bigger
-		 * than the smallest supported size.
- */
- if (pasid_supported(iommu)) {
- u32 temp = 2 << ecap_pss(iommu->ecap);
-
- intel_pasid_max_id = min_t(u32, temp,
- intel_pasid_max_id);
- }
-
- g_iommus[iommu->seq_id] = iommu;
-
- intel_iommu_init_qi(iommu);
-
- ret = iommu_init_domains(iommu);
- if (ret)
- goto free_iommu;
-
- init_translation_status(iommu);
-
- if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
- iommu_disable_translation(iommu);
- clear_translation_pre_enabled(iommu);
- pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
- iommu->name);
- }
-
- /*
- * TBD:
-		 * we could share the same root & context tables
-		 * among all IOMMUs. Need to split this later.
- */
- ret = iommu_alloc_root_entry(iommu);
- if (ret)
- goto free_iommu;
-
- if (translation_pre_enabled(iommu)) {
- pr_info("Translation already enabled - trying to copy translation structures\n");
-
- ret = copy_translation_tables(iommu);
- if (ret) {
- /*
- * We found the IOMMU with translation
- * enabled - but failed to copy over the
- * old root-entry table. Try to proceed
- * by disabling translation now and
- * allocating a clean root-entry table.
- * This might cause DMAR faults, but
- * probably the dump will still succeed.
- */
- pr_err("Failed to copy translation tables from previous kernel for %s\n",
- iommu->name);
- iommu_disable_translation(iommu);
- clear_translation_pre_enabled(iommu);
- } else {
- pr_info("Copied translation tables from previous kernel for %s\n",
- iommu->name);
- }
- }
-
- if (!ecap_pass_through(iommu->ecap))
- hw_pass_through = 0;
- intel_svm_check(iommu);
- }
-
- /*
- * Now that qi is enabled on all iommus, set the root entry and flush
- * caches. This is required on some Intel X58 chipsets, otherwise the
- * flush_context function will loop forever and the boot hangs.
- */
- for_each_active_iommu(iommu, drhd) {
- iommu_flush_write_buffer(iommu);
-#ifdef CONFIG_INTEL_IOMMU_SVM
- register_pasid_allocator(iommu);
-#endif
- iommu_set_root_entry(iommu);
- iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
- iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
- }
-
-#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
- dmar_map_gfx = 0;
-#endif
-
- if (!dmar_map_gfx)
- iommu_identity_mapping |= IDENTMAP_GFX;
-
- check_tylersburg_isoch();
-
- ret = si_domain_init(hw_pass_through);
- if (ret)
- goto free_iommu;
-
- /*
- * for each drhd
- * enable fault log
- * global invalidate context cache
- * global invalidate iotlb
- * enable translation
- */
- for_each_iommu(iommu, drhd) {
- if (drhd->ignored) {
- /*
- * we always have to disable PMRs or DMA may fail on
- * this device
- */
- if (force_on)
- iommu_disable_protect_mem_regions(iommu);
- continue;
- }
-
- iommu_flush_write_buffer(iommu);
-
-#ifdef CONFIG_INTEL_IOMMU_SVM
- if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
- /*
-			 * Calling dmar_alloc_hwirq() with dmar_global_lock
-			 * held could cause a lock race condition.
- */
- up_write(&dmar_global_lock);
- ret = intel_svm_enable_prq(iommu);
- down_write(&dmar_global_lock);
- if (ret)
- goto free_iommu;
- }
-#endif
- ret = dmar_set_interrupt(iommu);
- if (ret)
- goto free_iommu;
- }
-
- return 0;
-
-free_iommu:
- for_each_active_iommu(iommu, drhd) {
- disable_dmar_iommu(iommu);
- free_dmar_iommu(iommu);
- }
-
- kfree(g_iommus);
-
-error:
- return ret;
-}
-
-/* This takes a number of _MM_ pages, not VTD pages */
-static unsigned long intel_alloc_iova(struct device *dev,
- struct dmar_domain *domain,
- unsigned long nrpages, uint64_t dma_mask)
-{
- unsigned long iova_pfn;
-
- /*
- * Restrict dma_mask to the width that the iommu can handle.
- * First-level translation restricts the input-address to a
- * canonical address (i.e., address bits 63:N have the same
- * value as address bit [N-1], where N is 48-bits with 4-level
- * paging and 57-bits with 5-level paging). Hence, skip bit
- * [N-1].
- */
- if (domain_use_first_level(domain))
- dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
- dma_mask);
- else
- dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
- dma_mask);
-
- /* Ensure we reserve the whole size-aligned region */
- nrpages = __roundup_pow_of_two(nrpages);
-
- if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
- /*
-		 * First try to allocate an io virtual address within
-		 * DMA_BIT_MASK(32); if that fails, try allocating from
-		 * the higher range.
- */
- iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
- IOVA_PFN(DMA_BIT_MASK(32)), false);
- if (iova_pfn)
- return iova_pfn;
- }
- iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
- IOVA_PFN(dma_mask), true);
- if (unlikely(!iova_pfn)) {
- dev_err_once(dev, "Allocating %ld-page iova failed\n",
- nrpages);
- return 0;
- }
-
- return iova_pfn;
-}
-
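-/*
- * Map a physically contiguous buffer for DMA: allocate an IOVA range that
- * fits the device's DMA mask and install the corresponding IOMMU page-table
- * entries.
- */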
-static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
- size_t size, int dir, u64 dma_mask)
-{
- struct dmar_domain *domain;
- phys_addr_t start_paddr;
- unsigned long iova_pfn;
- int prot = 0;
- int ret;
- struct intel_iommu *iommu;
- unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
-
- BUG_ON(dir == DMA_NONE);
-
- if (unlikely(attach_deferred(dev)))
- do_deferred_attach(dev);
-
- domain = find_domain(dev);
- if (!domain)
- return DMA_MAPPING_ERROR;
-
- iommu = domain_get_iommu(domain);
- size = aligned_nrpages(paddr, size);
-
- iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
- if (!iova_pfn)
- goto error;
-
- /*
-	 * Check if DMAR supports zero-length reads on write-only
-	 * mappings.
- */
- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
- !cap_zlr(iommu->cap))
- prot |= DMA_PTE_READ;
- if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
- prot |= DMA_PTE_WRITE;
- /*
-	 * paddr to (paddr + size) might cover only part of a page, but we
-	 * should map the whole page. Note: if two parts of one page are
-	 * mapped separately, we might have two guest addresses mapping to
-	 * the same host paddr, but this is not a big problem.
- */
- ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
- mm_to_dma_pfn(paddr_pfn), size, prot);
- if (ret)
- goto error;
-
- start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
- start_paddr += paddr & ~PAGE_MASK;
-
- trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
-
- return start_paddr;
-
-error:
- if (iova_pfn)
- free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
- dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
- size, (unsigned long long)paddr, dir);
- return DMA_MAPPING_ERROR;
-}
-
-static dma_addr_t intel_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- unsigned long attrs)
-{
- return __intel_map_single(dev, page_to_phys(page) + offset,
- size, dir, *dev->dma_mask);
-}
-
-static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
-}
-
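-/*
- * Unmap a DMA range: tear down the page-table entries and either flush the
- * IOTLB immediately (strict mode, untrusted device, or no flush queue) or
- * defer the IOVA and page-list release to the flush queue.
- */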
-static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
-{
- struct dmar_domain *domain;
- unsigned long start_pfn, last_pfn;
- unsigned long nrpages;
- unsigned long iova_pfn;
- struct intel_iommu *iommu;
- struct page *freelist;
- struct pci_dev *pdev = NULL;
-
- domain = find_domain(dev);
- BUG_ON(!domain);
-
- iommu = domain_get_iommu(domain);
-
- iova_pfn = IOVA_PFN(dev_addr);
-
- nrpages = aligned_nrpages(dev_addr, size);
- start_pfn = mm_to_dma_pfn(iova_pfn);
- last_pfn = start_pfn + nrpages - 1;
-
- if (dev_is_pci(dev))
- pdev = to_pci_dev(dev);
-
- freelist = domain_unmap(domain, start_pfn, last_pfn);
- if (intel_iommu_strict || (pdev && pdev->untrusted) ||
- !has_iova_flush_queue(&domain->iovad)) {
- iommu_flush_iotlb_psi(iommu, domain, start_pfn,
- nrpages, !freelist, 0);
- /* free iova */
- free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
- dma_free_pagelist(freelist);
- } else {
- queue_iova(&domain->iovad, iova_pfn, nrpages,
- (unsigned long)freelist);
- /*
- * queue up the release of the unmap to save the 1/6th of the
- * cpu used up by the iotlb flush operation...
- */
- }
-
- trace_unmap_single(dev, dev_addr, size);
-}
-
-static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- intel_unmap(dev, dev_addr, size);
-}
-
-static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
- intel_unmap(dev, dev_addr, size);
-}
-
-static void *intel_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flags,
- unsigned long attrs)
-{
- struct page *page = NULL;
- int order;
-
- if (unlikely(attach_deferred(dev)))
- do_deferred_attach(dev);
-
- size = PAGE_ALIGN(size);
- order = get_order(size);
-
- if (gfpflags_allow_blocking(flags)) {
- unsigned int count = size >> PAGE_SHIFT;
-
- page = dma_alloc_from_contiguous(dev, count, order,
- flags & __GFP_NOWARN);
- }
-
- if (!page)
- page = alloc_pages(flags, order);
- if (!page)
- return NULL;
- memset(page_address(page), 0, size);
-
- *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
- DMA_BIDIRECTIONAL,
- dev->coherent_dma_mask);
- if (*dma_handle != DMA_MAPPING_ERROR)
- return page_address(page);
- if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
- __free_pages(page, order);
-
- return NULL;
-}
-
-static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_handle, unsigned long attrs)
-{
- int order;
- struct page *page = virt_to_page(vaddr);
-
- size = PAGE_ALIGN(size);
- order = get_order(size);
-
- intel_unmap(dev, dma_handle, size);
- if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
- __free_pages(page, order);
-}
-
-static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir,
- unsigned long attrs)
-{
- dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
- unsigned long nrpages = 0;
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sglist, sg, nelems, i) {
- nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
- }
-
- intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
-
- trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
-}
-
-static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
- enum dma_data_direction dir, unsigned long attrs)
-{
- int i;
- struct dmar_domain *domain;
- size_t size = 0;
- int prot = 0;
- unsigned long iova_pfn;
- int ret;
- struct scatterlist *sg;
- unsigned long start_vpfn;
- struct intel_iommu *iommu;
-
- BUG_ON(dir == DMA_NONE);
-
- if (unlikely(attach_deferred(dev)))
- do_deferred_attach(dev);
-
- domain = find_domain(dev);
- if (!domain)
- return 0;
-
- iommu = domain_get_iommu(domain);
-
- for_each_sg(sglist, sg, nelems, i)
- size += aligned_nrpages(sg->offset, sg->length);
-
- iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
- *dev->dma_mask);
- if (!iova_pfn) {
- sglist->dma_length = 0;
- return 0;
- }
-
- /*
-	 * Check if DMAR supports zero-length reads on write-only
-	 * mappings.
- */
- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
- !cap_zlr(iommu->cap))
- prot |= DMA_PTE_READ;
- if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
- prot |= DMA_PTE_WRITE;
-
- start_vpfn = mm_to_dma_pfn(iova_pfn);
-
- ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
- if (unlikely(ret)) {
- dma_pte_free_pagetable(domain, start_vpfn,
- start_vpfn + size - 1,
- agaw_to_level(domain->agaw) + 1);
- free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
- return 0;
- }
-
- for_each_sg(sglist, sg, nelems, i)
- trace_map_sg(dev, i + 1, nelems, sg);
-
- return nelems;
-}
-
-static u64 intel_get_required_mask(struct device *dev)
-{
- return DMA_BIT_MASK(32);
-}
-
-static const struct dma_map_ops intel_dma_ops = {
- .alloc = intel_alloc_coherent,
- .free = intel_free_coherent,
- .map_sg = intel_map_sg,
- .unmap_sg = intel_unmap_sg,
- .map_page = intel_map_page,
- .unmap_page = intel_unmap_page,
- .map_resource = intel_map_resource,
- .unmap_resource = intel_unmap_resource,
- .dma_supported = dma_direct_supported,
- .mmap = dma_common_mmap,
- .get_sgtable = dma_common_get_sgtable,
- .get_required_mask = intel_get_required_mask,
-};
-
-static void
-bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
- enum dma_data_direction dir, enum dma_sync_target target)
-{
- struct dmar_domain *domain;
- phys_addr_t tlb_addr;
-
- domain = find_domain(dev);
- if (WARN_ON(!domain))
- return;
-
- tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
- if (is_swiotlb_buffer(tlb_addr))
- swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
-}
-
-static dma_addr_t
-bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
- enum dma_data_direction dir, unsigned long attrs,
- u64 dma_mask)
-{
- size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
- struct dmar_domain *domain;
- struct intel_iommu *iommu;
- unsigned long iova_pfn;
- unsigned long nrpages;
- phys_addr_t tlb_addr;
- int prot = 0;
- int ret;
-
- if (unlikely(attach_deferred(dev)))
- do_deferred_attach(dev);
-
- domain = find_domain(dev);
-
- if (WARN_ON(dir == DMA_NONE || !domain))
- return DMA_MAPPING_ERROR;
-
- iommu = domain_get_iommu(domain);
- if (WARN_ON(!iommu))
- return DMA_MAPPING_ERROR;
-
- nrpages = aligned_nrpages(0, size);
- iova_pfn = intel_alloc_iova(dev, domain,
- dma_to_mm_pfn(nrpages), dma_mask);
- if (!iova_pfn)
- return DMA_MAPPING_ERROR;
-
- /*
- * Check if DMAR supports zero-length reads on write-only
- * mappings.
- */
- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
- !cap_zlr(iommu->cap))
- prot |= DMA_PTE_READ;
- if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
- prot |= DMA_PTE_WRITE;
-
- /*
- * If both the physical buffer start address and size are
- * page aligned, we don't need to use a bounce page.
- */
- if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
- tlb_addr = swiotlb_tbl_map_single(dev,
- __phys_to_dma(dev, io_tlb_start),
- paddr, size, aligned_size, dir, attrs);
- if (tlb_addr == DMA_MAPPING_ERROR) {
- goto swiotlb_error;
- } else {
- /* Cleanup the padding area. */
- void *padding_start = phys_to_virt(tlb_addr);
- size_t padding_size = aligned_size;
-
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_TO_DEVICE ||
- dir == DMA_BIDIRECTIONAL)) {
- padding_start += size;
- padding_size -= size;
- }
-
- memset(padding_start, 0, padding_size);
- }
- } else {
- tlb_addr = paddr;
- }
-
- ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
- tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
- if (ret)
- goto mapping_error;
-
- trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
-
- return (phys_addr_t)iova_pfn << PAGE_SHIFT;
-
-mapping_error:
- if (is_swiotlb_buffer(tlb_addr))
- swiotlb_tbl_unmap_single(dev, tlb_addr, size,
- aligned_size, dir, attrs);
-swiotlb_error:
- free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
- dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
- size, (unsigned long long)paddr, dir);
-
- return DMA_MAPPING_ERROR;
-}
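
Whether a buffer can be mapped in place or must first be copied into a swiotlb slot is decided purely by 4K alignment of its physical start address and its length. A minimal user-space sketch of that predicate, assuming VTD_PAGE_SIZE is 4KiB and using a hypothetical needs_bounce() helper:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VTD_PAGE_SIZE 4096ULL

/* A buffer needs bouncing when its start or its length is not 4K aligned. */
static bool needs_bounce(uint64_t paddr, uint64_t size)
{
	return ((paddr | size) & (VTD_PAGE_SIZE - 1)) != 0;
}

int main(void)
{
	printf("%d\n", needs_bounce(0x100000, 0x2000)); /* aligned start and size: 0 */
	printf("%d\n", needs_bounce(0x100200, 0x2000)); /* unaligned start: 1 */
	printf("%d\n", needs_bounce(0x100000, 0x1800)); /* unaligned size: 1 */
	return 0;
}

Only buffers that start on a 4K boundary and span whole 4K pages are mapped directly; anything else goes through a bounce page so the device is never granted access to unrelated data sharing the same IOMMU page.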
-
-static void
-bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
- enum dma_data_direction dir, unsigned long attrs)
-{
- size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
- struct dmar_domain *domain;
- phys_addr_t tlb_addr;
-
- domain = find_domain(dev);
- if (WARN_ON(!domain))
- return;
-
- tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
- if (WARN_ON(!tlb_addr))
- return;
-
- intel_unmap(dev, dev_addr, size);
- if (is_swiotlb_buffer(tlb_addr))
- swiotlb_tbl_unmap_single(dev, tlb_addr, size,
- aligned_size, dir, attrs);
-
- trace_bounce_unmap_single(dev, dev_addr, size);
-}
-
-static dma_addr_t
-bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
- return bounce_map_single(dev, page_to_phys(page) + offset,
- size, dir, attrs, *dev->dma_mask);
-}
-
-static dma_addr_t
-bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
- enum dma_data_direction dir, unsigned long attrs)
-{
- return bounce_map_single(dev, phys_addr, size,
- dir, attrs, *dev->dma_mask);
-}
-
-static void
-bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
- enum dma_data_direction dir, unsigned long attrs)
-{
- bounce_unmap_single(dev, dev_addr, size, dir, attrs);
-}
-
-static void
-bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
- enum dma_data_direction dir, unsigned long attrs)
-{
- bounce_unmap_single(dev, dev_addr, size, dir, attrs);
-}
-
-static void
-bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
- enum dma_data_direction dir, unsigned long attrs)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sglist, sg, nelems, i)
- bounce_unmap_page(dev, sg->dma_address,
- sg_dma_len(sg), dir, attrs);
-}
-
-static int
-bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
- enum dma_data_direction dir, unsigned long attrs)
-{
- int i;
- struct scatterlist *sg;
-
- for_each_sg(sglist, sg, nelems, i) {
- sg->dma_address = bounce_map_page(dev, sg_page(sg),
- sg->offset, sg->length,
- dir, attrs);
- if (sg->dma_address == DMA_MAPPING_ERROR)
- goto out_unmap;
- sg_dma_len(sg) = sg->length;
- }
-
- for_each_sg(sglist, sg, nelems, i)
- trace_bounce_map_sg(dev, i + 1, nelems, sg);
-
- return nelems;
-
-out_unmap:
- bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
- return 0;
-}
-
-static void
-bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir)
-{
- bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
-}
-
-static void
-bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir)
-{
- bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
-}
-
-static void
-bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sglist, sg, nelems, i)
- bounce_sync_single(dev, sg_dma_address(sg),
- sg_dma_len(sg), dir, SYNC_FOR_CPU);
-}
-
-static void
-bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sglist, sg, nelems, i)
- bounce_sync_single(dev, sg_dma_address(sg),
- sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
-}
-
-static const struct dma_map_ops bounce_dma_ops = {
- .alloc = intel_alloc_coherent,
- .free = intel_free_coherent,
- .map_sg = bounce_map_sg,
- .unmap_sg = bounce_unmap_sg,
- .map_page = bounce_map_page,
- .unmap_page = bounce_unmap_page,
- .sync_single_for_cpu = bounce_sync_single_for_cpu,
- .sync_single_for_device = bounce_sync_single_for_device,
- .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
- .sync_sg_for_device = bounce_sync_sg_for_device,
- .map_resource = bounce_map_resource,
- .unmap_resource = bounce_unmap_resource,
- .dma_supported = dma_direct_supported,
-};
-
-static inline int iommu_domain_cache_init(void)
-{
- int ret = 0;
-
- iommu_domain_cache = kmem_cache_create("iommu_domain",
- sizeof(struct dmar_domain),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
- if (!iommu_domain_cache) {
- pr_err("Couldn't create iommu_domain cache\n");
- ret = -ENOMEM;
- }
-
- return ret;
-}
-
-static inline int iommu_devinfo_cache_init(void)
-{
- int ret = 0;
-
- iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
- sizeof(struct device_domain_info),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
- if (!iommu_devinfo_cache) {
- pr_err("Couldn't create devinfo cache\n");
- ret = -ENOMEM;
- }
-
- return ret;
-}
-
-static int __init iommu_init_mempool(void)
-{
- int ret;
- ret = iova_cache_get();
- if (ret)
- return ret;
-
- ret = iommu_domain_cache_init();
- if (ret)
- goto domain_error;
-
- ret = iommu_devinfo_cache_init();
- if (!ret)
- return ret;
-
- kmem_cache_destroy(iommu_domain_cache);
-domain_error:
- iova_cache_put();
-
- return -ENOMEM;
-}
-
-static void __init iommu_exit_mempool(void)
-{
- kmem_cache_destroy(iommu_devinfo_cache);
- kmem_cache_destroy(iommu_domain_cache);
- iova_cache_put();
-}
-
-static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
-{
- struct dmar_drhd_unit *drhd;
- u32 vtbar;
- int rc;
-
- /* We know that this device on this chipset has its own IOMMU.
- * If we find it under a different IOMMU, then the BIOS is lying
- * to us. Hope that the IOMMU for this device is actually
- * disabled, and it needs no translation...
- */
- rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
- if (rc) {
- /* "can't" happen */
- dev_info(&pdev->dev, "failed to run vt-d quirk\n");
- return;
- }
- vtbar &= 0xffff0000;
-
- /* we know that this IOMMU should be at offset 0xa000 from vtbar */
- drhd = dmar_find_matched_drhd_unit(pdev);
- if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
- pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
- add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
- pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
- }
-}
-DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
-
-static void __init init_no_remapping_devices(void)
-{
- struct dmar_drhd_unit *drhd;
- struct device *dev;
- int i;
-
- for_each_drhd_unit(drhd) {
- if (!drhd->include_all) {
- for_each_active_dev_scope(drhd->devices,
- drhd->devices_cnt, i, dev)
- break;
- /* ignore DMAR unit if no devices exist */
- if (i == drhd->devices_cnt)
- drhd->ignored = 1;
- }
- }
-
- for_each_active_drhd_unit(drhd) {
- if (drhd->include_all)
- continue;
-
- for_each_active_dev_scope(drhd->devices,
- drhd->devices_cnt, i, dev)
- if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
- break;
- if (i < drhd->devices_cnt)
- continue;
-
- /* This IOMMU has *only* gfx devices. Either bypass it or
- set the gfx_mapped flag, as appropriate */
- if (!dmar_map_gfx) {
- drhd->ignored = 1;
- for_each_active_dev_scope(drhd->devices,
- drhd->devices_cnt, i, dev)
- dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
- }
- }
-}
-
-#ifdef CONFIG_SUSPEND
-static int init_iommu_hw(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu = NULL;
-
- for_each_active_iommu(iommu, drhd)
- if (iommu->qi)
- dmar_reenable_qi(iommu);
-
- for_each_iommu(iommu, drhd) {
- if (drhd->ignored) {
- /*
- * we always have to disable PMRs or DMA may fail on
- * this device
- */
- if (force_on)
- iommu_disable_protect_mem_regions(iommu);
- continue;
- }
-
- iommu_flush_write_buffer(iommu);
-
- iommu_set_root_entry(iommu);
-
- iommu->flush.flush_context(iommu, 0, 0, 0,
- DMA_CCMD_GLOBAL_INVL);
- iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
- iommu_enable_translation(iommu);
- iommu_disable_protect_mem_regions(iommu);
- }
-
- return 0;
-}
-
-static void iommu_flush_all(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
-
- for_each_active_iommu(iommu, drhd) {
- iommu->flush.flush_context(iommu, 0, 0, 0,
- DMA_CCMD_GLOBAL_INVL);
- iommu->flush.flush_iotlb(iommu, 0, 0, 0,
- DMA_TLB_GLOBAL_FLUSH);
- }
-}
-
-static int iommu_suspend(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu = NULL;
- unsigned long flag;
-
- for_each_active_iommu(iommu, drhd) {
- iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
- GFP_ATOMIC);
- if (!iommu->iommu_state)
- goto nomem;
- }
-
- iommu_flush_all();
-
- for_each_active_iommu(iommu, drhd) {
- iommu_disable_translation(iommu);
-
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
-
- iommu->iommu_state[SR_DMAR_FECTL_REG] =
- readl(iommu->reg + DMAR_FECTL_REG);
- iommu->iommu_state[SR_DMAR_FEDATA_REG] =
- readl(iommu->reg + DMAR_FEDATA_REG);
- iommu->iommu_state[SR_DMAR_FEADDR_REG] =
- readl(iommu->reg + DMAR_FEADDR_REG);
- iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
- readl(iommu->reg + DMAR_FEUADDR_REG);
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
- }
- return 0;
-
-nomem:
- for_each_active_iommu(iommu, drhd)
- kfree(iommu->iommu_state);
-
- return -ENOMEM;
-}
-
-static void iommu_resume(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu = NULL;
- unsigned long flag;
-
- if (init_iommu_hw()) {
- if (force_on)
- panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
- else
- WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
- return;
- }
-
- for_each_active_iommu(iommu, drhd) {
-
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
-
- writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
- iommu->reg + DMAR_FECTL_REG);
- writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
- iommu->reg + DMAR_FEDATA_REG);
- writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
- iommu->reg + DMAR_FEADDR_REG);
- writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
- iommu->reg + DMAR_FEUADDR_REG);
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
- }
-
- for_each_active_iommu(iommu, drhd)
- kfree(iommu->iommu_state);
-}
-
-static struct syscore_ops iommu_syscore_ops = {
- .resume = iommu_resume,
- .suspend = iommu_suspend,
-};
-
-static void __init init_iommu_pm_ops(void)
-{
- register_syscore_ops(&iommu_syscore_ops);
-}
-
-#else
-static inline void init_iommu_pm_ops(void) {}
-#endif /* CONFIG_SUSPEND */
-
-static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
-{
- if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
- !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
- rmrr->end_address <= rmrr->base_address ||
- arch_rmrr_sanity_check(rmrr))
- return -EINVAL;
-
- return 0;
-}
-
-int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
-{
- struct acpi_dmar_reserved_memory *rmrr;
- struct dmar_rmrr_unit *rmrru;
-
- rmrr = (struct acpi_dmar_reserved_memory *)header;
- if (rmrr_sanity_check(rmrr)) {
- pr_warn(FW_BUG
- "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
- "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
- rmrr->base_address, rmrr->end_address,
- dmi_get_system_info(DMI_BIOS_VENDOR),
- dmi_get_system_info(DMI_BIOS_VERSION),
- dmi_get_system_info(DMI_PRODUCT_VERSION));
- add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
- }
-
- rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
- if (!rmrru)
- goto out;
-
- rmrru->hdr = header;
-
- rmrru->base_address = rmrr->base_address;
- rmrru->end_address = rmrr->end_address;
-
- rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
- ((void *)rmrr) + rmrr->header.length,
- &rmrru->devices_cnt);
- if (rmrru->devices_cnt && rmrru->devices == NULL)
- goto free_rmrru;
-
- list_add(&rmrru->list, &dmar_rmrr_units);
-
- return 0;
-free_rmrru:
- kfree(rmrru);
-out:
- return -ENOMEM;
-}
-
-static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
-{
- struct dmar_atsr_unit *atsru;
- struct acpi_dmar_atsr *tmp;
-
- list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
- dmar_rcu_check()) {
- tmp = (struct acpi_dmar_atsr *)atsru->hdr;
- if (atsr->segment != tmp->segment)
- continue;
- if (atsr->header.length != tmp->header.length)
- continue;
- if (memcmp(atsr, tmp, atsr->header.length) == 0)
- return atsru;
- }
-
- return NULL;
-}
-
-int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
-{
- struct acpi_dmar_atsr *atsr;
- struct dmar_atsr_unit *atsru;
-
- if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
- return 0;
-
- atsr = container_of(hdr, struct acpi_dmar_atsr, header);
- atsru = dmar_find_atsr(atsr);
- if (atsru)
- return 0;
-
- atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
- if (!atsru)
- return -ENOMEM;
-
- /*
- * If memory is allocated from slab by ACPI _DSM method, we need to
- * copy the memory content because the memory buffer will be freed
- * on return.
- */
- atsru->hdr = (void *)(atsru + 1);
- memcpy(atsru->hdr, hdr, hdr->length);
- atsru->include_all = atsr->flags & 0x1;
- if (!atsru->include_all) {
- atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
- (void *)atsr + atsr->header.length,
- &atsru->devices_cnt);
- if (atsru->devices_cnt && atsru->devices == NULL) {
- kfree(atsru);
- return -ENOMEM;
- }
- }
-
- list_add_rcu(&atsru->list, &dmar_atsr_units);
-
- return 0;
-}
-
-static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
-{
- dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
- kfree(atsru);
-}
-
-int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
-{
- struct acpi_dmar_atsr *atsr;
- struct dmar_atsr_unit *atsru;
-
- atsr = container_of(hdr, struct acpi_dmar_atsr, header);
- atsru = dmar_find_atsr(atsr);
- if (atsru) {
- list_del_rcu(&atsru->list);
- synchronize_rcu();
- intel_iommu_free_atsr(atsru);
- }
-
- return 0;
-}
-
-int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
-{
- int i;
- struct device *dev;
- struct acpi_dmar_atsr *atsr;
- struct dmar_atsr_unit *atsru;
-
- atsr = container_of(hdr, struct acpi_dmar_atsr, header);
- atsru = dmar_find_atsr(atsr);
- if (!atsru)
- return 0;
-
- if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
- for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
- i, dev)
- return -EBUSY;
- }
-
- return 0;
-}
-
-static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
-{
- int sp, ret;
- struct intel_iommu *iommu = dmaru->iommu;
-
- if (g_iommus[iommu->seq_id])
- return 0;
-
- if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
- pr_warn("%s: Doesn't support hardware pass through.\n",
- iommu->name);
- return -ENXIO;
- }
- if (!ecap_sc_support(iommu->ecap) &&
- domain_update_iommu_snooping(iommu)) {
- pr_warn("%s: Doesn't support snooping.\n",
- iommu->name);
- return -ENXIO;
- }
- sp = domain_update_iommu_superpage(NULL, iommu) - 1;
- if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
- pr_warn("%s: Doesn't support large page.\n",
- iommu->name);
- return -ENXIO;
- }
-
- /*
- * Disable translation if already enabled prior to OS handover.
- */
- if (iommu->gcmd & DMA_GCMD_TE)
- iommu_disable_translation(iommu);
-
- g_iommus[iommu->seq_id] = iommu;
- ret = iommu_init_domains(iommu);
- if (ret == 0)
- ret = iommu_alloc_root_entry(iommu);
- if (ret)
- goto out;
-
- intel_svm_check(iommu);
-
- if (dmaru->ignored) {
- /*
- * we always have to disable PMRs or DMA may fail on this device
- */
- if (force_on)
- iommu_disable_protect_mem_regions(iommu);
- return 0;
- }
-
- intel_iommu_init_qi(iommu);
- iommu_flush_write_buffer(iommu);
-
-#ifdef CONFIG_INTEL_IOMMU_SVM
- if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
- ret = intel_svm_enable_prq(iommu);
- if (ret)
- goto disable_iommu;
- }
-#endif
- ret = dmar_set_interrupt(iommu);
- if (ret)
- goto disable_iommu;
-
- iommu_set_root_entry(iommu);
- iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
- iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
- iommu_enable_translation(iommu);
-
- iommu_disable_protect_mem_regions(iommu);
- return 0;
-
-disable_iommu:
- disable_dmar_iommu(iommu);
-out:
- free_dmar_iommu(iommu);
- return ret;
-}
-
-int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
-{
- int ret = 0;
- struct intel_iommu *iommu = dmaru->iommu;
-
- if (!intel_iommu_enabled)
- return 0;
- if (iommu == NULL)
- return -EINVAL;
-
- if (insert) {
- ret = intel_iommu_add(dmaru);
- } else {
- disable_dmar_iommu(iommu);
- free_dmar_iommu(iommu);
- }
-
- return ret;
-}
-
-static void intel_iommu_free_dmars(void)
-{
- struct dmar_rmrr_unit *rmrru, *rmrr_n;
- struct dmar_atsr_unit *atsru, *atsr_n;
-
- list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
- list_del(&rmrru->list);
- dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
- kfree(rmrru);
- }
-
- list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
- list_del(&atsru->list);
- intel_iommu_free_atsr(atsru);
- }
-}
-
-int dmar_find_matched_atsr_unit(struct pci_dev *dev)
-{
- int i, ret = 1;
- struct pci_bus *bus;
- struct pci_dev *bridge = NULL;
- struct device *tmp;
- struct acpi_dmar_atsr *atsr;
- struct dmar_atsr_unit *atsru;
-
- dev = pci_physfn(dev);
- for (bus = dev->bus; bus; bus = bus->parent) {
- bridge = bus->self;
- /* If it's an integrated device, allow ATS */
- if (!bridge)
- return 1;
- /* Connected via non-PCIe: no ATS */
- if (!pci_is_pcie(bridge) ||
- pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
- return 0;
- /* If we found the root port, look it up in the ATSR */
- if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
- break;
- }
-
- rcu_read_lock();
- list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
- atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
- if (atsr->segment != pci_domain_nr(dev->bus))
- continue;
-
- for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
- if (tmp == &bridge->dev)
- goto out;
-
- if (atsru->include_all)
- goto out;
- }
- ret = 0;
-out:
- rcu_read_unlock();
-
- return ret;
-}
-
-int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
-{
- int ret;
- struct dmar_rmrr_unit *rmrru;
- struct dmar_atsr_unit *atsru;
- struct acpi_dmar_atsr *atsr;
- struct acpi_dmar_reserved_memory *rmrr;
-
- if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
- return 0;
-
- list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
- rmrr = container_of(rmrru->hdr,
- struct acpi_dmar_reserved_memory, header);
- if (info->event == BUS_NOTIFY_ADD_DEVICE) {
- ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
- ((void *)rmrr) + rmrr->header.length,
- rmrr->segment, rmrru->devices,
- rmrru->devices_cnt);
- if (ret < 0)
- return ret;
- } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
- dmar_remove_dev_scope(info, rmrr->segment,
- rmrru->devices, rmrru->devices_cnt);
- }
- }
-
- list_for_each_entry(atsru, &dmar_atsr_units, list) {
- if (atsru->include_all)
- continue;
-
- atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
- if (info->event == BUS_NOTIFY_ADD_DEVICE) {
- ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
- (void *)atsr + atsr->header.length,
- atsr->segment, atsru->devices,
- atsru->devices_cnt);
- if (ret > 0)
- break;
- else if (ret < 0)
- return ret;
- } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
- if (dmar_remove_dev_scope(info, atsr->segment,
- atsru->devices, atsru->devices_cnt))
- break;
- }
- }
-
- return 0;
-}
-
-static int intel_iommu_memory_notifier(struct notifier_block *nb,
- unsigned long val, void *v)
-{
- struct memory_notify *mhp = v;
- unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
- unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
- mhp->nr_pages - 1);
-
- switch (val) {
- case MEM_GOING_ONLINE:
- if (iommu_domain_identity_map(si_domain,
- start_vpfn, last_vpfn)) {
- pr_warn("Failed to build identity map for [%lx-%lx]\n",
- start_vpfn, last_vpfn);
- return NOTIFY_BAD;
- }
- break;
-
- case MEM_OFFLINE:
- case MEM_CANCEL_ONLINE:
- {
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- struct page *freelist;
-
- freelist = domain_unmap(si_domain,
- start_vpfn, last_vpfn);
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd)
- iommu_flush_iotlb_psi(iommu, si_domain,
- start_vpfn, mhp->nr_pages,
- !freelist, 0);
- rcu_read_unlock();
- dma_free_pagelist(freelist);
- }
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block intel_iommu_memory_nb = {
- .notifier_call = intel_iommu_memory_notifier,
- .priority = 0
-};
-
-static void free_all_cpu_cached_iovas(unsigned int cpu)
-{
- int i;
-
- for (i = 0; i < g_num_of_iommus; i++) {
- struct intel_iommu *iommu = g_iommus[i];
- struct dmar_domain *domain;
- int did;
-
- if (!iommu)
- continue;
-
- for (did = 0; did < cap_ndoms(iommu->cap); did++) {
- domain = get_iommu_domain(iommu, (u16)did);
-
- if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
- continue;
-
- free_cpu_cached_iovas(cpu, &domain->iovad);
- }
- }
-}
-
-static int intel_iommu_cpu_dead(unsigned int cpu)
-{
- free_all_cpu_cached_iovas(cpu);
- return 0;
-}
-
-static void intel_disable_iommus(void)
-{
- struct intel_iommu *iommu = NULL;
- struct dmar_drhd_unit *drhd;
-
- for_each_iommu(iommu, drhd)
- iommu_disable_translation(iommu);
-}
-
-void intel_iommu_shutdown(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu = NULL;
-
- if (no_iommu || dmar_disabled)
- return;
-
- down_write(&dmar_global_lock);
-
- /* Disable PMRs explicitly here. */
- for_each_iommu(iommu, drhd)
- iommu_disable_protect_mem_regions(iommu);
-
- /* Make sure the IOMMUs are switched off */
- intel_disable_iommus();
-
- up_write(&dmar_global_lock);
-}
-
-static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
-{
- struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
-
- return container_of(iommu_dev, struct intel_iommu, iommu);
-}
-
-static ssize_t intel_iommu_show_version(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- u32 ver = readl(iommu->reg + DMAR_VER_REG);
- return sprintf(buf, "%d:%d\n",
- DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
-}
-static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
-
-static ssize_t intel_iommu_show_address(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- return sprintf(buf, "%llx\n", iommu->reg_phys);
-}
-static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
-
-static ssize_t intel_iommu_show_cap(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- return sprintf(buf, "%llx\n", iommu->cap);
-}
-static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
-
-static ssize_t intel_iommu_show_ecap(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- return sprintf(buf, "%llx\n", iommu->ecap);
-}
-static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
-
-static ssize_t intel_iommu_show_ndoms(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
-}
-static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
-
-static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
- cap_ndoms(iommu->cap)));
-}
-static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
-
-static struct attribute *intel_iommu_attrs[] = {
- &dev_attr_version.attr,
- &dev_attr_address.attr,
- &dev_attr_cap.attr,
- &dev_attr_ecap.attr,
- &dev_attr_domains_supported.attr,
- &dev_attr_domains_used.attr,
- NULL,
-};
-
-static struct attribute_group intel_iommu_group = {
- .name = "intel-iommu",
- .attrs = intel_iommu_attrs,
-};
-
-const struct attribute_group *intel_iommu_groups[] = {
- &intel_iommu_group,
- NULL,
-};
-
-static inline bool has_untrusted_dev(void)
-{
- struct pci_dev *pdev = NULL;
-
- for_each_pci_dev(pdev)
- if (pdev->untrusted)
- return true;
-
- return false;
-}
-
-static int __init platform_optin_force_iommu(void)
-{
- if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
- return 0;
-
- if (no_iommu || dmar_disabled)
- pr_info("Intel-IOMMU force enabled due to platform opt in\n");
-
- /*
- * If Intel-IOMMU is disabled by default, we will apply identity
- * map for all devices except those marked as being untrusted.
- */
- if (dmar_disabled)
- iommu_set_default_passthrough(false);
-
- dmar_disabled = 0;
- no_iommu = 0;
-
- return 1;
-}
-
-static int __init probe_acpi_namespace_devices(void)
-{
- struct dmar_drhd_unit *drhd;
- /* To avoid a -Wunused-but-set-variable warning. */
- struct intel_iommu *iommu __maybe_unused;
- struct device *dev;
- int i, ret = 0;
-
- for_each_active_iommu(iommu, drhd) {
- for_each_active_dev_scope(drhd->devices,
- drhd->devices_cnt, i, dev) {
- struct acpi_device_physical_node *pn;
- struct iommu_group *group;
- struct acpi_device *adev;
-
- if (dev->bus != &acpi_bus_type)
- continue;
-
- adev = to_acpi_device(dev);
- mutex_lock(&adev->physical_node_lock);
- list_for_each_entry(pn,
- &adev->physical_node_list, node) {
- group = iommu_group_get(pn->dev);
- if (group) {
- iommu_group_put(group);
- continue;
- }
-
- pn->dev->bus->iommu_ops = &intel_iommu_ops;
- ret = iommu_probe_device(pn->dev);
- if (ret)
- break;
- }
- mutex_unlock(&adev->physical_node_lock);
-
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-int __init intel_iommu_init(void)
-{
- int ret = -ENODEV;
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
-
- /*
- * Intel IOMMU is required for a TXT/tboot launch or platform
- * opt in, so enforce that.
- */
- force_on = tboot_force_iommu() || platform_optin_force_iommu();
-
- if (iommu_init_mempool()) {
- if (force_on)
- panic("tboot: Failed to initialize iommu memory\n");
- return -ENOMEM;
- }
-
- down_write(&dmar_global_lock);
- if (dmar_table_init()) {
- if (force_on)
- panic("tboot: Failed to initialize DMAR table\n");
- goto out_free_dmar;
- }
-
- if (dmar_dev_scope_init() < 0) {
- if (force_on)
- panic("tboot: Failed to initialize DMAR device scope\n");
- goto out_free_dmar;
- }
-
- up_write(&dmar_global_lock);
-
- /*
- * The bus notifier takes the dmar_global_lock, so lockdep will
- * complain later when we register it under the lock.
- */
- dmar_register_bus_notifier();
-
- down_write(&dmar_global_lock);
-
- if (!no_iommu)
- intel_iommu_debugfs_init();
-
- if (no_iommu || dmar_disabled) {
- /*
- * We exit the function here to ensure IOMMU's remapping and
- * mempool aren't setup, which means that the IOMMU's PMRs
- * won't be disabled via the call to init_dmars(). So disable
- * it explicitly here. The PMRs were setup by tboot prior to
- * calling SENTER, but the kernel is expected to reset/tear
- * down the PMRs.
- */
- if (intel_iommu_tboot_noforce) {
- for_each_iommu(iommu, drhd)
- iommu_disable_protect_mem_regions(iommu);
- }
-
- /*
- * Make sure the IOMMUs are switched off, even when we
- * boot into a kexec kernel and the previous kernel left
- * them enabled
- */
- intel_disable_iommus();
- goto out_free_dmar;
- }
-
- if (list_empty(&dmar_rmrr_units))
- pr_info("No RMRR found\n");
-
- if (list_empty(&dmar_atsr_units))
- pr_info("No ATSR found\n");
-
- if (dmar_init_reserved_ranges()) {
- if (force_on)
- panic("tboot: Failed to reserve iommu ranges\n");
- goto out_free_reserved_range;
- }
-
- if (dmar_map_gfx)
- intel_iommu_gfx_mapped = 1;
-
- init_no_remapping_devices();
-
- ret = init_dmars();
- if (ret) {
- if (force_on)
- panic("tboot: Failed to initialize DMARs\n");
- pr_err("Initialization failed\n");
- goto out_free_reserved_range;
- }
- up_write(&dmar_global_lock);
-
- init_iommu_pm_ops();
-
- down_read(&dmar_global_lock);
- for_each_active_iommu(iommu, drhd) {
- iommu_device_sysfs_add(&iommu->iommu, NULL,
- intel_iommu_groups,
- "%s", iommu->name);
- iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
- iommu_device_register(&iommu->iommu);
- }
- up_read(&dmar_global_lock);
-
- bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
- if (si_domain && !hw_pass_through)
- register_memory_notifier(&intel_iommu_memory_nb);
- cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
- intel_iommu_cpu_dead);
-
- down_read(&dmar_global_lock);
- if (probe_acpi_namespace_devices())
- pr_warn("ACPI name space devices didn't probe correctly\n");
-
- /* Finally, we enable the DMA remapping hardware. */
- for_each_iommu(iommu, drhd) {
- if (!drhd->ignored && !translation_pre_enabled(iommu))
- iommu_enable_translation(iommu);
-
- iommu_disable_protect_mem_regions(iommu);
- }
- up_read(&dmar_global_lock);
-
- pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
-
- intel_iommu_enabled = 1;
-
- return 0;
-
-out_free_reserved_range:
- put_iova_domain(&reserved_iova_list);
-out_free_dmar:
- intel_iommu_free_dmars();
- up_write(&dmar_global_lock);
- iommu_exit_mempool();
- return ret;
-}
-
-static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
-{
- struct intel_iommu *iommu = opaque;
-
- domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
- return 0;
-}
-
-/*
- * NB - intel-iommu lacks any sort of reference counting for the users of
- * dependent devices. If multiple endpoints have intersecting dependent
- * devices, unbinding the driver from any one of them will possibly leave
- * the others unable to operate.
- */
-static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
-{
- if (!iommu || !dev || !dev_is_pci(dev))
- return;
-
- pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
-}
-
-static void __dmar_remove_one_dev_info(struct device_domain_info *info)
-{
- struct dmar_domain *domain;
- struct intel_iommu *iommu;
- unsigned long flags;
-
- assert_spin_locked(&device_domain_lock);
-
- if (WARN_ON(!info))
- return;
-
- iommu = info->iommu;
- domain = info->domain;
-
- if (info->dev) {
- if (dev_is_pci(info->dev) && sm_supported(iommu))
- intel_pasid_tear_down_entry(iommu, info->dev,
- PASID_RID2PASID, false);
-
- iommu_disable_dev_iotlb(info);
- if (!dev_is_real_dma_subdevice(info->dev))
- domain_context_clear(iommu, info->dev);
- intel_pasid_free_table(info->dev);
- }
-
- unlink_domain_info(info);
-
- spin_lock_irqsave(&iommu->lock, flags);
- domain_detach_iommu(domain, iommu);
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- free_devinfo_mem(info);
-}
-
-static void dmar_remove_one_dev_info(struct device *dev)
-{
- struct device_domain_info *info;
- unsigned long flags;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- info = get_domain_info(dev);
- if (info)
- __dmar_remove_one_dev_info(info);
- spin_unlock_irqrestore(&device_domain_lock, flags);
-}
-
-static int md_domain_init(struct dmar_domain *domain, int guest_width)
-{
- int adjust_width;
-
- /* calculate AGAW */
- domain->gaw = guest_width;
- adjust_width = guestwidth_to_adjustwidth(guest_width);
- domain->agaw = width_to_agaw(adjust_width);
-
- domain->iommu_coherency = 0;
- domain->iommu_snooping = 0;
- domain->iommu_superpage = 0;
- domain->max_addr = 0;
-
- /* always allocate the top pgd */
- domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
- if (!domain->pgd)
- return -ENOMEM;
- domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
- return 0;
-}
-
-static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
-{
- init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
- copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
-
- if (!intel_iommu_strict &&
- init_iova_flush_queue(&dmar_domain->iovad,
- iommu_flush_iova, iova_entry_free))
- pr_info("iova flush queue initialization failed\n");
-}
-
-static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
-{
- struct dmar_domain *dmar_domain;
- struct iommu_domain *domain;
-
- switch (type) {
- case IOMMU_DOMAIN_DMA:
- /* fallthrough */
- case IOMMU_DOMAIN_UNMANAGED:
- dmar_domain = alloc_domain(0);
- if (!dmar_domain) {
- pr_err("Can't allocate dmar_domain\n");
- return NULL;
- }
- if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
- pr_err("Domain initialization failed\n");
- domain_exit(dmar_domain);
- return NULL;
- }
-
- if (type == IOMMU_DOMAIN_DMA)
- intel_init_iova_domain(dmar_domain);
-
- domain_update_iommu_cap(dmar_domain);
-
- domain = &dmar_domain->domain;
- domain->geometry.aperture_start = 0;
- domain->geometry.aperture_end =
- __DOMAIN_MAX_ADDR(dmar_domain->gaw);
- domain->geometry.force_aperture = true;
-
- return domain;
- case IOMMU_DOMAIN_IDENTITY:
- return &si_domain->domain;
- default:
- return NULL;
- }
-
- return NULL;
-}
-
-static void intel_iommu_domain_free(struct iommu_domain *domain)
-{
- if (domain != &si_domain->domain)
- domain_exit(to_dmar_domain(domain));
-}
-
-/*
- * Check whether a @domain could be attached to the @dev through the
- * aux-domain attach/detach APIs.
- */
-static inline bool
-is_aux_domain(struct device *dev, struct iommu_domain *domain)
-{
- struct device_domain_info *info = get_domain_info(dev);
-
- return info && info->auxd_enabled &&
- domain->type == IOMMU_DOMAIN_UNMANAGED;
-}
-
-static void auxiliary_link_device(struct dmar_domain *domain,
- struct device *dev)
-{
- struct device_domain_info *info = get_domain_info(dev);
-
- assert_spin_locked(&device_domain_lock);
- if (WARN_ON(!info))
- return;
-
- domain->auxd_refcnt++;
- list_add(&domain->auxd, &info->auxiliary_domains);
-}
-
-static void auxiliary_unlink_device(struct dmar_domain *domain,
- struct device *dev)
-{
- struct device_domain_info *info = get_domain_info(dev);
-
- assert_spin_locked(&device_domain_lock);
- if (WARN_ON(!info))
- return;
-
- list_del(&domain->auxd);
- domain->auxd_refcnt--;
-
- if (!domain->auxd_refcnt && domain->default_pasid > 0)
- ioasid_free(domain->default_pasid);
-}
-
-static int aux_domain_add_dev(struct dmar_domain *domain,
- struct device *dev)
-{
- int ret;
- u8 bus, devfn;
- unsigned long flags;
- struct intel_iommu *iommu;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return -ENODEV;
-
- if (domain->default_pasid <= 0) {
- int pasid;
-
- /* No private data needed for the default pasid */
- pasid = ioasid_alloc(NULL, PASID_MIN,
- pci_max_pasids(to_pci_dev(dev)) - 1,
- NULL);
- if (pasid == INVALID_IOASID) {
- pr_err("Can't allocate default pasid\n");
- return -ENODEV;
- }
- domain->default_pasid = pasid;
- }
-
- spin_lock_irqsave(&device_domain_lock, flags);
- /*
- * iommu->lock must be held to attach domain to iommu and setup the
- * pasid entry for second level translation.
- */
- spin_lock(&iommu->lock);
- ret = domain_attach_iommu(domain, iommu);
- if (ret)
- goto attach_failed;
-
- /* Set up the PASID entry for mediated devices: */
- if (domain_use_first_level(domain))
- ret = domain_setup_first_level(iommu, domain, dev,
- domain->default_pasid);
- else
- ret = intel_pasid_setup_second_level(iommu, domain, dev,
- domain->default_pasid);
- if (ret)
- goto table_failed;
- spin_unlock(&iommu->lock);
-
- auxiliary_link_device(domain, dev);
-
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
- return 0;
-
-table_failed:
- domain_detach_iommu(domain, iommu);
-attach_failed:
- spin_unlock(&iommu->lock);
- spin_unlock_irqrestore(&device_domain_lock, flags);
- if (!domain->auxd_refcnt && domain->default_pasid > 0)
- ioasid_free(domain->default_pasid);
-
- return ret;
-}
-
-static void aux_domain_remove_dev(struct dmar_domain *domain,
- struct device *dev)
-{
- struct device_domain_info *info;
- struct intel_iommu *iommu;
- unsigned long flags;
-
- if (!is_aux_domain(dev, &domain->domain))
- return;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- info = get_domain_info(dev);
- iommu = info->iommu;
-
- auxiliary_unlink_device(domain, dev);
-
- spin_lock(&iommu->lock);
- intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
- domain_detach_iommu(domain, iommu);
- spin_unlock(&iommu->lock);
-
- spin_unlock_irqrestore(&device_domain_lock, flags);
-}
-
-static int prepare_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct intel_iommu *iommu;
- int addr_width;
- u8 bus, devfn;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return -ENODEV;
-
- /* check if this iommu agaw is sufficient for max mapped address */
- addr_width = agaw_to_width(iommu->agaw);
- if (addr_width > cap_mgaw(iommu->cap))
- addr_width = cap_mgaw(iommu->cap);
-
- if (dmar_domain->max_addr > (1LL << addr_width)) {
- dev_err(dev, "%s: iommu width (%d) is not "
- "sufficient for the mapped address (%llx)\n",
- __func__, addr_width, dmar_domain->max_addr);
- return -EFAULT;
- }
- dmar_domain->gaw = addr_width;
-
- /*
- * Knock out extra levels of page tables if necessary
- */
- while (iommu->agaw < dmar_domain->agaw) {
- struct dma_pte *pte;
-
- pte = dmar_domain->pgd;
- if (dma_pte_present(pte)) {
- dmar_domain->pgd = (struct dma_pte *)
- phys_to_virt(dma_pte_addr(pte));
- free_pgtable_page(pte);
- }
- dmar_domain->agaw--;
- }
-
- return 0;
-}
-
-static int intel_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
-{
- int ret;
-
- if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
- device_is_rmrr_locked(dev)) {
- dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
- return -EPERM;
- }
-
- if (is_aux_domain(dev, domain))
- return -EPERM;
-
- /* normally dev is not mapped */
- if (unlikely(domain_context_mapped(dev))) {
- struct dmar_domain *old_domain;
-
- old_domain = find_domain(dev);
- if (old_domain)
- dmar_remove_one_dev_info(dev);
- }
-
- ret = prepare_domain_attach_device(domain, dev);
- if (ret)
- return ret;
-
- return domain_add_dev_info(to_dmar_domain(domain), dev);
-}
-
-static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
- struct device *dev)
-{
- int ret;
-
- if (!is_aux_domain(dev, domain))
- return -EPERM;
-
- ret = prepare_domain_attach_device(domain, dev);
- if (ret)
- return ret;
-
- return aux_domain_add_dev(to_dmar_domain(domain), dev);
-}
-
-static void intel_iommu_detach_device(struct iommu_domain *domain,
- struct device *dev)
-{
- dmar_remove_one_dev_info(dev);
-}
-
-static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
- struct device *dev)
-{
- aux_domain_remove_dev(to_dmar_domain(domain), dev);
-}
-
-/*
- * 2D array for converting and sanitizing IOMMU generic TLB granularity to
- * VT-d granularity. Invalidation is typically included in the unmap operation
- * as a result of DMA or VFIO unmap. However, for assigned devices the guest
- * owns the first-level page tables. Invalidations of translation caches in the
- * guest are trapped and passed down to the host.
- *
- * vIOMMU in the guest will only expose first-level page tables, therefore
- * we do not support IOTLB granularity for requests without PASID (second level).
- *
- * For example, to find the VT-d granularity encoding for IOTLB
- * type and page selective granularity within PASID:
- * X: indexed by iommu cache type
- * Y: indexed by enum iommu_inv_granularity
- * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
- */
-
-static const int
-inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
- /*
- * PASID based IOTLB invalidation: PASID selective (per PASID),
- * page selective (address granularity)
- */
- {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
- /* PASID based dev TLBs */
- {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
- /* PASID cache */
- {-EINVAL, -EINVAL, -EINVAL}
-};
-
-static inline int to_vtd_granularity(int type, int granu)
-{
- return inv_type_granu_table[type][granu];
-}
-
-static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
-{
- u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
-
- /* VT-d size is encoded as 2^size 4K pages: 0 for 4K, 9 for 2MB, etc.
- * The IOMMU cache invalidate API passes granu_size in bytes, and the
- * number of granules of that size in contiguous memory.
- */
- return order_base_2(nr_pages);
-}
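
The encoding above is just the base-2 order of the page count. A standalone sketch of the same arithmetic, assuming VTD_PAGE_SHIFT is 12 and with order_base_2() reimplemented locally for illustration:

#include <stdint.h>
#include <stdio.h>

#define VTD_PAGE_SHIFT 12

/* Local stand-in for the kernel's order_base_2(): ceil(log2(n)), with 1 -> 0. */
static unsigned int order_base_2(uint64_t n)
{
	unsigned int order = 0;

	while ((1ULL << order) < n)
		order++;
	return order;
}

/* Same math as to_vtd_size(): bytes -> number of 4K pages -> power-of-two order. */
static uint64_t vtd_size_order(uint64_t granu_size, uint64_t nr_granules)
{
	uint64_t nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;

	return order_base_2(nr_pages);
}

int main(void)
{
	/* 512 granules of 4KiB = 2MiB -> order 9, matching the comment above. */
	printf("%llu\n", (unsigned long long)vtd_size_order(4096, 512));
	/* A single 4KiB granule -> order 0. */
	printf("%llu\n", (unsigned long long)vtd_size_order(4096, 1));
	return 0;
}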
-
-#ifdef CONFIG_INTEL_IOMMU_SVM
-static int
-intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
- struct iommu_cache_invalidate_info *inv_info)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct device_domain_info *info;
- struct intel_iommu *iommu;
- unsigned long flags;
- int cache_type;
- u8 bus, devfn;
- u16 did, sid;
- int ret = 0;
- u64 size = 0;
-
- if (!inv_info || !dmar_domain ||
- inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
- return -EINVAL;
-
- if (!dev || !dev_is_pci(dev))
- return -ENODEV;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return -ENODEV;
-
- if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
- return -EINVAL;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- spin_lock(&iommu->lock);
- info = get_domain_info(dev);
- if (!info) {
- ret = -EINVAL;
- goto out_unlock;
- }
- did = dmar_domain->iommu_did[iommu->seq_id];
- sid = PCI_DEVID(bus, devfn);
-
- /* Size is only valid in address selective invalidation */
- if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
- size = to_vtd_size(inv_info->addr_info.granule_size,
- inv_info->addr_info.nb_granules);
-
- for_each_set_bit(cache_type,
- (unsigned long *)&inv_info->cache,
- IOMMU_CACHE_INV_TYPE_NR) {
- int granu = 0;
- u64 pasid = 0;
-
- granu = to_vtd_granularity(cache_type, inv_info->granularity);
- if (granu == -EINVAL) {
- pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
- cache_type, inv_info->granularity);
- break;
- }
-
- /*
- * PASID is stored in different locations based on the
- * granularity.
- */
- if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
- (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
- pasid = inv_info->pasid_info.pasid;
- else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
- (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
- pasid = inv_info->addr_info.pasid;
-
- switch (BIT(cache_type)) {
- case IOMMU_CACHE_INV_TYPE_IOTLB:
- if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
- size &&
- (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
- pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n",
- inv_info->addr_info.addr, size);
- ret = -ERANGE;
- goto out_unlock;
- }
-
- /*
- * If granu is PASID-selective, address is ignored.
- * We use npages = -1 to indicate that.
- */
- qi_flush_piotlb(iommu, did, pasid,
- mm_to_dma_pfn(inv_info->addr_info.addr),
- (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
- inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
-
- /*
- * Always flush device IOTLB if ATS is enabled. vIOMMU
- * in the guest may assume IOTLB flush is inclusive,
- * which is more efficient.
- */
- if (info->ats_enabled)
- qi_flush_dev_iotlb_pasid(iommu, sid,
- info->pfsid, pasid,
- info->ats_qdep,
- inv_info->addr_info.addr,
- size, granu);
- break;
- case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
- if (info->ats_enabled)
- qi_flush_dev_iotlb_pasid(iommu, sid,
- info->pfsid, pasid,
- info->ats_qdep,
- inv_info->addr_info.addr,
- size, granu);
- else
- pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
- break;
- default:
- dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
- cache_type);
- ret = -EINVAL;
- }
- }
-out_unlock:
- spin_unlock(&iommu->lock);
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
- return ret;
-}
-#endif
-
-static int intel_iommu_map(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t hpa,
- size_t size, int iommu_prot, gfp_t gfp)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- u64 max_addr;
- int prot = 0;
- int ret;
-
- if (iommu_prot & IOMMU_READ)
- prot |= DMA_PTE_READ;
- if (iommu_prot & IOMMU_WRITE)
- prot |= DMA_PTE_WRITE;
- if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
- prot |= DMA_PTE_SNP;
-
- max_addr = iova + size;
- if (dmar_domain->max_addr < max_addr) {
- u64 end;
-
- /* check if minimum agaw is sufficient for mapped address */
- end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
- if (end < max_addr) {
- pr_err("%s: iommu width (%d) is not "
- "sufficient for the mapped address (%llx)\n",
- __func__, dmar_domain->gaw, max_addr);
- return -EFAULT;
- }
- dmar_domain->max_addr = max_addr;
- }
- /* Round up size to next multiple of PAGE_SIZE, if it and
- the low bits of hpa would take us onto the next page */
- size = aligned_nrpages(hpa, size);
- ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
- hpa >> VTD_PAGE_SHIFT, size, prot);
- return ret;
-}
-
-static size_t intel_iommu_unmap(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- struct iommu_iotlb_gather *gather)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct page *freelist = NULL;
- unsigned long start_pfn, last_pfn;
- unsigned int npages;
- int iommu_id, level = 0;
-
- /* Cope with horrid API which requires us to unmap more than the
- size argument if it happens to be a large-page mapping. */
- BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
-
- if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
- size = VTD_PAGE_SIZE << level_to_offset_bits(level);
-
- start_pfn = iova >> VTD_PAGE_SHIFT;
- last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
-
- freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
-
- npages = last_pfn - start_pfn + 1;
-
- for_each_domain_iommu(iommu_id, dmar_domain)
- iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
- start_pfn, npages, !freelist, 0);
-
- dma_free_pagelist(freelist);
-
- if (dmar_domain->max_addr == iova + size)
- dmar_domain->max_addr = iova;
-
- return size;
-}
-
-static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct dma_pte *pte;
- int level = 0;
- u64 phys = 0;
-
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
- if (pte && dma_pte_present(pte))
- phys = dma_pte_addr(pte) +
- (iova & (BIT_MASK(level_to_offset_bits(level) +
- VTD_PAGE_SHIFT) - 1));
-
- return phys;
-}
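
The lookup above keeps every IOVA bit that falls below the level of the leaf entry, so a 4K leaf contributes a 12-bit offset while a 2MiB superpage leaf contributes 21 bits. A minimal sketch of that reconstruction with hypothetical addresses, assuming LEVEL_STRIDE is 9, VTD_PAGE_SHIFT is 12, and that pte_addr has already been masked down to the frame base:

#include <stdint.h>
#include <stdio.h>

#define VTD_PAGE_SHIFT	12
#define LEVEL_STRIDE	9

/* Offset bits contributed below a leaf at the given level (1 = 4K, 2 = 2M, 3 = 1G). */
static int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

/* Mirror of the phys computation above: frame base plus in-mapping IOVA offset. */
static uint64_t leaf_to_phys(uint64_t pte_addr, uint64_t iova, int level)
{
	uint64_t mask = (1ULL << (level_to_offset_bits(level) + VTD_PAGE_SHIFT)) - 1;

	return pte_addr + (iova & mask);
}

int main(void)
{
	/* 4K leaf: keeps the low 12 bits of the IOVA -> 0x12345567. */
	printf("%#llx\n", (unsigned long long)leaf_to_phys(0x12345000, 0xabcd0567, 1));
	/* 2MiB superpage leaf: keeps the low 21 bits -> 0x80034567. */
	printf("%#llx\n", (unsigned long long)leaf_to_phys(0x80000000, 0x01234567, 2));
	return 0;
}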
-
-static inline bool scalable_mode_support(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- bool ret = true;
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (!sm_supported(iommu)) {
- ret = false;
- break;
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool iommu_pasid_support(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- bool ret = true;
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (!pasid_supported(iommu)) {
- ret = false;
- break;
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool nested_mode_support(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- bool ret = true;
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
- ret = false;
- break;
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static bool intel_iommu_capable(enum iommu_cap cap)
-{
- if (cap == IOMMU_CAP_CACHE_COHERENCY)
- return domain_update_iommu_snooping(NULL) == 1;
- if (cap == IOMMU_CAP_INTR_REMAP)
- return irq_remapping_enabled == 1;
-
- return false;
-}
-
-static struct iommu_device *intel_iommu_probe_device(struct device *dev)
-{
- struct intel_iommu *iommu;
- u8 bus, devfn;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return ERR_PTR(-ENODEV);
-
- if (translation_pre_enabled(iommu))
- dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
-
- return &iommu->iommu;
-}
-
-static void intel_iommu_release_device(struct device *dev)
-{
- struct intel_iommu *iommu;
- u8 bus, devfn;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return;
-
- dmar_remove_one_dev_info(dev);
-
- set_dma_ops(dev, NULL);
-}
-
-static void intel_iommu_probe_finalize(struct device *dev)
-{
- struct iommu_domain *domain;
-
- domain = iommu_get_domain_for_dev(dev);
- if (device_needs_bounce(dev))
- set_dma_ops(dev, &bounce_dma_ops);
- else if (domain && domain->type == IOMMU_DOMAIN_DMA)
- set_dma_ops(dev, &intel_dma_ops);
- else
- set_dma_ops(dev, NULL);
-}
-
-static void intel_iommu_get_resv_regions(struct device *device,
- struct list_head *head)
-{
- int prot = DMA_PTE_READ | DMA_PTE_WRITE;
- struct iommu_resv_region *reg;
- struct dmar_rmrr_unit *rmrr;
- struct device *i_dev;
- int i;
-
- down_read(&dmar_global_lock);
- for_each_rmrr_units(rmrr) {
- for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
- i, i_dev) {
- struct iommu_resv_region *resv;
- enum iommu_resv_type type;
- size_t length;
-
- if (i_dev != device &&
- !is_downstream_to_pci_bridge(device, i_dev))
- continue;
-
- length = rmrr->end_address - rmrr->base_address + 1;
-
- type = device_rmrr_is_relaxable(device) ?
- IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
-
- resv = iommu_alloc_resv_region(rmrr->base_address,
- length, prot, type);
- if (!resv)
- break;
-
- list_add_tail(&resv->list, head);
- }
- }
- up_read(&dmar_global_lock);
-
-#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
- if (dev_is_pci(device)) {
- struct pci_dev *pdev = to_pci_dev(device);
-
- if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
- reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
- IOMMU_RESV_DIRECT_RELAXABLE);
- if (reg)
- list_add_tail(&reg->list, head);
- }
- }
-#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
-
- reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
- IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
- 0, IOMMU_RESV_MSI);
- if (!reg)
- return;
- list_add_tail(&reg->list, head);
-}
-
-int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
-{
- struct device_domain_info *info;
- struct context_entry *context;
- struct dmar_domain *domain;
- unsigned long flags;
- u64 ctx_lo;
- int ret;
-
- domain = find_domain(dev);
- if (!domain)
- return -EINVAL;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- spin_lock(&iommu->lock);
-
- ret = -EINVAL;
- info = get_domain_info(dev);
- if (!info || !info->pasid_supported)
- goto out;
-
- context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
- if (WARN_ON(!context))
- goto out;
-
- ctx_lo = context[0].lo;
-
- if (!(ctx_lo & CONTEXT_PASIDE)) {
- ctx_lo |= CONTEXT_PASIDE;
- context[0].lo = ctx_lo;
- wmb();
- iommu->flush.flush_context(iommu,
- domain->iommu_did[iommu->seq_id],
- PCI_DEVID(info->bus, info->devfn),
- DMA_CCMD_MASK_NOBIT,
- DMA_CCMD_DEVICE_INVL);
- }
-
- /* Enable PASID support in the device, if it wasn't already */
- if (!info->pasid_enabled)
- iommu_enable_dev_iotlb(info);
-
- ret = 0;
-
- out:
- spin_unlock(&iommu->lock);
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
- return ret;
-}
-
-static void intel_iommu_apply_resv_region(struct device *dev,
- struct iommu_domain *domain,
- struct iommu_resv_region *region)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long start, end;
-
- start = IOVA_PFN(region->start);
- end = IOVA_PFN(region->start + region->length - 1);
-
- WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
-}
-
-static struct iommu_group *intel_iommu_device_group(struct device *dev)
-{
- if (dev_is_pci(dev))
- return pci_device_group(dev);
- return generic_device_group(dev);
-}
-
-#ifdef CONFIG_INTEL_IOMMU_SVM
-struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
-{
- struct intel_iommu *iommu;
- u8 bus, devfn;
-
- if (iommu_dummy(dev)) {
- dev_warn(dev,
- "No IOMMU translation for device; cannot enable SVM\n");
- return NULL;
- }
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu) {
- dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
- return NULL;
- }
-
- return iommu;
-}
-#endif /* CONFIG_INTEL_IOMMU_SVM */
-
-static int intel_iommu_enable_auxd(struct device *dev)
-{
- struct device_domain_info *info;
- struct intel_iommu *iommu;
- unsigned long flags;
- u8 bus, devfn;
- int ret;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu || dmar_disabled)
- return -EINVAL;
-
- if (!sm_supported(iommu) || !pasid_supported(iommu))
- return -EINVAL;
-
- ret = intel_iommu_enable_pasid(iommu, dev);
- if (ret)
- return -ENODEV;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- info = get_domain_info(dev);
- info->auxd_enabled = 1;
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
- return 0;
-}
-
-static int intel_iommu_disable_auxd(struct device *dev)
-{
- struct device_domain_info *info;
- unsigned long flags;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- info = get_domain_info(dev);
- if (!WARN_ON(!info))
- info->auxd_enabled = 0;
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
- return 0;
-}
-
-/*
- * A PCI Express Designated Vendor-Specific Extended Capability is defined
- * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
- * for system software and tools to detect endpoint devices supporting
- * Intel Scalable I/O Virtualization without a host driver dependency.
- *
- * Returns the address of the matching extended capability structure within
- * the device's PCI configuration space or 0 if the device does not support
- * it.
- */
-static int siov_find_pci_dvsec(struct pci_dev *pdev)
-{
- int pos;
- u16 vendor, id;
-
- pos = pci_find_next_ext_capability(pdev, 0, 0x23);
- while (pos) {
- pci_read_config_word(pdev, pos + 4, &vendor);
- pci_read_config_word(pdev, pos + 8, &id);
- if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
- return pos;
-
- pos = pci_find_next_ext_capability(pdev, pos, 0x23);
- }
-
- return 0;
-}
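
The same walk can be exercised in user space against a raw snapshot of a device's extended configuration space (for example the bytes read from the sysfs config file). The header layouts used here are the standard PCIe extended-capability and DVSEC formats; the buffer contents, helper names and the SIOV_DVSEC_ID constant are illustrative assumptions:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PCI_EXT_CAP_ID_DVSEC	0x23
#define PCI_VENDOR_ID_INTEL	0x8086
#define SIOV_DVSEC_ID		5

/* Read a 32-bit dword from the config snapshot (little-endian host assumed). */
static uint32_t cfg_read32(const uint8_t *cfg, size_t len, unsigned int off)
{
	uint32_t v = 0;

	if (off + 4 <= len)
		memcpy(&v, cfg + off, 4);
	return v;
}

/* Walk the extended capability list (it starts at 0x100) for the SIOV DVSEC.
 * Malformed or looping capability lists are not guarded against in this sketch. */
static unsigned int find_siov_dvsec(const uint8_t *cfg, size_t len)
{
	unsigned int pos = 0x100;

	while (pos && pos + 12 <= len) {
		uint32_t hdr = cfg_read32(cfg, len, pos);

		if ((hdr & 0xffff) == PCI_EXT_CAP_ID_DVSEC) {
			uint16_t vendor = cfg_read32(cfg, len, pos + 4) & 0xffff;
			uint16_t id = cfg_read32(cfg, len, pos + 8) & 0xffff;

			if (vendor == PCI_VENDOR_ID_INTEL && id == SIOV_DVSEC_ID)
				return pos;
		}
		pos = hdr >> 20;	/* next capability offset */
	}
	return 0;
}

int main(void)
{
	uint8_t cfg[4096] = { 0 };
	uint32_t hdr = PCI_EXT_CAP_ID_DVSEC | (1u << 16);	/* cap 0x23, v1, no next */
	uint32_t dvsec1 = PCI_VENDOR_ID_INTEL | (0x10u << 20);	/* Intel vendor, length 0x10 */
	uint32_t dvsec2 = SIOV_DVSEC_ID;

	memcpy(cfg + 0x100, &hdr, 4);
	memcpy(cfg + 0x104, &dvsec1, 4);
	memcpy(cfg + 0x108, &dvsec2, 4);

	printf("%#x\n", find_siov_dvsec(cfg, sizeof(cfg)));	/* prints 0x100 */
	return 0;
}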
-
-static bool
-intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
-{
- if (feat == IOMMU_DEV_FEAT_AUX) {
- int ret;
-
- if (!dev_is_pci(dev) || dmar_disabled ||
- !scalable_mode_support() || !iommu_pasid_support())
- return false;
-
- ret = pci_pasid_features(to_pci_dev(dev));
- if (ret < 0)
- return false;
-
- return !!siov_find_pci_dvsec(to_pci_dev(dev));
- }
-
- if (feat == IOMMU_DEV_FEAT_SVA) {
- struct device_domain_info *info = get_domain_info(dev);
-
- return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
- info->pasid_supported && info->pri_supported &&
- info->ats_supported;
- }
-
- return false;
-}
-
-static int
-intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
-{
- if (feat == IOMMU_DEV_FEAT_AUX)
- return intel_iommu_enable_auxd(dev);
-
- if (feat == IOMMU_DEV_FEAT_SVA) {
- struct device_domain_info *info = get_domain_info(dev);
-
- if (!info)
- return -EINVAL;
-
- if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
- return 0;
- }
-
- return -ENODEV;
-}
-
-static int
-intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
-{
- if (feat == IOMMU_DEV_FEAT_AUX)
- return intel_iommu_disable_auxd(dev);
-
- return -ENODEV;
-}
-
-static bool
-intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
-{
- struct device_domain_info *info = get_domain_info(dev);
-
- if (feat == IOMMU_DEV_FEAT_AUX)
- return scalable_mode_support() && info && info->auxd_enabled;
-
- return false;
-}
-
-static int
-intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
-
- return dmar_domain->default_pasid > 0 ?
- dmar_domain->default_pasid : -EINVAL;
-}
-
-static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
- struct device *dev)
-{
- return attach_deferred(dev);
-}
-
-static int
-intel_iommu_domain_set_attr(struct iommu_domain *domain,
- enum iommu_attr attr, void *data)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long flags;
- int ret = 0;
-
- if (domain->type != IOMMU_DOMAIN_UNMANAGED)
- return -EINVAL;
-
- switch (attr) {
- case DOMAIN_ATTR_NESTING:
- spin_lock_irqsave(&device_domain_lock, flags);
- if (nested_mode_support() &&
- list_empty(&dmar_domain->devices)) {
- dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
- dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
- } else {
- ret = -ENODEV;
- }
- spin_unlock_irqrestore(&device_domain_lock, flags);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-const struct iommu_ops intel_iommu_ops = {
- .capable = intel_iommu_capable,
- .domain_alloc = intel_iommu_domain_alloc,
- .domain_free = intel_iommu_domain_free,
- .domain_set_attr = intel_iommu_domain_set_attr,
- .attach_dev = intel_iommu_attach_device,
- .detach_dev = intel_iommu_detach_device,
- .aux_attach_dev = intel_iommu_aux_attach_device,
- .aux_detach_dev = intel_iommu_aux_detach_device,
- .aux_get_pasid = intel_iommu_aux_get_pasid,
- .map = intel_iommu_map,
- .unmap = intel_iommu_unmap,
- .iova_to_phys = intel_iommu_iova_to_phys,
- .probe_device = intel_iommu_probe_device,
- .probe_finalize = intel_iommu_probe_finalize,
- .release_device = intel_iommu_release_device,
- .get_resv_regions = intel_iommu_get_resv_regions,
- .put_resv_regions = generic_iommu_put_resv_regions,
- .apply_resv_region = intel_iommu_apply_resv_region,
- .device_group = intel_iommu_device_group,
- .dev_has_feat = intel_iommu_dev_has_feat,
- .dev_feat_enabled = intel_iommu_dev_feat_enabled,
- .dev_enable_feat = intel_iommu_dev_enable_feat,
- .dev_disable_feat = intel_iommu_dev_disable_feat,
- .is_attach_deferred = intel_iommu_is_attach_deferred,
- .def_domain_type = device_def_domain_type,
- .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
-#ifdef CONFIG_INTEL_IOMMU_SVM
- .cache_invalidate = intel_iommu_sva_invalidate,
- .sva_bind_gpasid = intel_svm_bind_gpasid,
- .sva_unbind_gpasid = intel_svm_unbind_gpasid,
- .sva_bind = intel_svm_bind,
- .sva_unbind = intel_svm_unbind,
- .sva_get_pasid = intel_svm_get_pasid,
-#endif
-};
-
-static void quirk_iommu_igfx(struct pci_dev *dev)
-{
- pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
- dmar_map_gfx = 0;
-}
-
-/* G4x/GM45 integrated gfx dmar support is totally busted. */
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
-
-/* Broadwell igfx malfunctions with dmar */
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
-
-static void quirk_iommu_rwbf(struct pci_dev *dev)
-{
- /*
- * Mobile 4 Series Chipset neglects to set RWBF capability,
- * but needs it. Same seems to hold for the desktop versions.
- */
- pci_info(dev, "Forcing write-buffer flush capability\n");
- rwbf_quirk = 1;
-}
-
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
-
-#define GGC 0x52
-#define GGC_MEMORY_SIZE_MASK (0xf << 8)
-#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
-#define GGC_MEMORY_SIZE_1M (0x1 << 8)
-#define GGC_MEMORY_SIZE_2M (0x3 << 8)
-#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
-#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
-#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
-#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
-
-static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
-{
- unsigned short ggc;
-
- if (pci_read_config_word(dev, GGC, &ggc))
- return;
-
- if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
- pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
- dmar_map_gfx = 0;
- } else if (dmar_map_gfx) {
- /* we have to ensure the gfx device is idle before we flush */
- pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
- intel_iommu_strict = 1;
- }
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
-
-/* On Tylersburg chipsets, some BIOSes have been known to enable the
- ISOCH DMAR unit for the Azalia sound device, but not give it any
- TLB entries, which causes it to deadlock. Check for that. We do
- this in a function called from init_dmars(), instead of in a PCI
- quirk, because we don't want to print the obnoxious "BIOS broken"
- message if VT-d is actually disabled.
-*/
-static void __init check_tylersburg_isoch(void)
-{
- struct pci_dev *pdev;
- uint32_t vtisochctrl;
-
- /* If there's no Azalia in the system anyway, forget it. */
- pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
- if (!pdev)
- return;
- pci_dev_put(pdev);
-
- /* System Management Registers. Might be hidden, in which case
- we can't do the sanity check. But that's OK, because the
- known-broken BIOSes _don't_ actually hide it, so far. */
- pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
- if (!pdev)
- return;
-
- if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
- pci_dev_put(pdev);
- return;
- }
-
- pci_dev_put(pdev);
-
- /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
- if (vtisochctrl & 1)
- return;
-
- /* Drop all bits other than the number of TLB entries */
- vtisochctrl &= 0x1c;
-
- /* If we have the recommended number of TLB entries (16), fine. */
- if (vtisochctrl == 0x10)
- return;
-
- /* Zero TLB entries? You get to ride the short bus to school. */
- if (!vtisochctrl) {
- WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
- "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
- dmi_get_system_info(DMI_BIOS_VENDOR),
- dmi_get_system_info(DMI_BIOS_VERSION),
- dmi_get_system_info(DMI_PRODUCT_VERSION));
- iommu_identity_mapping |= IDENTMAP_AZALIA;
- return;
- }
-
- pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
- vtisochctrl);
-}
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-/*
- * intel-pasid.c - PASID idr, table and entry manipulation
- *
- * Copyright (C) 2018 Intel Corporation
- *
- * Author: Lu Baolu <baolu.lu@linux.intel.com>
- */
-
-#define pr_fmt(fmt) "DMAR: " fmt
-
-#include <linux/bitops.h>
-#include <linux/cpufeature.h>
-#include <linux/dmar.h>
-#include <linux/intel-iommu.h>
-#include <linux/iommu.h>
-#include <linux/memory.h>
-#include <linux/pci.h>
-#include <linux/pci-ats.h>
-#include <linux/spinlock.h>
-
-#include "intel-pasid.h"
-
-/*
- * Intel IOMMU system wide PASID name space:
- */
-static DEFINE_SPINLOCK(pasid_lock);
-u32 intel_pasid_max_id = PASID_MAX;
-
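-/*
- * PASID allocation through the virtual command interface. This is the
- * enlightened PASID management path (see the VCMD_* definitions), where
- * the PASID name space is owned by the IOMMU (typically a virtual one)
- * rather than managed locally by the driver.
- */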
-int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid)
-{
- unsigned long flags;
- u8 status_code;
- int ret = 0;
- u64 res;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flags);
- dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC);
- IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq,
- !(res & VCMD_VRSP_IP), res);
- raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-
- status_code = VCMD_VRSP_SC(res);
- switch (status_code) {
- case VCMD_VRSP_SC_SUCCESS:
- *pasid = VCMD_VRSP_RESULT_PASID(res);
- break;
- case VCMD_VRSP_SC_NO_PASID_AVAIL:
- pr_info("IOMMU: %s: No PASID available\n", iommu->name);
- ret = -ENOSPC;
- break;
- default:
- ret = -ENODEV;
- pr_warn("IOMMU: %s: Unexpected error code %d\n",
- iommu->name, status_code);
- }
-
- return ret;
-}
-
-void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid)
-{
- unsigned long flags;
- u8 status_code;
- u64 res;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flags);
- dmar_writeq(iommu->reg + DMAR_VCMD_REG,
- VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE);
- IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq,
- !(res & VCMD_VRSP_IP), res);
- raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-
- status_code = VCMD_VRSP_SC(res);
- switch (status_code) {
- case VCMD_VRSP_SC_SUCCESS:
- break;
- case VCMD_VRSP_SC_INVALID_PASID:
- pr_info("IOMMU: %s: Invalid PASID\n", iommu->name);
- break;
- default:
- pr_warn("IOMMU: %s: Unexpected error code %d\n",
- iommu->name, status_code);
- }
-}
-
-/*
- * Per device pasid table management:
- */
-static inline void
-device_attach_pasid_table(struct device_domain_info *info,
- struct pasid_table *pasid_table)
-{
- info->pasid_table = pasid_table;
- list_add(&info->table, &pasid_table->dev);
-}
-
-static inline void
-device_detach_pasid_table(struct device_domain_info *info,
- struct pasid_table *pasid_table)
-{
- info->pasid_table = NULL;
- list_del(&info->table);
-}
-
-struct pasid_table_opaque {
- struct pasid_table **pasid_table;
- int segment;
- int bus;
- int devfn;
-};
-
-static int search_pasid_table(struct device_domain_info *info, void *opaque)
-{
- struct pasid_table_opaque *data = opaque;
-
- if (info->iommu->segment == data->segment &&
- info->bus == data->bus &&
- info->devfn == data->devfn &&
- info->pasid_table) {
- *data->pasid_table = info->pasid_table;
- return 1;
- }
-
- return 0;
-}
-
-static int get_alias_pasid_table(struct pci_dev *pdev, u16 alias, void *opaque)
-{
- struct pasid_table_opaque *data = opaque;
-
- data->segment = pci_domain_nr(pdev->bus);
- data->bus = PCI_BUS_NUM(alias);
- data->devfn = alias & 0xff;
-
- return for_each_device_domain(&search_pasid_table, data);
-}
-
-/*
- * Allocate a pasid table for @dev. It should be called in a
- * single-threaded context.
- */
-int intel_pasid_alloc_table(struct device *dev)
-{
- struct device_domain_info *info;
- struct pasid_table *pasid_table;
- struct pasid_table_opaque data;
- struct page *pages;
- int max_pasid = 0;
- int ret, order;
- int size;
-
- might_sleep();
- info = get_domain_info(dev);
- if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table))
- return -EINVAL;
-
- /* DMA alias device already has a pasid table, use it: */
- data.pasid_table = &pasid_table;
- ret = pci_for_each_dma_alias(to_pci_dev(dev),
- &get_alias_pasid_table, &data);
- if (ret)
- goto attach_out;
-
- pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL);
- if (!pasid_table)
- return -ENOMEM;
- INIT_LIST_HEAD(&pasid_table->dev);
-
- if (info->pasid_supported)
- max_pasid = min_t(int, pci_max_pasids(to_pci_dev(dev)),
- intel_pasid_max_id);
-
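-	/*
-	 * Each PASID directory entry is 8 bytes and covers
-	 * 1 << PASID_PDE_SHIFT (64) PASIDs, so the directory needs
-	 * max_pasid >> (PASID_PDE_SHIFT - 3) bytes.
-	 */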
- size = max_pasid >> (PASID_PDE_SHIFT - 3);
- order = size ? get_order(size) : 0;
- pages = alloc_pages_node(info->iommu->node,
- GFP_KERNEL | __GFP_ZERO, order);
- if (!pages) {
- kfree(pasid_table);
- return -ENOMEM;
- }
-
- pasid_table->table = page_address(pages);
- pasid_table->order = order;
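-	/*
-	 * Round max_pasid up to what the allocated directory pages can
-	 * actually address: (PAGE_SIZE << order) / 8 entries, each
-	 * covering 64 PASIDs, i.e. 1 << (order + PAGE_SHIFT + 3).
-	 */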
- pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3);
-
-attach_out:
- device_attach_pasid_table(info, pasid_table);
-
- return 0;
-}
-
-void intel_pasid_free_table(struct device *dev)
-{
- struct device_domain_info *info;
- struct pasid_table *pasid_table;
- struct pasid_dir_entry *dir;
- struct pasid_entry *table;
- int i, max_pde;
-
- info = get_domain_info(dev);
- if (!info || !dev_is_pci(dev) || !info->pasid_table)
- return;
-
- pasid_table = info->pasid_table;
- device_detach_pasid_table(info, pasid_table);
-
- if (!list_empty(&pasid_table->dev))
- return;
-
- /* Free scalable mode PASID directory tables: */
- dir = pasid_table->table;
- max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT;
- for (i = 0; i < max_pde; i++) {
- table = get_pasid_table_from_pde(&dir[i]);
- free_pgtable_page(table);
- }
-
- free_pages((unsigned long)pasid_table->table, pasid_table->order);
- kfree(pasid_table);
-}
-
-struct pasid_table *intel_pasid_get_table(struct device *dev)
-{
- struct device_domain_info *info;
-
- info = get_domain_info(dev);
- if (!info)
- return NULL;
-
- return info->pasid_table;
-}
-
-int intel_pasid_get_dev_max_id(struct device *dev)
-{
- struct device_domain_info *info;
-
- info = get_domain_info(dev);
- if (!info || !info->pasid_table)
- return 0;
-
- return info->pasid_table->max_pasid;
-}
-
-struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid)
-{
- struct device_domain_info *info;
- struct pasid_table *pasid_table;
- struct pasid_dir_entry *dir;
- struct pasid_entry *entries;
- int dir_index, index;
-
- pasid_table = intel_pasid_get_table(dev);
- if (WARN_ON(!pasid_table || pasid < 0 ||
- pasid >= intel_pasid_get_dev_max_id(dev)))
- return NULL;
-
- dir = pasid_table->table;
- info = get_domain_info(dev);
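-	/*
-	 * Two-level lookup: the upper PASID bits index the directory and
-	 * the low PASID_PDE_SHIFT bits (PASID_PTE_MASK) index the
-	 * 64-entry leaf table referenced by that directory entry.
-	 */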
- dir_index = pasid >> PASID_PDE_SHIFT;
- index = pasid & PASID_PTE_MASK;
-
- spin_lock(&pasid_lock);
- entries = get_pasid_table_from_pde(&dir[dir_index]);
- if (!entries) {
- entries = alloc_pgtable_page(info->iommu->node);
- if (!entries) {
- spin_unlock(&pasid_lock);
- return NULL;
- }
-
- WRITE_ONCE(dir[dir_index].val,
- (u64)virt_to_phys(entries) | PASID_PTE_PRESENT);
- }
- spin_unlock(&pasid_lock);
-
- return &entries[index];
-}
-
-/*
- * Interfaces for PASID table entry manipulation:
- */
-static inline void pasid_clear_entry(struct pasid_entry *pe)
-{
- WRITE_ONCE(pe->val[0], 0);
- WRITE_ONCE(pe->val[1], 0);
- WRITE_ONCE(pe->val[2], 0);
- WRITE_ONCE(pe->val[3], 0);
- WRITE_ONCE(pe->val[4], 0);
- WRITE_ONCE(pe->val[5], 0);
- WRITE_ONCE(pe->val[6], 0);
- WRITE_ONCE(pe->val[7], 0);
-}
-
-static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe)
-{
- WRITE_ONCE(pe->val[0], PASID_PTE_FPD);
- WRITE_ONCE(pe->val[1], 0);
- WRITE_ONCE(pe->val[2], 0);
- WRITE_ONCE(pe->val[3], 0);
- WRITE_ONCE(pe->val[4], 0);
- WRITE_ONCE(pe->val[5], 0);
- WRITE_ONCE(pe->val[6], 0);
- WRITE_ONCE(pe->val[7], 0);
-}
-
-static void
-intel_pasid_clear_entry(struct device *dev, int pasid, bool fault_ignore)
-{
- struct pasid_entry *pe;
-
- pe = intel_pasid_get_entry(dev, pasid);
- if (WARN_ON(!pe))
- return;
-
- if (fault_ignore && pasid_pte_is_present(pe))
- pasid_clear_entry_with_fpd(pe);
- else
- pasid_clear_entry(pe);
-}
-
-static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
-{
- u64 old;
-
- old = READ_ONCE(*ptr);
- WRITE_ONCE(*ptr, (old & ~mask) | bits);
-}
-
-/*
- * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode
- * PASID entry.
- */
-static inline void
-pasid_set_domain_id(struct pasid_entry *pe, u64 value)
-{
- pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value);
-}
-
-/*
- * Get domain ID value of a scalable mode PASID entry.
- */
-static inline u16
-pasid_get_domain_id(struct pasid_entry *pe)
-{
- return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0));
-}
-
-/*
- * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63)
- * of a scalable mode PASID entry.
- */
-static inline void
-pasid_set_slptr(struct pasid_entry *pe, u64 value)
-{
- pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value);
-}
-
-/*
- * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID
- * entry.
- */
-static inline void
-pasid_set_address_width(struct pasid_entry *pe, u64 value)
-{
- pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2);
-}
-
-/*
- * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8)
- * of a scalable mode PASID entry.
- */
-static inline void
-pasid_set_translation_type(struct pasid_entry *pe, u64 value)
-{
- pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6);
-}
-
-/*
- * Enable fault processing by clearing the FPD(Fault Processing
- * Disable) field (Bit 1) of a scalable mode PASID entry.
- */
-static inline void pasid_set_fault_enable(struct pasid_entry *pe)
-{
- pasid_set_bits(&pe->val[0], 1 << 1, 0);
-}
-
-/*
- * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a
- * scalable mode PASID entry.
- */
-static inline void pasid_set_sre(struct pasid_entry *pe)
-{
- pasid_set_bits(&pe->val[2], 1 << 0, 1);
-}
-
-/*
- * Setup the P(Present) field (Bit 0) of a scalable mode PASID
- * entry.
- */
-static inline void pasid_set_present(struct pasid_entry *pe)
-{
- pasid_set_bits(&pe->val[0], 1 << 0, 1);
-}
-
-/*
- * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID
- * entry.
- */
-static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value)
-{
- pasid_set_bits(&pe->val[1], 1 << 23, value << 23);
-}
-
-/*
- * Setup the First Level Page table Pointer field (Bit 140~191)
- * of a scalable mode PASID entry.
- */
-static inline void
-pasid_set_flptr(struct pasid_entry *pe, u64 value)
-{
- pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value);
-}
-
-/*
- * Setup the First Level Paging Mode field (Bit 130~131) of a
- * scalable mode PASID entry.
- */
-static inline void
-pasid_set_flpm(struct pasid_entry *pe, u64 value)
-{
- pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
-}
-
-/*
- * Setup the Extended Access Flag Enable (EAFE) field (Bit 135)
- * of a scalable mode PASID entry.
- */
-static inline void
-pasid_set_eafe(struct pasid_entry *pe)
-{
- pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
-}
-
-static void
-pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
- u16 did, int pasid)
-{
- struct qi_desc desc;
-
- desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) |
- QI_PC_PASID(pasid) | QI_PC_TYPE;
- desc.qw1 = 0;
- desc.qw2 = 0;
- desc.qw3 = 0;
-
- qi_submit_sync(iommu, &desc, 1, 0);
-}
-
-static void
-iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid)
-{
- struct qi_desc desc;
-
- desc.qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
- QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE;
- desc.qw1 = 0;
- desc.qw2 = 0;
- desc.qw3 = 0;
-
- qi_submit_sync(iommu, &desc, 1, 0);
-}
-
-static void
-devtlb_invalidation_with_pasid(struct intel_iommu *iommu,
- struct device *dev, int pasid)
-{
- struct device_domain_info *info;
- u16 sid, qdep, pfsid;
-
- info = get_domain_info(dev);
- if (!info || !info->ats_enabled)
- return;
-
- sid = info->bus << 8 | info->devfn;
- qdep = info->ats_qdep;
- pfsid = info->pfsid;
-
- qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT);
-}
-
-void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
- int pasid, bool fault_ignore)
-{
- struct pasid_entry *pte;
- u16 did;
-
- pte = intel_pasid_get_entry(dev, pasid);
- if (WARN_ON(!pte))
- return;
-
- did = pasid_get_domain_id(pte);
- intel_pasid_clear_entry(dev, pasid, fault_ignore);
-
- if (!ecap_coherent(iommu->ecap))
- clflush_cache_range(pte, sizeof(*pte));
-
- pasid_cache_invalidation_with_pasid(iommu, did, pasid);
- iotlb_invalidation_with_pasid(iommu, did, pasid);
-
- /* Device IOTLB doesn't need to be flushed in caching mode. */
- if (!cap_caching_mode(iommu->cap))
- devtlb_invalidation_with_pasid(iommu, dev, pasid);
-}
-
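-/*
- * Flush the PASID cache and IOTLB after a PASID entry has been set up.
- * With caching mode set (typically a virtual IOMMU), not-present to
- * present transitions must also be invalidated explicitly; otherwise a
- * write buffer flush is sufficient.
- */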
-static void pasid_flush_caches(struct intel_iommu *iommu,
- struct pasid_entry *pte,
- int pasid, u16 did)
-{
- if (!ecap_coherent(iommu->ecap))
- clflush_cache_range(pte, sizeof(*pte));
-
- if (cap_caching_mode(iommu->cap)) {
- pasid_cache_invalidation_with_pasid(iommu, did, pasid);
- iotlb_invalidation_with_pasid(iommu, did, pasid);
- } else {
- iommu_flush_write_buffer(iommu);
- }
-}
-
-/*
- * Set up the scalable mode pasid table entry for first-level-only
- * translation type.
- */
-int intel_pasid_setup_first_level(struct intel_iommu *iommu,
- struct device *dev, pgd_t *pgd,
- int pasid, u16 did, int flags)
-{
- struct pasid_entry *pte;
-
- if (!ecap_flts(iommu->ecap)) {
- pr_err("No first level translation support on %s\n",
- iommu->name);
- return -EINVAL;
- }
-
- pte = intel_pasid_get_entry(dev, pasid);
- if (WARN_ON(!pte))
- return -EINVAL;
-
- pasid_clear_entry(pte);
-
- /* Setup the first level page table pointer: */
- pasid_set_flptr(pte, (u64)__pa(pgd));
- if (flags & PASID_FLAG_SUPERVISOR_MODE) {
- if (!ecap_srs(iommu->ecap)) {
- pr_err("No supervisor request support on %s\n",
- iommu->name);
- return -EINVAL;
- }
- pasid_set_sre(pte);
- }
-
- if (flags & PASID_FLAG_FL5LP) {
- if (cap_5lp_support(iommu->cap)) {
- pasid_set_flpm(pte, 1);
- } else {
- pr_err("No 5-level paging support for first-level\n");
- pasid_clear_entry(pte);
- return -EINVAL;
- }
- }
-
- pasid_set_domain_id(pte, did);
- pasid_set_address_width(pte, iommu->agaw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
-
- /* Setup Present and PASID Granular Transfer Type: */
- pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
- pasid_set_present(pte);
- pasid_flush_caches(iommu, pte, pasid, did);
-
- return 0;
-}
-
-/*
- * Skip the top levels of the page tables for an IOMMU whose agaw is
- * smaller than the domain's default. Unnecessary for PT mode.
- */
-static inline int iommu_skip_agaw(struct dmar_domain *domain,
- struct intel_iommu *iommu,
- struct dma_pte **pgd)
-{
- int agaw;
-
- for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
- *pgd = phys_to_virt(dma_pte_addr(*pgd));
- if (!dma_pte_present(*pgd))
- return -EINVAL;
- }
-
- return agaw;
-}
-
-/*
- * Set up the scalable mode pasid entry for second-level-only translation type.
- */
-int intel_pasid_setup_second_level(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- struct device *dev, int pasid)
-{
- struct pasid_entry *pte;
- struct dma_pte *pgd;
- u64 pgd_val;
- int agaw;
- u16 did;
-
- /*
- * If hardware advertises no support for second level
- * translation, return directly.
- */
- if (!ecap_slts(iommu->ecap)) {
- pr_err("No second level translation support on %s\n",
- iommu->name);
- return -EINVAL;
- }
-
- pgd = domain->pgd;
- agaw = iommu_skip_agaw(domain, iommu, &pgd);
- if (agaw < 0) {
- dev_err(dev, "Invalid domain page table\n");
- return -EINVAL;
- }
-
- pgd_val = virt_to_phys(pgd);
- did = domain->iommu_did[iommu->seq_id];
-
- pte = intel_pasid_get_entry(dev, pasid);
- if (!pte) {
- dev_err(dev, "Failed to get pasid entry of PASID %d\n", pasid);
- return -ENODEV;
- }
-
- pasid_clear_entry(pte);
- pasid_set_domain_id(pte, did);
- pasid_set_slptr(pte, pgd_val);
- pasid_set_address_width(pte, agaw);
- pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
- pasid_set_fault_enable(pte);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
-
- /*
- * Since it is a second level only translation setup, we should
- * set SRE bit as well (addresses are expected to be GPAs).
- */
- pasid_set_sre(pte);
- pasid_set_present(pte);
- pasid_flush_caches(iommu, pte, pasid, did);
-
- return 0;
-}
-
-/*
- * Set up the scalable mode pasid entry for passthrough translation type.
- */
-int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- struct device *dev, int pasid)
-{
- u16 did = FLPT_DEFAULT_DID;
- struct pasid_entry *pte;
-
- pte = intel_pasid_get_entry(dev, pasid);
- if (!pte) {
- dev_err(dev, "Failed to get pasid entry of PASID %d\n", pasid);
- return -ENODEV;
- }
-
- pasid_clear_entry(pte);
- pasid_set_domain_id(pte, did);
- pasid_set_address_width(pte, iommu->agaw);
- pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT);
- pasid_set_fault_enable(pte);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
-
- /*
- * We should set SRE bit as well since the addresses are expected
- * to be GPAs.
- */
- pasid_set_sre(pte);
- pasid_set_present(pte);
- pasid_flush_caches(iommu, pte, pasid, did);
-
- return 0;
-}
-
-static int
-intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte,
- struct iommu_gpasid_bind_data_vtd *pasid_data)
-{
- /*
-	 * Not all guest PASID table entry fields are passed down during bind;
-	 * here we only set up the ones that depend on guest settings.
- * Execution related bits such as NXE, SMEP are not supported.
- * Other fields, such as snoop related, are set based on host needs
- * regardless of guest settings.
- */
- if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_SRE) {
- if (!ecap_srs(iommu->ecap)) {
- pr_err_ratelimited("No supervisor request support on %s\n",
- iommu->name);
- return -EINVAL;
- }
- pasid_set_sre(pte);
- }
-
- if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) {
- if (!ecap_eafs(iommu->ecap)) {
- pr_err_ratelimited("No extended access flag support on %s\n",
- iommu->name);
- return -EINVAL;
- }
- pasid_set_eafe(pte);
- }
-
- /*
- * Memory type is only applicable to devices inside processor coherent
- * domain. Will add MTS support once coherent devices are available.
- */
- if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_MTS_MASK) {
- pr_warn_ratelimited("No memory type support %s\n",
- iommu->name);
- return -EINVAL;
- }
-
- return 0;
-}
-
-/**
- * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
- * This can be used for guest shared virtual address. In this case, the
- * first-level page tables are used for GVA-GPA translation in the guest,
- * while the second-level page tables are used for GPA-HPA translation.
- *
- * @iommu: IOMMU which the device belongs to
- * @dev: Device to be set up for translation
- * @gpgd: FLPTPTR: First Level Page translation pointer in GPA
- * @pasid: PASID to be programmed in the device PASID table
- * @pasid_data: Additional PASID info from the guest bind request
- * @domain: Domain info for setting up second level page tables
- * @addr_width: Address width of the first level (guest)
- */
-int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
- pgd_t *gpgd, int pasid,
- struct iommu_gpasid_bind_data_vtd *pasid_data,
- struct dmar_domain *domain, int addr_width)
-{
- struct pasid_entry *pte;
- struct dma_pte *pgd;
- int ret = 0;
- u64 pgd_val;
- int agaw;
- u16 did;
-
- if (!ecap_nest(iommu->ecap)) {
- pr_err_ratelimited("IOMMU: %s: No nested translation support\n",
- iommu->name);
- return -EINVAL;
- }
-
- if (!(domain->flags & DOMAIN_FLAG_NESTING_MODE)) {
- pr_err_ratelimited("Domain is not in nesting mode, %x\n",
- domain->flags);
- return -EINVAL;
- }
-
- pte = intel_pasid_get_entry(dev, pasid);
- if (WARN_ON(!pte))
- return -EINVAL;
-
- /*
- * Caller must ensure PASID entry is not in use, i.e. not bind the
- * same PASID to the same device twice.
- */
- if (pasid_pte_is_present(pte))
- return -EBUSY;
-
- pasid_clear_entry(pte);
-
-	/* Sanity checking is performed by the caller to make sure the
-	 * address width matches in two dimensions:
- * 1. CPU vs. IOMMU
- * 2. Guest vs. Host.
- */
- switch (addr_width) {
-#ifdef CONFIG_X86
- case ADDR_WIDTH_5LEVEL:
- if (!cpu_feature_enabled(X86_FEATURE_LA57) ||
- !cap_5lp_support(iommu->cap)) {
- dev_err_ratelimited(dev,
- "5-level paging not supported\n");
- return -EINVAL;
- }
-
- pasid_set_flpm(pte, 1);
- break;
-#endif
- case ADDR_WIDTH_4LEVEL:
- pasid_set_flpm(pte, 0);
- break;
- default:
- dev_err_ratelimited(dev, "Invalid guest address width %d\n",
- addr_width);
- return -EINVAL;
- }
-
- /* First level PGD is in GPA, must be supported by the second level */
- if ((uintptr_t)gpgd > domain->max_addr) {
- dev_err_ratelimited(dev,
- "Guest PGD %lx not supported, max %llx\n",
- (uintptr_t)gpgd, domain->max_addr);
- return -EINVAL;
- }
- pasid_set_flptr(pte, (uintptr_t)gpgd);
-
- ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data);
- if (ret)
- return ret;
-
- /* Setup the second level based on the given domain */
- pgd = domain->pgd;
-
- agaw = iommu_skip_agaw(domain, iommu, &pgd);
- if (agaw < 0) {
- dev_err_ratelimited(dev, "Invalid domain page table\n");
- return -EINVAL;
- }
- pgd_val = virt_to_phys(pgd);
- pasid_set_slptr(pte, pgd_val);
- pasid_set_fault_enable(pte);
-
- did = domain->iommu_did[iommu->seq_id];
- pasid_set_domain_id(pte, did);
-
- pasid_set_address_width(pte, agaw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
-
- pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
- pasid_set_present(pte);
- pasid_flush_caches(iommu, pte, pasid, did);
-
- return ret;
-}
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * intel-pasid.h - PASID idr, table and entry header
- *
- * Copyright (C) 2018 Intel Corporation
- *
- * Author: Lu Baolu <baolu.lu@linux.intel.com>
- */
-
-#ifndef __INTEL_PASID_H
-#define __INTEL_PASID_H
-
-#define PASID_RID2PASID 0x0
-#define PASID_MIN 0x1
-#define PASID_MAX 0x100000
-#define PASID_PTE_MASK 0x3F
-#define PASID_PTE_PRESENT 1
-#define PASID_PTE_FPD 2
-#define PDE_PFN_MASK PAGE_MASK
-#define PASID_PDE_SHIFT 6
-#define MAX_NR_PASID_BITS 20
-#define PASID_TBL_ENTRIES BIT(PASID_PDE_SHIFT)
-
-#define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1)
-#define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7))
-
-/* Virtual command interface for enlightened pasid management. */
-#define VCMD_CMD_ALLOC 0x1
-#define VCMD_CMD_FREE 0x2
-#define VCMD_VRSP_IP 0x1
-#define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3)
-#define VCMD_VRSP_SC_SUCCESS 0
-#define VCMD_VRSP_SC_NO_PASID_AVAIL 1
-#define VCMD_VRSP_SC_INVALID_PASID 1
-#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff)
-#define VCMD_CMD_OPERAND(e) ((e) << 8)
-/*
- * Domain ID reserved for pasid entries programmed for first-level
- * only and pass-through translation types.
- */
-#define FLPT_DEFAULT_DID 1
-
-/*
- * The SUPERVISOR_MODE flag indicates a first level translation which
- * can be used for access to kernel addresses. It is valid only for
- * access to the kernel's static 1:1 mapping of physical memory — not
- * to vmalloc or even module mappings.
- */
-#define PASID_FLAG_SUPERVISOR_MODE BIT(0)
-#define PASID_FLAG_NESTED BIT(1)
-
-/*
- * The PASID_FLAG_FL5LP flag indicates that 5-level paging is used for
- * first-level translation; otherwise, 4-level paging is used.
- */
-#define PASID_FLAG_FL5LP BIT(1)
-
-struct pasid_dir_entry {
- u64 val;
-};
-
-struct pasid_entry {
- u64 val[8];
-};
-
-#define PASID_ENTRY_PGTT_FL_ONLY (1)
-#define PASID_ENTRY_PGTT_SL_ONLY (2)
-#define PASID_ENTRY_PGTT_NESTED (3)
-#define PASID_ENTRY_PGTT_PT (4)
-
-/* The representative of a PASID table */
-struct pasid_table {
- void *table; /* pasid table pointer */
- int order; /* page order of pasid table */
- int max_pasid; /* max pasid */
- struct list_head dev; /* device list */
-};
-
-/* Get PRESENT bit of a PASID directory entry. */
-static inline bool pasid_pde_is_present(struct pasid_dir_entry *pde)
-{
- return READ_ONCE(pde->val) & PASID_PTE_PRESENT;
-}
-
-/* Get PASID table from a PASID directory entry. */
-static inline struct pasid_entry *
-get_pasid_table_from_pde(struct pasid_dir_entry *pde)
-{
- if (!pasid_pde_is_present(pde))
- return NULL;
-
- return phys_to_virt(READ_ONCE(pde->val) & PDE_PFN_MASK);
-}
-
-/* Get PRESENT bit of a PASID table entry. */
-static inline bool pasid_pte_is_present(struct pasid_entry *pte)
-{
- return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT;
-}
-
-extern u32 intel_pasid_max_id;
-int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp);
-void intel_pasid_free_id(int pasid);
-void *intel_pasid_lookup_id(int pasid);
-int intel_pasid_alloc_table(struct device *dev);
-void intel_pasid_free_table(struct device *dev);
-struct pasid_table *intel_pasid_get_table(struct device *dev);
-int intel_pasid_get_dev_max_id(struct device *dev);
-struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid);
-int intel_pasid_setup_first_level(struct intel_iommu *iommu,
- struct device *dev, pgd_t *pgd,
- int pasid, u16 did, int flags);
-int intel_pasid_setup_second_level(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- struct device *dev, int pasid);
-int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- struct device *dev, int pasid);
-int intel_pasid_setup_nested(struct intel_iommu *iommu,
- struct device *dev, pgd_t *pgd, int pasid,
- struct iommu_gpasid_bind_data_vtd *pasid_data,
- struct dmar_domain *domain, int addr_width);
-void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
- struct device *dev, int pasid,
- bool fault_ignore);
-int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid);
-void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid);
-#endif /* __INTEL_PASID_H */
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright © 2015 Intel Corporation.
- *
- * Authors: David Woodhouse <dwmw2@infradead.org>
- */
-
-#include <linux/intel-iommu.h>
-#include <linux/mmu_notifier.h>
-#include <linux/sched.h>
-#include <linux/sched/mm.h>
-#include <linux/slab.h>
-#include <linux/intel-svm.h>
-#include <linux/rculist.h>
-#include <linux/pci.h>
-#include <linux/pci-ats.h>
-#include <linux/dmar.h>
-#include <linux/interrupt.h>
-#include <linux/mm_types.h>
-#include <linux/ioasid.h>
-#include <asm/page.h>
-
-#include "intel-pasid.h"
-
-static irqreturn_t prq_event_thread(int irq, void *d);
-static void intel_svm_drain_prq(struct device *dev, int pasid);
-
-#define PRQ_ORDER 0
-
-int intel_svm_enable_prq(struct intel_iommu *iommu)
-{
- struct page *pages;
- int irq, ret;
-
- pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
- if (!pages) {
- pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
- iommu->name);
- return -ENOMEM;
- }
- iommu->prq = page_address(pages);
-
- irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
- if (irq <= 0) {
- pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
- iommu->name);
- ret = -EINVAL;
- err:
- free_pages((unsigned long)iommu->prq, PRQ_ORDER);
- iommu->prq = NULL;
- return ret;
- }
- iommu->pr_irq = irq;
-
- snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
-
- ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
- iommu->prq_name, iommu);
- if (ret) {
- pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
- iommu->name);
- dmar_free_hwirq(irq);
- iommu->pr_irq = 0;
- goto err;
- }
- dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
- dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
- dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
-
- init_completion(&iommu->prq_complete);
-
- return 0;
-}
-
-int intel_svm_finish_prq(struct intel_iommu *iommu)
-{
- dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
- dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
- dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
-
- if (iommu->pr_irq) {
- free_irq(iommu->pr_irq, iommu);
- dmar_free_hwirq(iommu->pr_irq);
- iommu->pr_irq = 0;
- }
-
- free_pages((unsigned long)iommu->prq, PRQ_ORDER);
- iommu->prq = NULL;
-
- return 0;
-}
-
-static inline bool intel_svm_capable(struct intel_iommu *iommu)
-{
- return iommu->flags & VTD_FLAG_SVM_CAPABLE;
-}
-
-void intel_svm_check(struct intel_iommu *iommu)
-{
- if (!pasid_supported(iommu))
- return;
-
- if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
- !cap_fl1gp_support(iommu->cap)) {
- pr_err("%s SVM disabled, incompatible 1GB page capability\n",
- iommu->name);
- return;
- }
-
- if (cpu_feature_enabled(X86_FEATURE_LA57) &&
- !cap_5lp_support(iommu->cap)) {
- pr_err("%s SVM disabled, incompatible paging mode\n",
- iommu->name);
- return;
- }
-
- iommu->flags |= VTD_FLAG_SVM_CAPABLE;
-}
-
-static void intel_flush_svm_range_dev(struct intel_svm *svm, struct intel_svm_dev *sdev,
- unsigned long address, unsigned long pages, int ih)
-{
- struct qi_desc desc;
-
- if (pages == -1) {
- desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
- QI_EIOTLB_DID(sdev->did) |
- QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
- QI_EIOTLB_TYPE;
- desc.qw1 = 0;
- } else {
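-		/*
-		 * The address mask (AM) encodes the invalidation range as
-		 * a power-of-two number of pages.
-		 */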
- int mask = ilog2(__roundup_pow_of_two(pages));
-
- desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
- QI_EIOTLB_DID(sdev->did) |
- QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
- QI_EIOTLB_TYPE;
- desc.qw1 = QI_EIOTLB_ADDR(address) |
- QI_EIOTLB_IH(ih) |
- QI_EIOTLB_AM(mask);
- }
- desc.qw2 = 0;
- desc.qw3 = 0;
- qi_submit_sync(svm->iommu, &desc, 1, 0);
-
- if (sdev->dev_iotlb) {
- desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
- QI_DEV_EIOTLB_SID(sdev->sid) |
- QI_DEV_EIOTLB_QDEP(sdev->qdep) |
- QI_DEIOTLB_TYPE;
- if (pages == -1) {
- desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
- QI_DEV_EIOTLB_SIZE;
- } else if (pages > 1) {
- /* The least significant zero bit indicates the size. So,
- * for example, an "address" value of 0x12345f000 will
- * flush from 0x123440000 to 0x12347ffff (256KiB). */
- unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
- unsigned long mask = __rounddown_pow_of_two(address ^ last);
-
- desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
- (mask - 1)) | QI_DEV_EIOTLB_SIZE;
- } else {
- desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
- }
- desc.qw2 = 0;
- desc.qw3 = 0;
- qi_submit_sync(svm->iommu, &desc, 1, 0);
- }
-}
-
-static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
- unsigned long pages, int ih)
-{
- struct intel_svm_dev *sdev;
-
- rcu_read_lock();
- list_for_each_entry_rcu(sdev, &svm->devs, list)
- intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
- rcu_read_unlock();
-}
-
-/* Pages have been freed at this point */
-static void intel_invalidate_range(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
-
- intel_flush_svm_range(svm, start,
- (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
-}
-
-static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
-{
- struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
- struct intel_svm_dev *sdev;
-
- /* This might end up being called from exit_mmap(), *before* the page
- * tables are cleared. And __mmu_notifier_release() will delete us from
- * the list of notifiers so that our invalidate_range() callback doesn't
- * get called when the page tables are cleared. So we need to protect
- * against hardware accessing those page tables.
- *
- * We do it by clearing the entry in the PASID table and then flushing
- * the IOTLB and the PASID table caches. This might upset hardware;
- * perhaps we'll want to point the PASID to a dummy PGD (like the zero
- * page) so that we end up taking a fault that the hardware really
- * *has* to handle gracefully without affecting other processes.
- */
- rcu_read_lock();
- list_for_each_entry_rcu(sdev, &svm->devs, list)
- intel_pasid_tear_down_entry(svm->iommu, sdev->dev,
- svm->pasid, true);
- rcu_read_unlock();
-
-}
-
-static const struct mmu_notifier_ops intel_mmuops = {
- .release = intel_mm_release,
- .invalidate_range = intel_invalidate_range,
-};
-
-static DEFINE_MUTEX(pasid_mutex);
-static LIST_HEAD(global_svm_list);
-
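-/*
- * Iterate over svm->devs but execute the loop body only for the entry
- * whose ->dev matches @d. The "{} else" construct keeps the macro safe
- * to use with a following statement or block.
- */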
-#define for_each_svm_dev(sdev, svm, d) \
- list_for_each_entry((sdev), &(svm)->devs, list) \
- if ((d) != (sdev)->dev) {} else
-
-int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
- struct iommu_gpasid_bind_data *data)
-{
- struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
- struct dmar_domain *dmar_domain;
- struct intel_svm_dev *sdev;
- struct intel_svm *svm;
- int ret = 0;
-
- if (WARN_ON(!iommu) || !data)
- return -EINVAL;
-
- if (data->version != IOMMU_GPASID_BIND_VERSION_1 ||
- data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
- return -EINVAL;
-
- if (!dev_is_pci(dev))
- return -ENOTSUPP;
-
- /* VT-d supports devices with full 20 bit PASIDs only */
- if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
- return -EINVAL;
-
- /*
-	 * We only check the host PASID range; we have no way to check
-	 * the guest PASID range.
- */
- if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
- return -EINVAL;
-
- dmar_domain = to_dmar_domain(domain);
-
- mutex_lock(&pasid_mutex);
- svm = ioasid_find(NULL, data->hpasid, NULL);
- if (IS_ERR(svm)) {
- ret = PTR_ERR(svm);
- goto out;
- }
-
- if (svm) {
- /*
-		 * If we found an svm for the PASID, there must be at least
-		 * one device bound to it; otherwise the svm should have
-		 * been freed.
- */
- if (WARN_ON(list_empty(&svm->devs))) {
- ret = -EINVAL;
- goto out;
- }
-
- for_each_svm_dev(sdev, svm, dev) {
- /*
- * For devices with aux domains, we should allow
- * multiple bind calls with the same PASID and pdev.
- */
- if (iommu_dev_feature_enabled(dev,
- IOMMU_DEV_FEAT_AUX)) {
- sdev->users++;
- } else {
- dev_warn_ratelimited(dev,
- "Already bound with PASID %u\n",
- svm->pasid);
- ret = -EBUSY;
- }
- goto out;
- }
- } else {
-		/* We come here when the PASID has never been bound to a device. */
- svm = kzalloc(sizeof(*svm), GFP_KERNEL);
- if (!svm) {
- ret = -ENOMEM;
- goto out;
- }
-		/* REVISIT: the upper layer (VFIO) can track the host process
-		 * that binds the PASID. ioasid_set = mm might be sufficient
-		 * for VFIO to check PASID/VMM ownership. We can drop the
-		 * following line once the VFIO and IOASID set checks are in
-		 * place.
- */
- svm->mm = get_task_mm(current);
- svm->pasid = data->hpasid;
- if (data->flags & IOMMU_SVA_GPASID_VAL) {
- svm->gpasid = data->gpasid;
- svm->flags |= SVM_FLAG_GUEST_PASID;
- }
- ioasid_set_data(data->hpasid, svm);
- INIT_LIST_HEAD_RCU(&svm->devs);
- mmput(svm->mm);
- }
- sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
- if (!sdev) {
- ret = -ENOMEM;
- goto out;
- }
- sdev->dev = dev;
-
- /* Only count users if device has aux domains */
- if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
- sdev->users = 1;
-
- /* Set up device context entry for PASID if not enabled already */
- ret = intel_iommu_enable_pasid(iommu, sdev->dev);
- if (ret) {
- dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
- kfree(sdev);
- goto out;
- }
-
- /*
- * PASID table is per device for better security. Therefore, for
- * each bind of a new device even with an existing PASID, we need to
- * call the nested mode setup function here.
- */
- spin_lock(&iommu->lock);
- ret = intel_pasid_setup_nested(iommu, dev,
- (pgd_t *)(uintptr_t)data->gpgd,
- data->hpasid, &data->vtd, dmar_domain,
- data->addr_width);
- spin_unlock(&iommu->lock);
- if (ret) {
- dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
- data->hpasid, ret);
- /*
-		 * The PASID entry is left in a cleared state if nested mode
-		 * setup failed, so we only need to clear the IOASID tracking
-		 * data so that a later free call succeeds.
- */
- kfree(sdev);
- goto out;
- }
-
- svm->flags |= SVM_FLAG_GUEST_MODE;
-
- init_rcu_head(&sdev->rcu);
- list_add_rcu(&sdev->list, &svm->devs);
- out:
- if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
- ioasid_set_data(data->hpasid, NULL);
- kfree(svm);
- }
-
- mutex_unlock(&pasid_mutex);
- return ret;
-}
-
-int intel_svm_unbind_gpasid(struct device *dev, int pasid)
-{
- struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
- struct intel_svm_dev *sdev;
- struct intel_svm *svm;
- int ret = -EINVAL;
-
- if (WARN_ON(!iommu))
- return -EINVAL;
-
- mutex_lock(&pasid_mutex);
- svm = ioasid_find(NULL, pasid, NULL);
- if (!svm) {
- ret = -EINVAL;
- goto out;
- }
-
- if (IS_ERR(svm)) {
- ret = PTR_ERR(svm);
- goto out;
- }
-
- for_each_svm_dev(sdev, svm, dev) {
- ret = 0;
- if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
- sdev->users--;
- if (!sdev->users) {
- list_del_rcu(&sdev->list);
- intel_pasid_tear_down_entry(iommu, dev,
- svm->pasid, false);
- intel_svm_drain_prq(dev, svm->pasid);
- kfree_rcu(sdev, rcu);
-
- if (list_empty(&svm->devs)) {
- /*
-				 * We do not free the IOASID here because the
-				 * IOMMU driver did not allocate it. Unlike
-				 * native SVM, an IOASID for guest use is
-				 * allocated prior to the bind call. In any
-				 * case, if the free call comes before the
-				 * unbind, the IOMMU driver will be notified
-				 * and perform the cleanup.
- */
- ioasid_set_data(pasid, NULL);
- kfree(svm);
- }
- }
- break;
- }
-out:
- mutex_unlock(&pasid_mutex);
- return ret;
-}
-
-/* Caller must hold pasid_mutex, mm reference */
-static int
-intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops,
- struct mm_struct *mm, struct intel_svm_dev **sd)
-{
- struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
- struct device_domain_info *info;
- struct intel_svm_dev *sdev;
- struct intel_svm *svm = NULL;
- int pasid_max;
- int ret;
-
- if (!iommu || dmar_disabled)
- return -EINVAL;
-
- if (!intel_svm_capable(iommu))
- return -ENOTSUPP;
-
- if (dev_is_pci(dev)) {
- pasid_max = pci_max_pasids(to_pci_dev(dev));
- if (pasid_max < 0)
- return -EINVAL;
- } else
- pasid_max = 1 << 20;
-
-	/* Binding a supervisor PASID should have mm == NULL */
- if (flags & SVM_FLAG_SUPERVISOR_MODE) {
- if (!ecap_srs(iommu->ecap) || mm) {
- pr_err("Supervisor PASID with user provided mm.\n");
- return -EINVAL;
- }
- }
-
- if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
- struct intel_svm *t;
-
- list_for_each_entry(t, &global_svm_list, list) {
- if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
- continue;
-
- svm = t;
- if (svm->pasid >= pasid_max) {
- dev_warn(dev,
- "Limited PASID width. Cannot use existing PASID %d\n",
- svm->pasid);
- ret = -ENOSPC;
- goto out;
- }
-
- /* Find the matching device in svm list */
- for_each_svm_dev(sdev, svm, dev) {
- if (sdev->ops != ops) {
- ret = -EBUSY;
- goto out;
- }
- sdev->users++;
- goto success;
- }
-
- break;
- }
- }
-
- sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
- if (!sdev) {
- ret = -ENOMEM;
- goto out;
- }
- sdev->dev = dev;
-
- ret = intel_iommu_enable_pasid(iommu, dev);
- if (ret) {
- kfree(sdev);
- goto out;
- }
-
- info = get_domain_info(dev);
- sdev->did = FLPT_DEFAULT_DID;
- sdev->sid = PCI_DEVID(info->bus, info->devfn);
- if (info->ats_enabled) {
- sdev->dev_iotlb = 1;
- sdev->qdep = info->ats_qdep;
- if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
- sdev->qdep = 0;
- }
-
- /* Finish the setup now we know we're keeping it */
- sdev->users = 1;
- sdev->ops = ops;
- init_rcu_head(&sdev->rcu);
-
- if (!svm) {
- svm = kzalloc(sizeof(*svm), GFP_KERNEL);
- if (!svm) {
- ret = -ENOMEM;
- kfree(sdev);
- goto out;
- }
- svm->iommu = iommu;
-
- if (pasid_max > intel_pasid_max_id)
- pasid_max = intel_pasid_max_id;
-
- /* Do not use PASID 0, reserved for RID to PASID */
- svm->pasid = ioasid_alloc(NULL, PASID_MIN,
- pasid_max - 1, svm);
- if (svm->pasid == INVALID_IOASID) {
- kfree(svm);
- kfree(sdev);
- ret = -ENOSPC;
- goto out;
- }
- svm->notifier.ops = &intel_mmuops;
- svm->mm = mm;
- svm->flags = flags;
- INIT_LIST_HEAD_RCU(&svm->devs);
- INIT_LIST_HEAD(&svm->list);
- ret = -ENOMEM;
- if (mm) {
- ret = mmu_notifier_register(&svm->notifier, mm);
- if (ret) {
- ioasid_free(svm->pasid);
- kfree(svm);
- kfree(sdev);
- goto out;
- }
- }
-
- spin_lock(&iommu->lock);
- ret = intel_pasid_setup_first_level(iommu, dev,
- mm ? mm->pgd : init_mm.pgd,
- svm->pasid, FLPT_DEFAULT_DID,
- (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
- (cpu_feature_enabled(X86_FEATURE_LA57) ?
- PASID_FLAG_FL5LP : 0));
- spin_unlock(&iommu->lock);
- if (ret) {
- if (mm)
- mmu_notifier_unregister(&svm->notifier, mm);
- ioasid_free(svm->pasid);
- kfree(svm);
- kfree(sdev);
- goto out;
- }
-
- list_add_tail(&svm->list, &global_svm_list);
- } else {
- /*
- * Binding a new device with existing PASID, need to setup
- * the PASID entry.
- */
- spin_lock(&iommu->lock);
- ret = intel_pasid_setup_first_level(iommu, dev,
- mm ? mm->pgd : init_mm.pgd,
- svm->pasid, FLPT_DEFAULT_DID,
- (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
- (cpu_feature_enabled(X86_FEATURE_LA57) ?
- PASID_FLAG_FL5LP : 0));
- spin_unlock(&iommu->lock);
- if (ret) {
- kfree(sdev);
- goto out;
- }
- }
- list_add_rcu(&sdev->list, &svm->devs);
-success:
- sdev->pasid = svm->pasid;
- sdev->sva.dev = dev;
- if (sd)
- *sd = sdev;
- ret = 0;
- out:
- return ret;
-}
-
-/* Caller must hold pasid_mutex */
-static int intel_svm_unbind_mm(struct device *dev, int pasid)
-{
- struct intel_svm_dev *sdev;
- struct intel_iommu *iommu;
- struct intel_svm *svm;
- int ret = -EINVAL;
-
- iommu = intel_svm_device_to_iommu(dev);
- if (!iommu)
- goto out;
-
- svm = ioasid_find(NULL, pasid, NULL);
- if (!svm)
- goto out;
-
- if (IS_ERR(svm)) {
- ret = PTR_ERR(svm);
- goto out;
- }
-
- for_each_svm_dev(sdev, svm, dev) {
- ret = 0;
- sdev->users--;
- if (!sdev->users) {
- list_del_rcu(&sdev->list);
- /* Flush the PASID cache and IOTLB for this device.
- * Note that we do depend on the hardware *not* using
- * the PASID any more. Just as we depend on other
- * devices never using PASIDs that they have no right
- * to use. We have a *shared* PASID table, because it's
- * large and has to be physically contiguous. So it's
- * hard to be as defensive as we might like. */
- intel_pasid_tear_down_entry(iommu, dev,
- svm->pasid, false);
- intel_svm_drain_prq(dev, svm->pasid);
- kfree_rcu(sdev, rcu);
-
- if (list_empty(&svm->devs)) {
- ioasid_free(svm->pasid);
- if (svm->mm)
- mmu_notifier_unregister(&svm->notifier, svm->mm);
- list_del(&svm->list);
- /* We mandate that no page faults may be outstanding
- * for the PASID when intel_svm_unbind_mm() is called.
- * If that is not obeyed, subtle errors will happen.
- * Let's make them less subtle... */
- memset(svm, 0x6b, sizeof(*svm));
- kfree(svm);
- }
- }
- break;
- }
- out:
-
- return ret;
-}
-
-/* Page request queue descriptor */
-struct page_req_dsc {
- union {
- struct {
- u64 type:8;
- u64 pasid_present:1;
- u64 priv_data_present:1;
- u64 rsvd:6;
- u64 rid:16;
- u64 pasid:20;
- u64 exe_req:1;
- u64 pm_req:1;
- u64 rsvd2:10;
- };
- u64 qw_0;
- };
- union {
- struct {
- u64 rd_req:1;
- u64 wr_req:1;
- u64 lpig:1;
- u64 prg_index:9;
- u64 addr:52;
- };
- u64 qw_1;
- };
- u64 priv_data[2];
-};
-
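-/*
- * The page request queue is (4KiB << PRQ_ORDER) bytes and each
- * descriptor is 32 bytes; masking a head/tail register value with
- * PRQ_RING_MASK keeps the byte offset inside the ring and 32-byte
- * descriptor aligned.
- */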
-#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
-
-static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
-{
- unsigned long requested = 0;
-
- if (req->exe_req)
- requested |= VM_EXEC;
-
- if (req->rd_req)
- requested |= VM_READ;
-
- if (req->wr_req)
- requested |= VM_WRITE;
-
- return (requested & ~vma->vm_flags) != 0;
-}
-
-static bool is_canonical_address(u64 addr)
-{
- int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
- long saddr = (long) addr;
-
- return (((saddr << shift) >> shift) == saddr);
-}
-
-/**
- * intel_svm_drain_prq - Drain page requests and responses for a pasid
- * @dev: target device
- * @pasid: pasid for draining
- *
- * Drain all pending page requests and responses related to @pasid in both
- * software and hardware. This is supposed to be called after the device
- * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
- * and DevTLB have been invalidated.
- *
- * It waits until all pending page requests for @pasid in the page fault
- * queue are completed by the prq handling thread. Then follow the steps
- * described in VT-d spec CH7.10 to drain all page requests and page
- * responses pending in the hardware.
- */
-static void intel_svm_drain_prq(struct device *dev, int pasid)
-{
- struct device_domain_info *info;
- struct dmar_domain *domain;
- struct intel_iommu *iommu;
- struct qi_desc desc[3];
- struct pci_dev *pdev;
- int head, tail;
- u16 sid, did;
- int qdep;
-
- info = get_domain_info(dev);
- if (WARN_ON(!info || !dev_is_pci(dev)))
- return;
-
- if (!info->pri_enabled)
- return;
-
- iommu = info->iommu;
- domain = info->domain;
- pdev = to_pci_dev(dev);
- sid = PCI_DEVID(info->bus, info->devfn);
- did = domain->iommu_did[iommu->seq_id];
- qdep = pci_ats_queue_depth(pdev);
-
- /*
- * Check and wait until all pending page requests in the queue are
- * handled by the prq handling thread.
- */
-prq_retry:
- reinit_completion(&iommu->prq_complete);
- tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
- head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
- while (head != tail) {
- struct page_req_dsc *req;
-
- req = &iommu->prq[head / sizeof(*req)];
- if (!req->pasid_present || req->pasid != pasid) {
- head = (head + sizeof(*req)) & PRQ_RING_MASK;
- continue;
- }
-
- wait_for_completion(&iommu->prq_complete);
- goto prq_retry;
- }
-
- /*
- * Perform steps described in VT-d spec CH7.10 to drain page
- * requests and responses in hardware.
- */
- memset(desc, 0, sizeof(desc));
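-	/*
-	 * desc[0]: fenced invalidation wait, desc[1]: PASID-based IOTLB
-	 * invalidation, desc[2]: PASID-based device-TLB invalidation.
-	 */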
- desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
- QI_IWD_FENCE |
- QI_IWD_TYPE;
- desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
- QI_EIOTLB_DID(did) |
- QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
- QI_EIOTLB_TYPE;
- desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
- QI_DEV_EIOTLB_SID(sid) |
- QI_DEV_EIOTLB_QDEP(qdep) |
- QI_DEIOTLB_TYPE |
- QI_DEV_IOTLB_PFSID(info->pfsid);
-qi_retry:
- reinit_completion(&iommu->prq_complete);
- qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
- if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
- wait_for_completion(&iommu->prq_complete);
- goto qi_retry;
- }
-}
-
-static irqreturn_t prq_event_thread(int irq, void *d)
-{
- struct intel_iommu *iommu = d;
- struct intel_svm *svm = NULL;
- int head, tail, handled = 0;
-
- /* Clear PPR bit before reading head/tail registers, to
- * ensure that we get a new interrupt if needed. */
- writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
-
- tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
- head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
- while (head != tail) {
- struct intel_svm_dev *sdev;
- struct vm_area_struct *vma;
- struct page_req_dsc *req;
- struct qi_desc resp;
- int result;
- vm_fault_t ret;
- u64 address;
-
- handled = 1;
-
- req = &iommu->prq[head / sizeof(*req)];
-
- result = QI_RESP_FAILURE;
- address = (u64)req->addr << VTD_PAGE_SHIFT;
- if (!req->pasid_present) {
- pr_err("%s: Page request without PASID: %08llx %08llx\n",
- iommu->name, ((unsigned long long *)req)[0],
- ((unsigned long long *)req)[1]);
- goto no_pasid;
- }
-
- if (!svm || svm->pasid != req->pasid) {
- rcu_read_lock();
- svm = ioasid_find(NULL, req->pasid, NULL);
- /* It *can't* go away, because the driver is not permitted
- * to unbind the mm while any page faults are outstanding.
- * So we only need RCU to protect the internal idr code. */
- rcu_read_unlock();
- if (IS_ERR_OR_NULL(svm)) {
- pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
- iommu->name, req->pasid, ((unsigned long long *)req)[0],
- ((unsigned long long *)req)[1]);
- goto no_pasid;
- }
- }
-
- result = QI_RESP_INVALID;
- /* Since we're using init_mm.pgd directly, we should never take
- * any faults on kernel addresses. */
- if (!svm->mm)
- goto bad_req;
-
- /* If address is not canonical, return invalid response */
- if (!is_canonical_address(address))
- goto bad_req;
-
- /* If the mm is already defunct, don't handle faults. */
- if (!mmget_not_zero(svm->mm))
- goto bad_req;
-
- down_read(&svm->mm->mmap_sem);
- vma = find_extend_vma(svm->mm, address);
- if (!vma || address < vma->vm_start)
- goto invalid;
-
- if (access_error(vma, req))
- goto invalid;
-
- ret = handle_mm_fault(vma, address,
- req->wr_req ? FAULT_FLAG_WRITE : 0);
- if (ret & VM_FAULT_ERROR)
- goto invalid;
-
- result = QI_RESP_SUCCESS;
- invalid:
- up_read(&svm->mm->mmap_sem);
- mmput(svm->mm);
- bad_req:
- /* Accounting for major/minor faults? */
- rcu_read_lock();
- list_for_each_entry_rcu(sdev, &svm->devs, list) {
- if (sdev->sid == req->rid)
- break;
- }
- /* Other devices can go away, but the drivers are not permitted
- * to unbind while any page faults might be in flight. So it's
- * OK to drop the 'lock' here now we have it. */
- rcu_read_unlock();
-
- if (WARN_ON(&sdev->list == &svm->devs))
- sdev = NULL;
-
- if (sdev && sdev->ops && sdev->ops->fault_cb) {
- int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
- (req->exe_req << 1) | (req->pm_req);
- sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
- req->priv_data, rwxp, result);
- }
- /* We get here in the error case where the PASID lookup failed,
- and these can be NULL. Do not use them below this point! */
- sdev = NULL;
- svm = NULL;
- no_pasid:
- if (req->lpig || req->priv_data_present) {
- /*
- * Per VT-d spec. v3.0 ch7.7, system software must
- * respond with page group response if private data
- * is present (PDP) or last page in group (LPIG) bit
- * is set. This is an additional VT-d feature beyond
- * PCI ATS spec.
- */
- resp.qw0 = QI_PGRP_PASID(req->pasid) |
- QI_PGRP_DID(req->rid) |
- QI_PGRP_PASID_P(req->pasid_present) |
- QI_PGRP_PDP(req->pasid_present) |
- QI_PGRP_RESP_CODE(result) |
- QI_PGRP_RESP_TYPE;
- resp.qw1 = QI_PGRP_IDX(req->prg_index) |
- QI_PGRP_LPIG(req->lpig);
-
- if (req->priv_data_present)
- memcpy(&resp.qw2, req->priv_data,
- sizeof(req->priv_data));
- resp.qw2 = 0;
- resp.qw3 = 0;
- qi_submit_sync(iommu, &resp, 1, 0);
- }
- head = (head + sizeof(*req)) & PRQ_RING_MASK;
- }
-
- dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
-
- /*
- * Clear the page request overflow bit and wake up all threads that
- * are waiting for the completion of this handling.
- */
- if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
- writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
-
- if (!completion_done(&iommu->prq_complete))
- complete(&iommu->prq_complete);
-
- return IRQ_RETVAL(handled);
-}
-
-#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
-struct iommu_sva *
-intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
-{
- struct iommu_sva *sva = ERR_PTR(-EINVAL);
- struct intel_svm_dev *sdev = NULL;
- int flags = 0;
- int ret;
-
- /*
- * TODO: Consolidate with generic iommu-sva bind after it is merged.
- * It will require shared SVM data structures, i.e. combine io_mm
- * and intel_svm etc.
- */
- if (drvdata)
- flags = *(int *)drvdata;
- mutex_lock(&pasid_mutex);
- ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
- if (ret)
- sva = ERR_PTR(ret);
- else if (sdev)
- sva = &sdev->sva;
- else
- WARN(!sdev, "SVM bind succeeded with no sdev!\n");
-
- mutex_unlock(&pasid_mutex);
-
- return sva;
-}
-
-void intel_svm_unbind(struct iommu_sva *sva)
-{
- struct intel_svm_dev *sdev;
-
- mutex_lock(&pasid_mutex);
- sdev = to_intel_svm_dev(sva);
- intel_svm_unbind_mm(sdev->dev, sdev->pasid);
- mutex_unlock(&pasid_mutex);
-}
-
-int intel_svm_get_pasid(struct iommu_sva *sva)
-{
- struct intel_svm_dev *sdev;
- int pasid;
-
- mutex_lock(&pasid_mutex);
- sdev = to_intel_svm_dev(sva);
- pasid = sdev->pasid;
- mutex_unlock(&pasid_mutex);
-
- return pasid;
-}
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Intel IOMMU trace support
- *
- * Copyright (C) 2019 Intel Corporation
- *
- * Author: Lu Baolu <baolu.lu@linux.intel.com>
- */
-
-#include <linux/string.h>
-#include <linux/types.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/intel_iommu.h>
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2018 Intel Corporation.
+ *
+ * Authors: Gayatri Kammela <gayatri.kammela@intel.com>
+ * Sohil Mehta <sohil.mehta@intel.com>
+ * Jacob Pan <jacob.jun.pan@linux.intel.com>
+ * Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/dmar.h>
+#include <linux/intel-iommu.h>
+#include <linux/pci.h>
+
+#include <asm/irq_remapping.h>
+
+#include "intel-pasid.h"
+
+struct tbl_walk {
+ u16 bus;
+ u16 devfn;
+ u32 pasid;
+ struct root_entry *rt_entry;
+ struct context_entry *ctx_entry;
+ struct pasid_entry *pasid_tbl_entry;
+};
+
+struct iommu_regset {
+ int offset;
+ const char *regs;
+};
+
+#define IOMMU_REGSET_ENTRY(_reg_) \
+ { DMAR_##_reg_##_REG, __stringify(_reg_) }
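The macro pastes the register name into both an offset constant and its printable string. A small standalone sketch of that expansion (the DMAR_VER_REG value below is a stand-in for the demo, not the real offset):

#include <stdio.h>

#define DMAR_VER_REG 0x0			/* stand-in offset for the demo */
#define __stringify_1(x) #x
#define __stringify(x) __stringify_1(x)
#define IOMMU_REGSET_ENTRY(_reg_) \
	{ DMAR_##_reg_##_REG, __stringify(_reg_) }

struct iommu_regset { int offset; const char *regs; };

int main(void)
{
	struct iommu_regset e = IOMMU_REGSET_ENTRY(VER);

	printf("%s at offset %#x\n", e.regs, e.offset);	/* "VER at offset 0" */
	return 0;
}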
+
+static const struct iommu_regset iommu_regs_32[] = {
+ IOMMU_REGSET_ENTRY(VER),
+ IOMMU_REGSET_ENTRY(GCMD),
+ IOMMU_REGSET_ENTRY(GSTS),
+ IOMMU_REGSET_ENTRY(FSTS),
+ IOMMU_REGSET_ENTRY(FECTL),
+ IOMMU_REGSET_ENTRY(FEDATA),
+ IOMMU_REGSET_ENTRY(FEADDR),
+ IOMMU_REGSET_ENTRY(FEUADDR),
+ IOMMU_REGSET_ENTRY(PMEN),
+ IOMMU_REGSET_ENTRY(PLMBASE),
+ IOMMU_REGSET_ENTRY(PLMLIMIT),
+ IOMMU_REGSET_ENTRY(ICS),
+ IOMMU_REGSET_ENTRY(PRS),
+ IOMMU_REGSET_ENTRY(PECTL),
+ IOMMU_REGSET_ENTRY(PEDATA),
+ IOMMU_REGSET_ENTRY(PEADDR),
+ IOMMU_REGSET_ENTRY(PEUADDR),
+};
+
+static const struct iommu_regset iommu_regs_64[] = {
+ IOMMU_REGSET_ENTRY(CAP),
+ IOMMU_REGSET_ENTRY(ECAP),
+ IOMMU_REGSET_ENTRY(RTADDR),
+ IOMMU_REGSET_ENTRY(CCMD),
+ IOMMU_REGSET_ENTRY(AFLOG),
+ IOMMU_REGSET_ENTRY(PHMBASE),
+ IOMMU_REGSET_ENTRY(PHMLIMIT),
+ IOMMU_REGSET_ENTRY(IQH),
+ IOMMU_REGSET_ENTRY(IQT),
+ IOMMU_REGSET_ENTRY(IQA),
+ IOMMU_REGSET_ENTRY(IRTA),
+ IOMMU_REGSET_ENTRY(PQH),
+ IOMMU_REGSET_ENTRY(PQT),
+ IOMMU_REGSET_ENTRY(PQA),
+ IOMMU_REGSET_ENTRY(MTRRCAP),
+ IOMMU_REGSET_ENTRY(MTRRDEF),
+ IOMMU_REGSET_ENTRY(MTRR_FIX64K_00000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX16K_80000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX16K_A0000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_C0000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_C8000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_D0000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_D8000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_E0000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_E8000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_F0000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_F8000),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE0),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK0),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE1),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK1),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE2),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK2),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE3),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK3),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE4),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK4),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE5),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK5),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE6),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK6),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE7),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK7),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE8),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK8),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE9),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK9),
+ IOMMU_REGSET_ENTRY(VCCAP),
+ IOMMU_REGSET_ENTRY(VCMD),
+ IOMMU_REGSET_ENTRY(VCRSP),
+};
+
+static int iommu_regset_show(struct seq_file *m, void *unused)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ unsigned long flag;
+ int i, ret = 0;
+ u64 value;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!drhd->reg_base_addr) {
+ seq_puts(m, "IOMMU: Invalid base address\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ seq_printf(m, "IOMMU: %s Register Base Address: %llx\n",
+ iommu->name, drhd->reg_base_addr);
+ seq_puts(m, "Name\t\t\tOffset\t\tContents\n");
+ /*
+ * Publish the contents of the 32-bit and 64-bit hardware registers
+ * by reading each one at its offset from the register base
+ * (virtual address).
+ */
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ for (i = 0 ; i < ARRAY_SIZE(iommu_regs_32); i++) {
+ value = dmar_readl(iommu->reg + iommu_regs_32[i].offset);
+ seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n",
+ iommu_regs_32[i].regs, iommu_regs_32[i].offset,
+ value);
+ }
+ for (i = 0 ; i < ARRAY_SIZE(iommu_regs_64); i++) {
+ value = dmar_readq(iommu->reg + iommu_regs_64[i].offset);
+ seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n",
+ iommu_regs_64[i].regs, iommu_regs_64[i].offset,
+ value);
+ }
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ seq_putc(m, '\n');
+ }
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(iommu_regset);
+
+static inline void print_tbl_walk(struct seq_file *m)
+{
+ struct tbl_walk *tbl_wlk = m->private;
+
+ seq_printf(m, "%02x:%02x.%x\t0x%016llx:0x%016llx\t0x%016llx:0x%016llx\t",
+ tbl_wlk->bus, PCI_SLOT(tbl_wlk->devfn),
+ PCI_FUNC(tbl_wlk->devfn), tbl_wlk->rt_entry->hi,
+ tbl_wlk->rt_entry->lo, tbl_wlk->ctx_entry->hi,
+ tbl_wlk->ctx_entry->lo);
+
+ /*
+ * A legacy mode DMAR doesn't support PASID, hence default it to -1
+ * indicating that it's invalid. Also, default all PASID related fields
+ * to 0.
+ */
+ if (!tbl_wlk->pasid_tbl_entry)
+ seq_printf(m, "%-6d\t0x%016llx:0x%016llx:0x%016llx\n", -1,
+ (u64)0, (u64)0, (u64)0);
+ else
+ seq_printf(m, "%-6d\t0x%016llx:0x%016llx:0x%016llx\n",
+ tbl_wlk->pasid, tbl_wlk->pasid_tbl_entry->val[2],
+ tbl_wlk->pasid_tbl_entry->val[1],
+ tbl_wlk->pasid_tbl_entry->val[0]);
+}
+
+static void pasid_tbl_walk(struct seq_file *m, struct pasid_entry *tbl_entry,
+ u16 dir_idx)
+{
+ struct tbl_walk *tbl_wlk = m->private;
+ u8 tbl_idx;
+
+ for (tbl_idx = 0; tbl_idx < PASID_TBL_ENTRIES; tbl_idx++) {
+ if (pasid_pte_is_present(tbl_entry)) {
+ tbl_wlk->pasid_tbl_entry = tbl_entry;
+ tbl_wlk->pasid = (dir_idx << PASID_PDE_SHIFT) + tbl_idx;
+ print_tbl_walk(m);
+ }
+
+ tbl_entry++;
+ }
+}
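The PASID printed above is reassembled from its two-level table position: directory index in the high bits, leaf-table index in the low bits. A tiny sketch, assuming PASID_PDE_SHIFT is 6 (64 entries per leaf table), as in intel-pasid.h:

#include <stdint.h>
#include <stdio.h>

#define PASID_PDE_SHIFT 6	/* assumed: 64 PASID-table entries per directory entry */

int main(void)
{
	uint16_t dir_idx = 3, tbl_idx = 17;
	uint32_t pasid = ((uint32_t)dir_idx << PASID_PDE_SHIFT) + tbl_idx;

	/* directory slot 3, leaf slot 17 -> PASID 3 * 64 + 17 = 209 */
	printf("pasid = %u\n", pasid);
	return 0;
}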
+
+static void pasid_dir_walk(struct seq_file *m, u64 pasid_dir_ptr,
+ u16 pasid_dir_size)
+{
+ struct pasid_dir_entry *dir_entry = phys_to_virt(pasid_dir_ptr);
+ struct pasid_entry *pasid_tbl;
+ u16 dir_idx;
+
+ for (dir_idx = 0; dir_idx < pasid_dir_size; dir_idx++) {
+ pasid_tbl = get_pasid_table_from_pde(dir_entry);
+ if (pasid_tbl)
+ pasid_tbl_walk(m, pasid_tbl, dir_idx);
+
+ dir_entry++;
+ }
+}
+
+static void ctx_tbl_walk(struct seq_file *m, struct intel_iommu *iommu, u16 bus)
+{
+ struct context_entry *context;
+ u16 devfn, pasid_dir_size;
+ u64 pasid_dir_ptr;
+
+ for (devfn = 0; devfn < 256; devfn++) {
+ struct tbl_walk tbl_wlk = {0};
+
+ /*
+ * A scalable mode root entry points to an upper scalable mode
+ * context table and a lower scalable mode context table. Each
+ * scalable mode context table has 128 context entries, whereas a
+ * legacy mode context table has 256 context entries. So in
+ * scalable mode, the context entries for the first 128 devices
+ * are in the lower scalable mode context table, while those for
+ * the last 128 devices are in the upper scalable mode context
+ * table. In scalable mode, when devfn > 127, iommu_context_addr()
+ * automatically refers to the upper scalable mode context table,
+ * so the caller doesn't have to worry about the differences
+ * between scalable mode and non-scalable mode.
+ */
+ context = iommu_context_addr(iommu, bus, devfn, 0);
+ if (!context)
+ return;
+
+ if (!context_present(context))
+ continue;
+
+ tbl_wlk.bus = bus;
+ tbl_wlk.devfn = devfn;
+ tbl_wlk.rt_entry = &iommu->root_entry[bus];
+ tbl_wlk.ctx_entry = context;
+ m->private = &tbl_wlk;
+
+ if (dmar_readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) {
+ pasid_dir_ptr = context->lo & VTD_PAGE_MASK;
+ pasid_dir_size = get_pasid_dir_size(context);
+ pasid_dir_walk(m, pasid_dir_ptr, pasid_dir_size);
+ continue;
+ }
+
+ print_tbl_walk(m);
+ }
+}
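To make the comment about the split context tables concrete, here is a standalone sketch of how a devfn selects the lower or upper scalable-mode context table; the index arithmetic is illustrative, not a copy of iommu_context_addr():

#include <stdio.h>

int main(void)
{
	unsigned int devfn;

	for (devfn = 0; devfn < 256; devfn += 100) {
		/* each scalable-mode context table holds 128 entries */
		printf("devfn %3u -> %s context table, slot %u\n",
		       devfn, devfn < 128 ? "lower" : "upper", devfn & 0x7f);
	}
	return 0;
}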
+
+static void root_tbl_walk(struct seq_file *m, struct intel_iommu *iommu)
+{
+ unsigned long flags;
+ u16 bus;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ seq_printf(m, "IOMMU %s: Root Table Address: 0x%llx\n", iommu->name,
+ (u64)virt_to_phys(iommu->root_entry));
+ seq_puts(m, "B.D.F\tRoot_entry\t\t\t\tContext_entry\t\t\t\tPASID\tPASID_table_entry\n");
+
+ /*
+ * No need to check if the root entry is present or not because
+ * iommu_context_addr() performs the same check before returning
+ * context entry.
+ */
+ for (bus = 0; bus < 256; bus++)
+ ctx_tbl_walk(m, iommu, bus);
+
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static int dmar_translation_struct_show(struct seq_file *m, void *unused)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ u32 sts;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ sts = dmar_readl(iommu->reg + DMAR_GSTS_REG);
+ if (!(sts & DMA_GSTS_TES)) {
+ seq_printf(m, "DMA Remapping is not enabled on %s\n",
+ iommu->name);
+ continue;
+ }
+ root_tbl_walk(m, iommu);
+ seq_putc(m, '\n');
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dmar_translation_struct);
+
+static inline unsigned long level_to_directory_size(int level)
+{
+ return BIT_ULL(VTD_PAGE_SHIFT + VTD_STRIDE_SHIFT * (level - 1));
+}
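level_to_directory_size() gives the address span covered by one entry at a given page-table level. A worked sketch with the usual VT-d constants (VTD_PAGE_SHIFT = 12 and VTD_STRIDE_SHIFT = 9 are assumed here):

#include <stdint.h>
#include <stdio.h>

#define VTD_PAGE_SHIFT   12	/* assumed: 4 KiB base page */
#define VTD_STRIDE_SHIFT  9	/* assumed: 512 entries per level */

int main(void)
{
	int level;

	for (level = 1; level <= 4; level++) {
		uint64_t span = 1ULL << (VTD_PAGE_SHIFT +
					 VTD_STRIDE_SHIFT * (level - 1));

		/* level 1: 4 KiB, 2: 2 MiB, 3: 1 GiB, 4: 512 GiB per entry */
		printf("level %d: %llu bytes per entry\n",
		       level, (unsigned long long)span);
	}
	return 0;
}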
+
+static inline void
+dump_page_info(struct seq_file *m, unsigned long iova, u64 *path)
+{
+ seq_printf(m, "0x%013lx |\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\n",
+ iova >> VTD_PAGE_SHIFT, path[5], path[4],
+ path[3], path[2], path[1]);
+}
+
+static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde,
+ int level, unsigned long start,
+ u64 *path)
+{
+ int i;
+
+ if (level > 5 || level < 1)
+ return;
+
+ for (i = 0; i < BIT_ULL(VTD_STRIDE_SHIFT);
+ i++, pde++, start += level_to_directory_size(level)) {
+ if (!dma_pte_present(pde))
+ continue;
+
+ path[level] = pde->val;
+ if (dma_pte_superpage(pde) || level == 1)
+ dump_page_info(m, start, path);
+ else
+ pgtable_walk_level(m, phys_to_virt(dma_pte_addr(pde)),
+ level - 1, start, path);
+ path[level] = 0;
+ }
+}
+
+static int show_device_domain_translation(struct device *dev, void *data)
+{
+ struct dmar_domain *domain = find_domain(dev);
+ struct seq_file *m = data;
+ u64 path[6] = { 0 };
+
+ if (!domain)
+ return 0;
+
+ seq_printf(m, "Device %s with pasid %d @0x%llx\n",
+ dev_name(dev), domain->default_pasid,
+ (u64)virt_to_phys(domain->pgd));
+ seq_puts(m, "IOVA_PFN\t\tPML5E\t\t\tPML4E\t\t\tPDPE\t\t\tPDE\t\t\tPTE\n");
+
+ pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static int domain_translation_struct_show(struct seq_file *m, void *unused)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ ret = bus_for_each_dev(&pci_bus_type, NULL, m,
+ show_device_domain_translation);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(domain_translation_struct);
+
+static void invalidation_queue_entry_show(struct seq_file *m,
+ struct intel_iommu *iommu)
+{
+ int index, shift = qi_shift(iommu);
+ struct qi_desc *desc;
+ int offset;
+
+ if (ecap_smts(iommu->ecap))
+ seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tqw2\t\t\tqw3\t\t\tstatus\n");
+ else
+ seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tstatus\n");
+
+ for (index = 0; index < QI_LENGTH; index++) {
+ offset = index << shift;
+ desc = iommu->qi->desc + offset;
+ if (ecap_smts(iommu->ecap))
+ seq_printf(m, "%5d\t%016llx\t%016llx\t%016llx\t%016llx\t%016x\n",
+ index, desc->qw0, desc->qw1,
+ desc->qw2, desc->qw3,
+ iommu->qi->desc_status[index]);
+ else
+ seq_printf(m, "%5d\t%016llx\t%016llx\t%016x\n",
+ index, desc->qw0, desc->qw1,
+ iommu->qi->desc_status[index]);
+ }
+}
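The byte offset of each queue entry is just the index shifted by the descriptor size: legacy descriptors are 128 bits (shift 4) and scalable-mode descriptors 256 bits (shift 5). A small sketch of that arithmetic (a QI_LENGTH of 256 is assumed):

#include <stdio.h>

#define QI_LENGTH 256	/* assumed queue depth */

int main(void)
{
	int shift, index = 10;

	for (shift = 4; shift <= 5; shift++) {
		/* shift 4: 16-byte legacy descriptors; shift 5: 32-byte scalable mode */
		printf("shift %d: entry %d at byte %d, queue uses %d bytes\n",
		       shift, index, index << shift, QI_LENGTH << shift);
	}
	return 0;
}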
+
+static int invalidation_queue_show(struct seq_file *m, void *unused)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ unsigned long flags;
+ struct q_inval *qi;
+ int shift;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ qi = iommu->qi;
+ shift = qi_shift(iommu);
+
+ if (!qi || !ecap_qis(iommu->ecap))
+ continue;
+
+ seq_printf(m, "Invalidation queue on IOMMU: %s\n", iommu->name);
+
+ raw_spin_lock_irqsave(&qi->q_lock, flags);
+ seq_printf(m, " Base: 0x%llx\tHead: %lld\tTail: %lld\n",
+ (u64)virt_to_phys(qi->desc),
+ dmar_readq(iommu->reg + DMAR_IQH_REG) >> shift,
+ dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift);
+ invalidation_queue_entry_show(m, iommu);
+ raw_spin_unlock_irqrestore(&qi->q_lock, flags);
+ seq_putc(m, '\n');
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(invalidation_queue);
+
+#ifdef CONFIG_IRQ_REMAP
+static void ir_tbl_remap_entry_show(struct seq_file *m,
+ struct intel_iommu *iommu)
+{
+ struct irte *ri_entry;
+ unsigned long flags;
+ int idx;
+
+ seq_puts(m, " Entry SrcID DstID Vct IRTE_high\t\tIRTE_low\n");
+
+ raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+ for (idx = 0; idx < INTR_REMAP_TABLE_ENTRIES; idx++) {
+ ri_entry = &iommu->ir_table->base[idx];
+ if (!ri_entry->present || ri_entry->p_pst)
+ continue;
+
+ seq_printf(m, " %-5d %02x:%02x.%01x %08x %02x %016llx\t%016llx\n",
+ idx, PCI_BUS_NUM(ri_entry->sid),
+ PCI_SLOT(ri_entry->sid), PCI_FUNC(ri_entry->sid),
+ ri_entry->dest_id, ri_entry->vector,
+ ri_entry->high, ri_entry->low);
+ }
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+}
+
+static void ir_tbl_posted_entry_show(struct seq_file *m,
+ struct intel_iommu *iommu)
+{
+ struct irte *pi_entry;
+ unsigned long flags;
+ int idx;
+
+ seq_puts(m, " Entry SrcID PDA_high PDA_low Vct IRTE_high\t\tIRTE_low\n");
+
+ raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+ for (idx = 0; idx < INTR_REMAP_TABLE_ENTRIES; idx++) {
+ pi_entry = &iommu->ir_table->base[idx];
+ if (!pi_entry->present || !pi_entry->p_pst)
+ continue;
+
+ seq_printf(m, " %-5d %02x:%02x.%01x %08x %08x %02x %016llx\t%016llx\n",
+ idx, PCI_BUS_NUM(pi_entry->sid),
+ PCI_SLOT(pi_entry->sid), PCI_FUNC(pi_entry->sid),
+ pi_entry->pda_h, pi_entry->pda_l << 6,
+ pi_entry->vector, pi_entry->high,
+ pi_entry->low);
+ }
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+}
+
+/*
+ * For active IOMMUs go through the Interrupt remapping
+ * table and print valid entries in a table format for
+ * Remapped and Posted Interrupts.
+ */
+static int ir_translation_struct_show(struct seq_file *m, void *unused)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ u64 irta;
+ u32 sts;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!ecap_ir_support(iommu->ecap))
+ continue;
+
+ seq_printf(m, "Remapped Interrupt supported on IOMMU: %s\n",
+ iommu->name);
+
+ sts = dmar_readl(iommu->reg + DMAR_GSTS_REG);
+ if (iommu->ir_table && (sts & DMA_GSTS_IRES)) {
+ irta = virt_to_phys(iommu->ir_table->base);
+ seq_printf(m, " IR table address:%llx\n", irta);
+ ir_tbl_remap_entry_show(m, iommu);
+ } else {
+ seq_puts(m, "Interrupt Remapping is not enabled\n");
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, "****\n\n");
+
+ for_each_active_iommu(iommu, drhd) {
+ if (!cap_pi_support(iommu->cap))
+ continue;
+
+ seq_printf(m, "Posted Interrupt supported on IOMMU: %s\n",
+ iommu->name);
+
+ if (iommu->ir_table) {
+ irta = virt_to_phys(iommu->ir_table->base);
+ seq_printf(m, " IR table address:%llx\n", irta);
+ ir_tbl_posted_entry_show(m, iommu);
+ } else {
+ seq_puts(m, "Interrupt Remapping is not enabled\n");
+ }
+ seq_putc(m, '\n');
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(ir_translation_struct);
+#endif
+
+void __init intel_iommu_debugfs_init(void)
+{
+ struct dentry *intel_iommu_debug = debugfs_create_dir("intel",
+ iommu_debugfs_dir);
+
+ debugfs_create_file("iommu_regset", 0444, intel_iommu_debug, NULL,
+ &iommu_regset_fops);
+ debugfs_create_file("dmar_translation_struct", 0444, intel_iommu_debug,
+ NULL, &dmar_translation_struct_fops);
+ debugfs_create_file("domain_translation_struct", 0444,
+ intel_iommu_debug, NULL,
+ &domain_translation_struct_fops);
+ debugfs_create_file("invalidation_queue", 0444, intel_iommu_debug,
+ NULL, &invalidation_queue_fops);
+#ifdef CONFIG_IRQ_REMAP
+ debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug,
+ NULL, &ir_translation_struct_fops);
+#endif
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Author: Ashok Raj <ashok.raj@intel.com>
+ * Author: Shaohua Li <shaohua.li@intel.com>
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ *
+ * This file implements early detection/parsing of Remapping Devices
+ * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI
+ * tables.
+ *
+ * These routines are used by both DMA-remapping and Interrupt-remapping
+ */
+
+#define pr_fmt(fmt) "DMAR: " fmt
+
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/iova.h>
+#include <linux/intel-iommu.h>
+#include <linux/timer.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/tboot.h>
+#include <linux/dmi.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/numa.h>
+#include <linux/limits.h>
+#include <asm/irq_remapping.h>
+#include <asm/iommu_table.h>
+
+#include "../irq_remapping.h"
+
+typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *);
+struct dmar_res_callback {
+ dmar_res_handler_t cb[ACPI_DMAR_TYPE_RESERVED];
+ void *arg[ACPI_DMAR_TYPE_RESERVED];
+ bool ignore_unhandled;
+ bool print_entry;
+};
+
+/*
+ * Assumptions:
+ * 1) The hotplug framework guarantees that DMAR unit will be hot-added
+ * before IO devices managed by that unit.
+ * 2) The hotplug framework guarantees that DMAR unit will be hot-removed
+ * after IO devices managed by that unit.
+ * 3) Hotplug events are rare.
+ *
+ * Locking rules for DMA and interrupt remapping related global data structures:
+ * 1) Use dmar_global_lock in process context
+ * 2) Use RCU in interrupt context
+ */
+DECLARE_RWSEM(dmar_global_lock);
+LIST_HEAD(dmar_drhd_units);
+
+struct acpi_table_header * __initdata dmar_tbl;
+static int dmar_dev_scope_status = 1;
+static unsigned long dmar_seq_ids[BITS_TO_LONGS(DMAR_UNITS_SUPPORTED)];
+
+static int alloc_iommu(struct dmar_drhd_unit *drhd);
+static void free_iommu(struct intel_iommu *iommu);
+
+extern const struct iommu_ops intel_iommu_ops;
+
+static void dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
+{
+ /*
+ * add INCLUDE_ALL at the tail, so scanning the list will find it at
+ * the very end.
+ */
+ if (drhd->include_all)
+ list_add_tail_rcu(&drhd->list, &dmar_drhd_units);
+ else
+ list_add_rcu(&drhd->list, &dmar_drhd_units);
+}
+
+void *dmar_alloc_dev_scope(void *start, void *end, int *cnt)
+{
+ struct acpi_dmar_device_scope *scope;
+
+ *cnt = 0;
+ while (start < end) {
+ scope = start;
+ if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_NAMESPACE ||
+ scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
+ scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE)
+ (*cnt)++;
+ else if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_IOAPIC &&
+ scope->entry_type != ACPI_DMAR_SCOPE_TYPE_HPET) {
+ pr_warn("Unsupported device scope\n");
+ }
+ start += scope->length;
+ }
+ if (*cnt == 0)
+ return NULL;
+
+ return kcalloc(*cnt, sizeof(struct dmar_dev_scope), GFP_KERNEL);
+}
+
+void dmar_free_dev_scope(struct dmar_dev_scope **devices, int *cnt)
+{
+ int i;
+ struct device *tmp_dev;
+
+ if (*devices && *cnt) {
+ for_each_active_dev_scope(*devices, *cnt, i, tmp_dev)
+ put_device(tmp_dev);
+ kfree(*devices);
+ }
+
+ *devices = NULL;
+ *cnt = 0;
+}
+
+/* Optimize out kzalloc()/kfree() for normal cases */
+static char dmar_pci_notify_info_buf[64];
+
+static struct dmar_pci_notify_info *
+dmar_alloc_pci_notify_info(struct pci_dev *dev, unsigned long event)
+{
+ int level = 0;
+ size_t size;
+ struct pci_dev *tmp;
+ struct dmar_pci_notify_info *info;
+
+ BUG_ON(dev->is_virtfn);
+
+ /*
+ * Ignore devices that have a domain number higher than what can
+ * be looked up in DMAR, e.g. VMD subdevices with domain 0x10000
+ */
+ if (pci_domain_nr(dev->bus) > U16_MAX)
+ return NULL;
+
+ /* Only generate path[] for device addition event */
+ if (event == BUS_NOTIFY_ADD_DEVICE)
+ for (tmp = dev; tmp; tmp = tmp->bus->self)
+ level++;
+
+ size = struct_size(info, path, level);
+ if (size <= sizeof(dmar_pci_notify_info_buf)) {
+ info = (struct dmar_pci_notify_info *)dmar_pci_notify_info_buf;
+ } else {
+ info = kzalloc(size, GFP_KERNEL);
+ if (!info) {
+ pr_warn("Out of memory when allocating notify_info "
+ "for %s.\n", pci_name(dev));
+ if (dmar_dev_scope_status == 0)
+ dmar_dev_scope_status = -ENOMEM;
+ return NULL;
+ }
+ }
+
+ info->event = event;
+ info->dev = dev;
+ info->seg = pci_domain_nr(dev->bus);
+ info->level = level;
+ if (event == BUS_NOTIFY_ADD_DEVICE) {
+ for (tmp = dev; tmp; tmp = tmp->bus->self) {
+ level--;
+ info->path[level].bus = tmp->bus->number;
+ info->path[level].device = PCI_SLOT(tmp->devfn);
+ info->path[level].function = PCI_FUNC(tmp->devfn);
+ if (pci_is_root_bus(tmp->bus))
+ info->bus = tmp->bus->number;
+ }
+ }
+
+ return info;
+}
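struct_size(info, path, level) above is essentially the sizeof arithmetic for a flexible array member, with overflow checking. A hedged standalone sketch of that allocation (the struct names below are hypothetical demo types):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct pci_path_demo { uint8_t bus, device, function; };

struct notify_info_demo {
	int level;
	struct pci_path_demo path[];	/* one entry per bus level */
};

int main(void)
{
	int level = 3;	/* e.g. a device behind two bridges */
	size_t size = sizeof(struct notify_info_demo) +
		      (size_t)level * sizeof(struct pci_path_demo);
	struct notify_info_demo *info = calloc(1, size);

	if (!info)
		return 1;
	info->level = level;
	printf("allocated %zu bytes for %d path entries\n", size, level);
	free(info);
	return 0;
}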
+
+static inline void dmar_free_pci_notify_info(struct dmar_pci_notify_info *info)
+{
+ if ((void *)info != dmar_pci_notify_info_buf)
+ kfree(info);
+}
+
+static bool dmar_match_pci_path(struct dmar_pci_notify_info *info, int bus,
+ struct acpi_dmar_pci_path *path, int count)
+{
+ int i;
+
+ if (info->bus != bus)
+ goto fallback;
+ if (info->level != count)
+ goto fallback;
+
+ for (i = 0; i < count; i++) {
+ if (path[i].device != info->path[i].device ||
+ path[i].function != info->path[i].function)
+ goto fallback;
+ }
+
+ return true;
+
+fallback:
+
+ if (count != 1)
+ return false;
+
+ i = info->level - 1;
+ if (bus == info->path[i].bus &&
+ path[0].device == info->path[i].device &&
+ path[0].function == info->path[i].function) {
+ pr_info(FW_BUG "RMRR entry for device %02x:%02x.%x is broken - applying workaround\n",
+ bus, path[0].device, path[0].function);
+ return true;
+ }
+
+ return false;
+}
+
+/* Return: > 0 if match found, 0 if no match found, < 0 if error happens */
+int dmar_insert_dev_scope(struct dmar_pci_notify_info *info,
+ void *start, void*end, u16 segment,
+ struct dmar_dev_scope *devices,
+ int devices_cnt)
+{
+ int i, level;
+ struct device *tmp, *dev = &info->dev->dev;
+ struct acpi_dmar_device_scope *scope;
+ struct acpi_dmar_pci_path *path;
+
+ if (segment != info->seg)
+ return 0;
+
+ for (; start < end; start += scope->length) {
+ scope = start;
+ if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_ENDPOINT &&
+ scope->entry_type != ACPI_DMAR_SCOPE_TYPE_BRIDGE)
+ continue;
+
+ path = (struct acpi_dmar_pci_path *)(scope + 1);
+ level = (scope->length - sizeof(*scope)) / sizeof(*path);
+ if (!dmar_match_pci_path(info, scope->bus, path, level))
+ continue;
+
+ /*
+ * We expect devices with endpoint scope to have normal PCI
+ * headers, and devices with bridge scope to have bridge PCI
+ * headers. However PCI NTB devices may be listed in the
+ * DMAR table with bridge scope, even though they have a
+ * normal PCI header. NTB devices are identified by class
+ * "BRIDGE_OTHER" (0680h) - we don't declare a socpe mismatch
+ * for this special case.
+ */
+ if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT &&
+ info->dev->hdr_type != PCI_HEADER_TYPE_NORMAL) ||
+ (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE &&
+ (info->dev->hdr_type == PCI_HEADER_TYPE_NORMAL &&
+ info->dev->class >> 16 != PCI_BASE_CLASS_BRIDGE))) {
+ pr_warn("Device scope type does not match for %s\n",
+ pci_name(info->dev));
+ return -EINVAL;
+ }
+
+ for_each_dev_scope(devices, devices_cnt, i, tmp)
+ if (tmp == NULL) {
+ devices[i].bus = info->dev->bus->number;
+ devices[i].devfn = info->dev->devfn;
+ rcu_assign_pointer(devices[i].dev,
+ get_device(dev));
+ return 1;
+ }
+ BUG_ON(i >= devices_cnt);
+ }
+
+ return 0;
+}
+
+int dmar_remove_dev_scope(struct dmar_pci_notify_info *info, u16 segment,
+ struct dmar_dev_scope *devices, int count)
+{
+ int index;
+ struct device *tmp;
+
+ if (info->seg != segment)
+ return 0;
+
+ for_each_active_dev_scope(devices, count, index, tmp)
+ if (tmp == &info->dev->dev) {
+ RCU_INIT_POINTER(devices[index].dev, NULL);
+ synchronize_rcu();
+ put_device(tmp);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int dmar_pci_bus_add_dev(struct dmar_pci_notify_info *info)
+{
+ int ret = 0;
+ struct dmar_drhd_unit *dmaru;
+ struct acpi_dmar_hardware_unit *drhd;
+
+ for_each_drhd_unit(dmaru) {
+ if (dmaru->include_all)
+ continue;
+
+ drhd = container_of(dmaru->hdr,
+ struct acpi_dmar_hardware_unit, header);
+ ret = dmar_insert_dev_scope(info, (void *)(drhd + 1),
+ ((void *)drhd) + drhd->header.length,
+ dmaru->segment,
+ dmaru->devices, dmaru->devices_cnt);
+ if (ret)
+ break;
+ }
+ if (ret >= 0)
+ ret = dmar_iommu_notify_scope_dev(info);
+ if (ret < 0 && dmar_dev_scope_status == 0)
+ dmar_dev_scope_status = ret;
+
+ return ret;
+}
+
+static void dmar_pci_bus_del_dev(struct dmar_pci_notify_info *info)
+{
+ struct dmar_drhd_unit *dmaru;
+
+ for_each_drhd_unit(dmaru)
+ if (dmar_remove_dev_scope(info, dmaru->segment,
+ dmaru->devices, dmaru->devices_cnt))
+ break;
+ dmar_iommu_notify_scope_dev(info);
+}
+
+static int dmar_pci_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct pci_dev *pdev = to_pci_dev(data);
+ struct dmar_pci_notify_info *info;
+
+ /* Only care about add/remove events for physical functions.
+ * For VFs we actually do the lookup based on the corresponding
+ * PF in device_to_iommu() anyway. */
+ if (pdev->is_virtfn)
+ return NOTIFY_DONE;
+ if (action != BUS_NOTIFY_ADD_DEVICE &&
+ action != BUS_NOTIFY_REMOVED_DEVICE)
+ return NOTIFY_DONE;
+
+ info = dmar_alloc_pci_notify_info(pdev, action);
+ if (!info)
+ return NOTIFY_DONE;
+
+ down_write(&dmar_global_lock);
+ if (action == BUS_NOTIFY_ADD_DEVICE)
+ dmar_pci_bus_add_dev(info);
+ else if (action == BUS_NOTIFY_REMOVED_DEVICE)
+ dmar_pci_bus_del_dev(info);
+ up_write(&dmar_global_lock);
+
+ dmar_free_pci_notify_info(info);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dmar_pci_bus_nb = {
+ .notifier_call = dmar_pci_bus_notifier,
+ .priority = INT_MIN,
+};
+
+static struct dmar_drhd_unit *
+dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd)
+{
+ struct dmar_drhd_unit *dmaru;
+
+ list_for_each_entry_rcu(dmaru, &dmar_drhd_units, list,
+ dmar_rcu_check())
+ if (dmaru->segment == drhd->segment &&
+ dmaru->reg_base_addr == drhd->address)
+ return dmaru;
+
+ return NULL;
+}
+
+/**
+ * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition
+ * structure which uniquely represents one DMA remapping hardware unit
+ * present in the platform
+ */
+static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
+{
+ struct acpi_dmar_hardware_unit *drhd;
+ struct dmar_drhd_unit *dmaru;
+ int ret;
+
+ drhd = (struct acpi_dmar_hardware_unit *)header;
+ dmaru = dmar_find_dmaru(drhd);
+ if (dmaru)
+ goto out;
+
+ dmaru = kzalloc(sizeof(*dmaru) + header->length, GFP_KERNEL);
+ if (!dmaru)
+ return -ENOMEM;
+
+ /*
+ * If header is allocated from slab by ACPI _DSM method, we need to
+ * copy the content because the memory buffer will be freed on return.
+ */
+ dmaru->hdr = (void *)(dmaru + 1);
+ memcpy(dmaru->hdr, header, header->length);
+ dmaru->reg_base_addr = drhd->address;
+ dmaru->segment = drhd->segment;
+ dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
+ dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1),
+ ((void *)drhd) + drhd->header.length,
+ &dmaru->devices_cnt);
+ if (dmaru->devices_cnt && dmaru->devices == NULL) {
+ kfree(dmaru);
+ return -ENOMEM;
+ }
+
+ ret = alloc_iommu(dmaru);
+ if (ret) {
+ dmar_free_dev_scope(&dmaru->devices,
+ &dmaru->devices_cnt);
+ kfree(dmaru);
+ return ret;
+ }
+ dmar_register_drhd_unit(dmaru);
+
+out:
+ if (arg)
+ (*(int *)arg)++;
+
+ return 0;
+}
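The allocation above packs the dmar_drhd_unit and a private copy of the ACPI header into a single kzalloc(), with dmaru->hdr pointing just past the struct. A standalone sketch of that pattern (demo types only):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct hdr_demo { uint16_t type, length; };

struct unit_demo {
	struct hdr_demo *hdr;	/* points at the trailing copy below */
};

int main(void)
{
	struct hdr_demo src = { .type = 0, .length = sizeof(src) };
	/* one allocation: the unit followed by a private copy of the header */
	struct unit_demo *u = calloc(1, sizeof(*u) + src.length);

	if (!u)
		return 1;
	u->hdr = (struct hdr_demo *)(u + 1);
	memcpy(u->hdr, &src, src.length);
	printf("copied header: type %u, length %u\n", u->hdr->type, u->hdr->length);
	free(u);
	return 0;
}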
+
+static void dmar_free_drhd(struct dmar_drhd_unit *dmaru)
+{
+ if (dmaru->devices && dmaru->devices_cnt)
+ dmar_free_dev_scope(&dmaru->devices, &dmaru->devices_cnt);
+ if (dmaru->iommu)
+ free_iommu(dmaru->iommu);
+ kfree(dmaru);
+}
+
+static int __init dmar_parse_one_andd(struct acpi_dmar_header *header,
+ void *arg)
+{
+ struct acpi_dmar_andd *andd = (void *)header;
+
+ /* Check for NUL termination within the designated length */
+ if (strnlen(andd->device_name, header->length - 8) == header->length - 8) {
+ pr_warn(FW_BUG
+ "Your BIOS is broken; ANDD object name is not NUL-terminated\n"
+ "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+ dmi_get_system_info(DMI_BIOS_VENDOR),
+ dmi_get_system_info(DMI_BIOS_VERSION),
+ dmi_get_system_info(DMI_PRODUCT_VERSION));
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ return -EINVAL;
+ }
+ pr_info("ANDD device: %x name: %s\n", andd->device_number,
+ andd->device_name);
+
+ return 0;
+}
+
+#ifdef CONFIG_ACPI_NUMA
+static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg)
+{
+ struct acpi_dmar_rhsa *rhsa;
+ struct dmar_drhd_unit *drhd;
+
+ rhsa = (struct acpi_dmar_rhsa *)header;
+ for_each_drhd_unit(drhd) {
+ if (drhd->reg_base_addr == rhsa->base_address) {
+ int node = acpi_map_pxm_to_node(rhsa->proximity_domain);
+
+ if (!node_online(node))
+ node = NUMA_NO_NODE;
+ drhd->iommu->node = node;
+ return 0;
+ }
+ }
+ pr_warn(FW_BUG
+ "Your BIOS is broken; RHSA refers to non-existent DMAR unit at %llx\n"
+ "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+ rhsa->base_address,
+ dmi_get_system_info(DMI_BIOS_VENDOR),
+ dmi_get_system_info(DMI_BIOS_VERSION),
+ dmi_get_system_info(DMI_PRODUCT_VERSION));
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+
+ return 0;
+}
+#else
+#define dmar_parse_one_rhsa dmar_res_noop
+#endif
+
+static void
+dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
+{
+ struct acpi_dmar_hardware_unit *drhd;
+ struct acpi_dmar_reserved_memory *rmrr;
+ struct acpi_dmar_atsr *atsr;
+ struct acpi_dmar_rhsa *rhsa;
+
+ switch (header->type) {
+ case ACPI_DMAR_TYPE_HARDWARE_UNIT:
+ drhd = container_of(header, struct acpi_dmar_hardware_unit,
+ header);
+ pr_info("DRHD base: %#016Lx flags: %#x\n",
+ (unsigned long long)drhd->address, drhd->flags);
+ break;
+ case ACPI_DMAR_TYPE_RESERVED_MEMORY:
+ rmrr = container_of(header, struct acpi_dmar_reserved_memory,
+ header);
+ pr_info("RMRR base: %#016Lx end: %#016Lx\n",
+ (unsigned long long)rmrr->base_address,
+ (unsigned long long)rmrr->end_address);
+ break;
+ case ACPI_DMAR_TYPE_ROOT_ATS:
+ atsr = container_of(header, struct acpi_dmar_atsr, header);
+ pr_info("ATSR flags: %#x\n", atsr->flags);
+ break;
+ case ACPI_DMAR_TYPE_HARDWARE_AFFINITY:
+ rhsa = container_of(header, struct acpi_dmar_rhsa, header);
+ pr_info("RHSA base: %#016Lx proximity domain: %#x\n",
+ (unsigned long long)rhsa->base_address,
+ rhsa->proximity_domain);
+ break;
+ case ACPI_DMAR_TYPE_NAMESPACE:
+ /* We don't print this here because we need to sanity-check
+ it first. So print it in dmar_parse_one_andd() instead. */
+ break;
+ }
+}
+
+/**
+ * dmar_table_detect - checks to see if the platform supports DMAR devices
+ */
+static int __init dmar_table_detect(void)
+{
+ acpi_status status = AE_OK;
+
+ /* if we could find DMAR table, then there are DMAR devices */
+ status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
+
+ if (ACPI_SUCCESS(status) && !dmar_tbl) {
+ pr_warn("Unable to map DMAR\n");
+ status = AE_NOT_FOUND;
+ }
+
+ return ACPI_SUCCESS(status) ? 0 : -ENOENT;
+}
+
+static int dmar_walk_remapping_entries(struct acpi_dmar_header *start,
+ size_t len, struct dmar_res_callback *cb)
+{
+ struct acpi_dmar_header *iter, *next;
+ struct acpi_dmar_header *end = ((void *)start) + len;
+
+ for (iter = start; iter < end; iter = next) {
+ next = (void *)iter + iter->length;
+ if (iter->length == 0) {
+ /* Avoid looping forever on bad ACPI tables */
+ pr_debug(FW_BUG "Invalid 0-length structure\n");
+ break;
+ } else if (next > end) {
+ /* Avoid passing table end */
+ pr_warn(FW_BUG "Record passes table end\n");
+ return -EINVAL;
+ }
+
+ if (cb->print_entry)
+ dmar_table_print_dmar_entry(iter);
+
+ if (iter->type >= ACPI_DMAR_TYPE_RESERVED) {
+ /* continue for forward compatibility */
+ pr_debug("Unknown DMAR structure type %d\n",
+ iter->type);
+ } else if (cb->cb[iter->type]) {
+ int ret;
+
+ ret = cb->cb[iter->type](iter, cb->arg[iter->type]);
+ if (ret)
+ return ret;
+ } else if (!cb->ignore_unhandled) {
+ pr_warn("No handler for DMAR structure type %d\n",
+ iter->type);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
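The walker above steps through variable-length ACPI sub-tables using each record's own length field, bailing out on zero-length or out-of-bounds records. A self-contained sketch of the same pattern over a plain byte buffer (little-endian host assumed for the byte layout):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct rec_hdr { uint16_t type; uint16_t length; };

static int walk(const uint8_t *buf, size_t len)
{
	const uint8_t *p = buf, *end = buf + len;
	struct rec_hdr h;

	while (p + sizeof(h) <= end) {
		memcpy(&h, p, sizeof(h));
		if (h.length == 0)		/* avoid looping forever */
			break;
		if (p + h.length > end)		/* record passes buffer end */
			return -1;
		printf("record type %u, length %u\n",
		       (unsigned)h.type, (unsigned)h.length);
		p += h.length;
	}
	return 0;
}

int main(void)
{
	/* two records: {type 0, len 6, 2 payload bytes} and {type 2, len 4} */
	uint8_t buf[] = { 0, 0, 6, 0, 0xaa, 0xbb, 2, 0, 4, 0 };

	return walk(buf, sizeof(buf)) ? 1 : 0;
}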
+
+static inline int dmar_walk_dmar_table(struct acpi_table_dmar *dmar,
+ struct dmar_res_callback *cb)
+{
+ return dmar_walk_remapping_entries((void *)(dmar + 1),
+ dmar->header.length - sizeof(*dmar), cb);
+}
+
+/**
+ * parse_dmar_table - parses the DMA reporting table
+ */
+static int __init
+parse_dmar_table(void)
+{
+ struct acpi_table_dmar *dmar;
+ int drhd_count = 0;
+ int ret;
+ struct dmar_res_callback cb = {
+ .print_entry = true,
+ .ignore_unhandled = true,
+ .arg[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &drhd_count,
+ .cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_parse_one_drhd,
+ .cb[ACPI_DMAR_TYPE_RESERVED_MEMORY] = &dmar_parse_one_rmrr,
+ .cb[ACPI_DMAR_TYPE_ROOT_ATS] = &dmar_parse_one_atsr,
+ .cb[ACPI_DMAR_TYPE_HARDWARE_AFFINITY] = &dmar_parse_one_rhsa,
+ .cb[ACPI_DMAR_TYPE_NAMESPACE] = &dmar_parse_one_andd,
+ };
+
+ /*
+ * Do it again; the earlier dmar_tbl mapping could have been done
+ * with the fixed map.
+ */
+ dmar_table_detect();
+
+ /*
+ * ACPI tables may not be DMA protected by tboot, so use the DMAR copy
+ * that SINIT saved in SinitMleData in the TXT heap (which is DMA protected)
+ */
+ dmar_tbl = tboot_get_dmar_table(dmar_tbl);
+
+ dmar = (struct acpi_table_dmar *)dmar_tbl;
+ if (!dmar)
+ return -ENODEV;
+
+ if (dmar->width < PAGE_SHIFT - 1) {
+ pr_warn("Invalid DMAR haw\n");
+ return -EINVAL;
+ }
+
+ pr_info("Host address width %d\n", dmar->width + 1);
+ ret = dmar_walk_dmar_table(dmar, &cb);
+ if (ret == 0 && drhd_count == 0)
+ pr_warn(FW_BUG "No DRHD structure found in DMAR table\n");
+
+ return ret;
+}
+
+static int dmar_pci_device_match(struct dmar_dev_scope devices[],
+ int cnt, struct pci_dev *dev)
+{
+ int index;
+ struct device *tmp;
+
+ while (dev) {
+ for_each_active_dev_scope(devices, cnt, index, tmp)
+ if (dev_is_pci(tmp) && dev == to_pci_dev(tmp))
+ return 1;
+
+ /* Check our parent */
+ dev = dev->bus->self;
+ }
+
+ return 0;
+}
+
+struct dmar_drhd_unit *
+dmar_find_matched_drhd_unit(struct pci_dev *dev)
+{
+ struct dmar_drhd_unit *dmaru;
+ struct acpi_dmar_hardware_unit *drhd;
+
+ dev = pci_physfn(dev);
+
+ rcu_read_lock();
+ for_each_drhd_unit(dmaru) {
+ drhd = container_of(dmaru->hdr,
+ struct acpi_dmar_hardware_unit,
+ header);
+
+ if (dmaru->include_all &&
+ drhd->segment == pci_domain_nr(dev->bus))
+ goto out;
+
+ if (dmar_pci_device_match(dmaru->devices,
+ dmaru->devices_cnt, dev))
+ goto out;
+ }
+ dmaru = NULL;
+out:
+ rcu_read_unlock();
+
+ return dmaru;
+}
+
+static void __init dmar_acpi_insert_dev_scope(u8 device_number,
+ struct acpi_device *adev)
+{
+ struct dmar_drhd_unit *dmaru;
+ struct acpi_dmar_hardware_unit *drhd;
+ struct acpi_dmar_device_scope *scope;
+ struct device *tmp;
+ int i;
+ struct acpi_dmar_pci_path *path;
+
+ for_each_drhd_unit(dmaru) {
+ drhd = container_of(dmaru->hdr,
+ struct acpi_dmar_hardware_unit,
+ header);
+
+ for (scope = (void *)(drhd + 1);
+ (unsigned long)scope < ((unsigned long)drhd) + drhd->header.length;
+ scope = ((void *)scope) + scope->length) {
+ if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_NAMESPACE)
+ continue;
+ if (scope->enumeration_id != device_number)
+ continue;
+
+ path = (void *)(scope + 1);
+ pr_info("ACPI device \"%s\" under DMAR at %llx as %02x:%02x.%d\n",
+ dev_name(&adev->dev), dmaru->reg_base_addr,
+ scope->bus, path->device, path->function);
+ for_each_dev_scope(dmaru->devices, dmaru->devices_cnt, i, tmp)
+ if (tmp == NULL) {
+ dmaru->devices[i].bus = scope->bus;
+ dmaru->devices[i].devfn = PCI_DEVFN(path->device,
+ path->function);
+ rcu_assign_pointer(dmaru->devices[i].dev,
+ get_device(&adev->dev));
+ return;
+ }
+ BUG_ON(i >= dmaru->devices_cnt);
+ }
+ }
+ pr_warn("No IOMMU scope found for ANDD enumeration ID %d (%s)\n",
+ device_number, dev_name(&adev->dev));
+}
+
+static int __init dmar_acpi_dev_scope_init(void)
+{
+ struct acpi_dmar_andd *andd;
+
+ if (dmar_tbl == NULL)
+ return -ENODEV;
+
+ for (andd = (void *)dmar_tbl + sizeof(struct acpi_table_dmar);
+ ((unsigned long)andd) < ((unsigned long)dmar_tbl) + dmar_tbl->length;
+ andd = ((void *)andd) + andd->header.length) {
+ if (andd->header.type == ACPI_DMAR_TYPE_NAMESPACE) {
+ acpi_handle h;
+ struct acpi_device *adev;
+
+ if (!ACPI_SUCCESS(acpi_get_handle(ACPI_ROOT_OBJECT,
+ andd->device_name,
+ &h))) {
+ pr_err("Failed to find handle for ACPI object %s\n",
+ andd->device_name);
+ continue;
+ }
+ if (acpi_bus_get_device(h, &adev)) {
+ pr_err("Failed to get device for ACPI object %s\n",
+ andd->device_name);
+ continue;
+ }
+ dmar_acpi_insert_dev_scope(andd->device_number, adev);
+ }
+ }
+ return 0;
+}
+
+int __init dmar_dev_scope_init(void)
+{
+ struct pci_dev *dev = NULL;
+ struct dmar_pci_notify_info *info;
+
+ if (dmar_dev_scope_status != 1)
+ return dmar_dev_scope_status;
+
+ if (list_empty(&dmar_drhd_units)) {
+ dmar_dev_scope_status = -ENODEV;
+ } else {
+ dmar_dev_scope_status = 0;
+
+ dmar_acpi_dev_scope_init();
+
+ for_each_pci_dev(dev) {
+ if (dev->is_virtfn)
+ continue;
+
+ info = dmar_alloc_pci_notify_info(dev,
+ BUS_NOTIFY_ADD_DEVICE);
+ if (!info) {
+ return dmar_dev_scope_status;
+ } else {
+ dmar_pci_bus_add_dev(info);
+ dmar_free_pci_notify_info(info);
+ }
+ }
+ }
+
+ return dmar_dev_scope_status;
+}
+
+void __init dmar_register_bus_notifier(void)
+{
+ bus_register_notifier(&pci_bus_type, &dmar_pci_bus_nb);
+}
+
+
+int __init dmar_table_init(void)
+{
+ static int dmar_table_initialized;
+ int ret;
+
+ if (dmar_table_initialized == 0) {
+ ret = parse_dmar_table();
+ if (ret < 0) {
+ if (ret != -ENODEV)
+ pr_info("Parse DMAR table failure.\n");
+ } else if (list_empty(&dmar_drhd_units)) {
+ pr_info("No DMAR devices found\n");
+ ret = -ENODEV;
+ }
+
+ if (ret < 0)
+ dmar_table_initialized = ret;
+ else
+ dmar_table_initialized = 1;
+ }
+
+ return dmar_table_initialized < 0 ? dmar_table_initialized : 0;
+}
+
+static void warn_invalid_dmar(u64 addr, const char *message)
+{
+ pr_warn_once(FW_BUG
+ "Your BIOS is broken; DMAR reported at address %llx%s!\n"
+ "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+ addr, message,
+ dmi_get_system_info(DMI_BIOS_VENDOR),
+ dmi_get_system_info(DMI_BIOS_VERSION),
+ dmi_get_system_info(DMI_PRODUCT_VERSION));
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+}
+
+static int __ref
+dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
+{
+ struct acpi_dmar_hardware_unit *drhd;
+ void __iomem *addr;
+ u64 cap, ecap;
+
+ drhd = (void *)entry;
+ if (!drhd->address) {
+ warn_invalid_dmar(0, "");
+ return -EINVAL;
+ }
+
+ if (arg)
+ addr = ioremap(drhd->address, VTD_PAGE_SIZE);
+ else
+ addr = early_ioremap(drhd->address, VTD_PAGE_SIZE);
+ if (!addr) {
+ pr_warn("Can't validate DRHD address: %llx\n", drhd->address);
+ return -EINVAL;
+ }
+
+ cap = dmar_readq(addr + DMAR_CAP_REG);
+ ecap = dmar_readq(addr + DMAR_ECAP_REG);
+
+ if (arg)
+ iounmap(addr);
+ else
+ early_iounmap(addr, VTD_PAGE_SIZE);
+
+ if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) {
+ warn_invalid_dmar(drhd->address, " returns all ones");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int __init detect_intel_iommu(void)
+{
+ int ret;
+ struct dmar_res_callback validate_drhd_cb = {
+ .cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_validate_one_drhd,
+ .ignore_unhandled = true,
+ };
+
+ down_write(&dmar_global_lock);
+ ret = dmar_table_detect();
+ if (!ret)
+ ret = dmar_walk_dmar_table((struct acpi_table_dmar *)dmar_tbl,
+ &validate_drhd_cb);
+ if (!ret && !no_iommu && !iommu_detected && !dmar_disabled) {
+ iommu_detected = 1;
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+ }
+
+#ifdef CONFIG_X86
+ if (!ret) {
+ x86_init.iommu.iommu_init = intel_iommu_init;
+ x86_platform.iommu_shutdown = intel_iommu_shutdown;
+ }
+
+#endif
+
+ if (dmar_tbl) {
+ acpi_put_table(dmar_tbl);
+ dmar_tbl = NULL;
+ }
+ up_write(&dmar_global_lock);
+
+ return ret ? ret : 1;
+}
+
+static void unmap_iommu(struct intel_iommu *iommu)
+{
+ iounmap(iommu->reg);
+ release_mem_region(iommu->reg_phys, iommu->reg_size);
+}
+
+/**
+ * map_iommu: map the iommu's registers
+ * @iommu: the iommu to map
+ * @phys_addr: the physical address of the base register
+ *
+ * Memory map the iommu's registers. Start with a single page, and
+ * possibly expand if that turns out to be insufficient.
+ */
+static int map_iommu(struct intel_iommu *iommu, u64 phys_addr)
+{
+ int map_size, err=0;
+
+ iommu->reg_phys = phys_addr;
+ iommu->reg_size = VTD_PAGE_SIZE;
+
+ if (!request_mem_region(iommu->reg_phys, iommu->reg_size, iommu->name)) {
+ pr_err("Can't reserve memory\n");
+ err = -EBUSY;
+ goto out;
+ }
+
+ iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size);
+ if (!iommu->reg) {
+ pr_err("Can't map the region\n");
+ err = -ENOMEM;
+ goto release;
+ }
+
+ iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
+ iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
+
+ if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) {
+ err = -EINVAL;
+ warn_invalid_dmar(phys_addr, " returns all ones");
+ goto unmap;
+ }
+ iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG);
+
+ /* the registers might be more than one page */
+ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
+ cap_max_fault_reg_offset(iommu->cap));
+ map_size = VTD_PAGE_ALIGN(map_size);
+ if (map_size > iommu->reg_size) {
+ iounmap(iommu->reg);
+ release_mem_region(iommu->reg_phys, iommu->reg_size);
+ iommu->reg_size = map_size;
+ if (!request_mem_region(iommu->reg_phys, iommu->reg_size,
+ iommu->name)) {
+ pr_err("Can't reserve memory\n");
+ err = -EBUSY;
+ goto out;
+ }
+ iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size);
+ if (!iommu->reg) {
+ pr_err("Can't map the region\n");
+ err = -ENOMEM;
+ goto release;
+ }
+ }
+ err = 0;
+ goto out;
+
+unmap:
+ iounmap(iommu->reg);
+release:
+ release_mem_region(iommu->reg_phys, iommu->reg_size);
+out:
+ return err;
+}
+
+static int dmar_alloc_seq_id(struct intel_iommu *iommu)
+{
+ iommu->seq_id = find_first_zero_bit(dmar_seq_ids,
+ DMAR_UNITS_SUPPORTED);
+ if (iommu->seq_id >= DMAR_UNITS_SUPPORTED) {
+ iommu->seq_id = -1;
+ } else {
+ set_bit(iommu->seq_id, dmar_seq_ids);
+ sprintf(iommu->name, "dmar%d", iommu->seq_id);
+ }
+
+ return iommu->seq_id;
+}
+
+static void dmar_free_seq_id(struct intel_iommu *iommu)
+{
+ if (iommu->seq_id >= 0) {
+ clear_bit(iommu->seq_id, dmar_seq_ids);
+ iommu->seq_id = -1;
+ }
+}
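The seq_id allocator is a small bitmap: find the first clear bit, set it, and the bit index becomes the dmarN name; freeing clears the bit again. A standalone sketch with a 64-slot bitmap (the driver uses find_first_zero_bit() over DMAR_UNITS_SUPPORTED bits):

#include <stdint.h>
#include <stdio.h>

#define UNITS_SUPPORTED 64	/* demo-sized bitmap */

static uint64_t seq_ids;

static int alloc_seq_id(void)
{
	int id;

	for (id = 0; id < UNITS_SUPPORTED; id++) {
		if (!(seq_ids & (1ULL << id))) {
			seq_ids |= 1ULL << id;	/* claim the first free slot */
			return id;
		}
	}
	return -1;
}

static void free_seq_id(int id)
{
	if (id >= 0)
		seq_ids &= ~(1ULL << id);
}

int main(void)
{
	int a = alloc_seq_id(), b = alloc_seq_id();

	printf("dmar%d dmar%d\n", a, b);	/* dmar0 dmar1 */
	free_seq_id(a);
	printf("dmar%d\n", alloc_seq_id());	/* dmar0 is reused */
	return 0;
}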
+
+static int alloc_iommu(struct dmar_drhd_unit *drhd)
+{
+ struct intel_iommu *iommu;
+ u32 ver, sts;
+ int agaw = 0;
+ int msagaw = 0;
+ int err;
+
+ if (!drhd->reg_base_addr) {
+ warn_invalid_dmar(0, "");
+ return -EINVAL;
+ }
+
+ iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+ if (!iommu)
+ return -ENOMEM;
+
+ if (dmar_alloc_seq_id(iommu) < 0) {
+ pr_err("Failed to allocate seq_id\n");
+ err = -ENOSPC;
+ goto error;
+ }
+
+ err = map_iommu(iommu, drhd->reg_base_addr);
+ if (err) {
+ pr_err("Failed to map %s\n", iommu->name);
+ goto error_free_seq_id;
+ }
+
+ err = -EINVAL;
+ agaw = iommu_calculate_agaw(iommu);
+ if (agaw < 0) {
+ pr_err("Cannot get a valid agaw for iommu (seq_id = %d)\n",
+ iommu->seq_id);
+ goto err_unmap;
+ }
+ msagaw = iommu_calculate_max_sagaw(iommu);
+ if (msagaw < 0) {
+ pr_err("Cannot get a valid max agaw for iommu (seq_id = %d)\n",
+ iommu->seq_id);
+ goto err_unmap;
+ }
+ iommu->agaw = agaw;
+ iommu->msagaw = msagaw;
+ iommu->segment = drhd->segment;
+
+ iommu->node = NUMA_NO_NODE;
+
+ ver = readl(iommu->reg + DMAR_VER_REG);
+ pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
+ iommu->name,
+ (unsigned long long)drhd->reg_base_addr,
+ DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
+ (unsigned long long)iommu->cap,
+ (unsigned long long)iommu->ecap);
+
+ /* Reflect status in gcmd */
+ sts = readl(iommu->reg + DMAR_GSTS_REG);
+ if (sts & DMA_GSTS_IRES)
+ iommu->gcmd |= DMA_GCMD_IRE;
+ if (sts & DMA_GSTS_TES)
+ iommu->gcmd |= DMA_GCMD_TE;
+ if (sts & DMA_GSTS_QIES)
+ iommu->gcmd |= DMA_GCMD_QIE;
+
+ raw_spin_lock_init(&iommu->register_lock);
+
+ if (intel_iommu_enabled) {
+ err = iommu_device_sysfs_add(&iommu->iommu, NULL,
+ intel_iommu_groups,
+ "%s", iommu->name);
+ if (err)
+ goto err_unmap;
+
+ iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
+
+ err = iommu_device_register(&iommu->iommu);
+ if (err)
+ goto err_unmap;
+ }
+
+ drhd->iommu = iommu;
+
+ return 0;
+
+err_unmap:
+ unmap_iommu(iommu);
+error_free_seq_id:
+ dmar_free_seq_id(iommu);
+error:
+ kfree(iommu);
+ return err;
+}
+
+static void free_iommu(struct intel_iommu *iommu)
+{
+ if (intel_iommu_enabled) {
+ iommu_device_unregister(&iommu->iommu);
+ iommu_device_sysfs_remove(&iommu->iommu);
+ }
+
+ if (iommu->irq) {
+ if (iommu->pr_irq) {
+ free_irq(iommu->pr_irq, iommu);
+ dmar_free_hwirq(iommu->pr_irq);
+ iommu->pr_irq = 0;
+ }
+ free_irq(iommu->irq, iommu);
+ dmar_free_hwirq(iommu->irq);
+ iommu->irq = 0;
+ }
+
+ if (iommu->qi) {
+ free_page((unsigned long)iommu->qi->desc);
+ kfree(iommu->qi->desc_status);
+ kfree(iommu->qi);
+ }
+
+ if (iommu->reg)
+ unmap_iommu(iommu);
+
+ dmar_free_seq_id(iommu);
+ kfree(iommu);
+}
+
+/*
+ * Reclaim all the submitted descriptors which have completed their work.
+ */
+static inline void reclaim_free_desc(struct q_inval *qi)
+{
+ while (qi->desc_status[qi->free_tail] == QI_DONE ||
+ qi->desc_status[qi->free_tail] == QI_ABORT) {
+ qi->desc_status[qi->free_tail] = QI_FREE;
+ qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
+ qi->free_cnt++;
+ }
+}
+
+static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
+{
+ u32 fault;
+ int head, tail;
+ struct q_inval *qi = iommu->qi;
+ int shift = qi_shift(iommu);
+
+ if (qi->desc_status[wait_index] == QI_ABORT)
+ return -EAGAIN;
+
+ fault = readl(iommu->reg + DMAR_FSTS_REG);
+
+ /*
+ * If IQE happens, the head points to the descriptor associated
+ * with the error. No new descriptors are fetched until the IQE
+ * is cleared.
+ */
+ if (fault & DMA_FSTS_IQE) {
+ head = readl(iommu->reg + DMAR_IQH_REG);
+ if ((head >> shift) == index) {
+ struct qi_desc *desc = qi->desc + head;
+
+ /*
+ * desc->qw2 and desc->qw3 are either reserved or
+ * used by software as private data. We won't print
+ * out these two qw's for security reasons.
+ */
+ pr_err("VT-d detected invalid descriptor: qw0 = %llx, qw1 = %llx\n",
+ (unsigned long long)desc->qw0,
+ (unsigned long long)desc->qw1);
+ memcpy(desc, qi->desc + (wait_index << shift),
+ 1 << shift);
+ writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG);
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * If ITE happens, all pending wait_desc commands are aborted.
+ * No new descriptors are fetched until the ITE is cleared.
+ */
+ if (fault & DMA_FSTS_ITE) {
+ head = readl(iommu->reg + DMAR_IQH_REG);
+ head = ((head >> shift) - 1 + QI_LENGTH) % QI_LENGTH;
+ head |= 1;
+ tail = readl(iommu->reg + DMAR_IQT_REG);
+ tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH;
+
+ writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
+
+ do {
+ if (qi->desc_status[head] == QI_IN_USE)
+ qi->desc_status[head] = QI_ABORT;
+ head = (head - 2 + QI_LENGTH) % QI_LENGTH;
+ } while (head != tail);
+
+ if (qi->desc_status[wait_index] == QI_ABORT)
+ return -EAGAIN;
+ }
+
+ if (fault & DMA_FSTS_ICE)
+ writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
+
+ return 0;
+}
+
+/*
+ * Function to submit invalidation descriptors of all types to the queued
+ * invalidation interface (QI). Multiple descriptors can be submitted at a
+ * time; a wait descriptor is appended to each submission to ensure that
+ * hardware has completed the invalidation before returning. Wait descriptors
+ * can be part of the submission, but they will not be polled for completion.
+ */
+int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
+ unsigned int count, unsigned long options)
+{
+ struct q_inval *qi = iommu->qi;
+ struct qi_desc wait_desc;
+ int wait_index, index;
+ unsigned long flags;
+ int offset, shift;
+ int rc, i;
+
+ if (!qi)
+ return 0;
+
+restart:
+ rc = 0;
+
+ raw_spin_lock_irqsave(&qi->q_lock, flags);
+ /*
+ * Check if we have enough empty slots in the queue to submit,
+ * the calculation is based on:
+ * # of desc + 1 wait desc + 1 space between head and tail
+ */
+ while (qi->free_cnt < count + 2) {
+ raw_spin_unlock_irqrestore(&qi->q_lock, flags);
+ cpu_relax();
+ raw_spin_lock_irqsave(&qi->q_lock, flags);
+ }
+
+ index = qi->free_head;
+ wait_index = (index + count) % QI_LENGTH;
+ shift = qi_shift(iommu);
+
+ for (i = 0; i < count; i++) {
+ offset = ((index + i) % QI_LENGTH) << shift;
+ memcpy(qi->desc + offset, &desc[i], 1 << shift);
+ qi->desc_status[(index + i) % QI_LENGTH] = QI_IN_USE;
+ }
+ qi->desc_status[wait_index] = QI_IN_USE;
+
+ wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
+ QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
+ if (options & QI_OPT_WAIT_DRAIN)
+ wait_desc.qw0 |= QI_IWD_PRQ_DRAIN;
+ wait_desc.qw1 = virt_to_phys(&qi->desc_status[wait_index]);
+ wait_desc.qw2 = 0;
+ wait_desc.qw3 = 0;
+
+ offset = wait_index << shift;
+ memcpy(qi->desc + offset, &wait_desc, 1 << shift);
+
+ qi->free_head = (qi->free_head + count + 1) % QI_LENGTH;
+ qi->free_cnt -= count + 1;
+
+ /*
+ * update the HW tail register indicating the presence of
+ * new descriptors.
+ */
+ writel(qi->free_head << shift, iommu->reg + DMAR_IQT_REG);
+
+ while (qi->desc_status[wait_index] != QI_DONE) {
+ /*
+ * We leave interrupts disabled to prevent the interrupt context
+ * from queueing another cmd while a cmd is already submitted and
+ * waiting for completion on this cpu. This avoids a deadlock where
+ * the interrupt context could wait indefinitely for free slots in
+ * the queue.
+ */
+ rc = qi_check_fault(iommu, index, wait_index);
+ if (rc)
+ break;
+
+ raw_spin_unlock(&qi->q_lock);
+ cpu_relax();
+ raw_spin_lock(&qi->q_lock);
+ }
+
+ for (i = 0; i < count; i++)
+ qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE;
+
+ reclaim_free_desc(qi);
+ raw_spin_unlock_irqrestore(&qi->q_lock, flags);
+
+ if (rc == -EAGAIN)
+ goto restart;
+
+ return rc;
+}
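The ring accounting in qi_submit_sync() reserves count descriptors plus one wait descriptor plus one empty slot between head and tail. A worked sketch of the index arithmetic, assuming a QI_LENGTH of 256:

#include <stdio.h>

#define QI_LENGTH 256	/* assumed queue depth */

int main(void)
{
	int free_head = 250, free_cnt = QI_LENGTH, count = 4;
	int wait_index, index;

	/* need count descriptors + 1 wait descriptor + 1 head/tail gap */
	if (free_cnt < count + 2)
		return 1;	/* the driver would spin until slots free up */

	index = free_head;
	wait_index = (index + count) % QI_LENGTH;		/* wraps to 254 */
	free_head = (free_head + count + 1) % QI_LENGTH;	/* wraps to 255 */
	free_cnt -= count + 1;					/* 251 left */

	printf("desc start %d, wait at %d, new head %d, free %d\n",
	       index, wait_index, free_head, free_cnt);
	return 0;
}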
+
+/*
+ * Flush the global interrupt entry cache.
+ */
+void qi_global_iec(struct intel_iommu *iommu)
+{
+ struct qi_desc desc;
+
+ desc.qw0 = QI_IEC_TYPE;
+ desc.qw1 = 0;
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ /* should never fail */
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
+ u64 type)
+{
+ struct qi_desc desc;
+
+ desc.qw0 = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did)
+ | QI_CC_GRAN(type) | QI_CC_TYPE;
+ desc.qw1 = 0;
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+ unsigned int size_order, u64 type)
+{
+ u8 dw = 0, dr = 0;
+
+ struct qi_desc desc;
+ int ih = 0;
+
+ if (cap_write_drain(iommu->cap))
+ dw = 1;
+
+ if (cap_read_drain(iommu->cap))
+ dr = 1;
+
+ desc.qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw)
+ | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
+ desc.qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
+ | QI_IOTLB_AM(size_order);
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u16 qdep, u64 addr, unsigned mask)
+{
+ struct qi_desc desc;
+
+ if (mask) {
+ addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
+ desc.qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
+ } else
+ desc.qw1 = QI_DEV_IOTLB_ADDR(addr);
+
+ if (qdep >= QI_DEV_IOTLB_MAX_INVS)
+ qdep = 0;
+
+ desc.qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
+ QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid);
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+/* PASID-based IOTLB invalidation */
+void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
+ unsigned long npages, bool ih)
+{
+ struct qi_desc desc = {.qw2 = 0, .qw3 = 0};
+
+ /*
+ * npages == -1 means a PASID-selective invalidation; otherwise, a
+ * positive value requests a Page-selective-within-PASID invalidation.
+ * 0 is not a valid input.
+ */
+ if (WARN_ON(!npages)) {
+ pr_err("Invalid input npages = %ld\n", npages);
+ return;
+ }
+
+ if (npages == -1) {
+ desc.qw0 = QI_EIOTLB_PASID(pasid) |
+ QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
+ QI_EIOTLB_TYPE;
+ desc.qw1 = 0;
+ } else {
+ int mask = ilog2(__roundup_pow_of_two(npages));
+ unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask));
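+ /*
+ * For example, npages == 3 rounds up to 4 pages, giving mask == 2
+ * and, with 4KB pages, a 16KB alignment/invalidation granule for
+ * the AM field below.
+ */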
+
+ if (WARN_ON_ONCE(!IS_ALIGNED(addr, align)))
+ addr &= ~(align - 1);
+
+ desc.qw0 = QI_EIOTLB_PASID(pasid) |
+ QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
+ QI_EIOTLB_TYPE;
+ desc.qw1 = QI_EIOTLB_ADDR(addr) |
+ QI_EIOTLB_IH(ih) |
+ QI_EIOTLB_AM(mask);
+ }
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+/* PASID-based device IOTLB Invalidate */
+void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u32 pasid, u16 qdep, u64 addr,
+ unsigned int size_order, u64 granu)
+{
+ unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1);
+ struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0};
+
+ desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
+ QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
+ QI_DEV_IOTLB_PFSID(pfsid);
+ desc.qw1 = QI_DEV_EIOTLB_GLOB(granu);
+
+ /*
+ * If the S bit is 0, only a single page is flushed. If the S bit is
+ * set, the least significant zero bit of the address indicates the
+ * invalidation range (VT-d spec 6.5.2.6), e.g. a zero in address
+ * bit 12 indicates 8KB and in bit 13 indicates 16KB; size_order == 0
+ * means a single 4KB page.
+ * Max Invs Pending (MIP) is set to 0 for now until we have DIT in
+ * ECAP.
+ */
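+ /*
+ * For example, size_order == 1 clears address bit 12 and sets the
+ * S bit below, which per the encoding above means an 8KB range.
+ */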
+ desc.qw1 |= addr & ~mask;
+ if (size_order)
+ desc.qw1 |= QI_DEV_EIOTLB_SIZE;
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did,
+ u64 granu, int pasid)
+{
+ struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0};
+
+ desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) |
+ QI_PC_GRAN(granu) | QI_PC_TYPE;
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+/*
+ * Disable Queued Invalidation interface.
+ */
+void dmar_disable_qi(struct intel_iommu *iommu)
+{
+ unsigned long flags;
+ u32 sts;
+ cycles_t start_time = get_cycles();
+
+ if (!ecap_qis(iommu->ecap))
+ return;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ sts = readl(iommu->reg + DMAR_GSTS_REG);
+ if (!(sts & DMA_GSTS_QIES))
+ goto end;
+
+ /*
+ * Give a chance to HW to complete the pending invalidation requests.
+ */
+ while ((readl(iommu->reg + DMAR_IQT_REG) !=
+ readl(iommu->reg + DMAR_IQH_REG)) &&
+ (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time)))
+ cpu_relax();
+
+ iommu->gcmd &= ~DMA_GCMD_QIE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
+ !(sts & DMA_GSTS_QIES), sts);
+end:
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+/*
+ * Enable queued invalidation.
+ */
+static void __dmar_enable_qi(struct intel_iommu *iommu)
+{
+ u32 sts;
+ unsigned long flags;
+ struct q_inval *qi = iommu->qi;
+ u64 val = virt_to_phys(qi->desc);
+
+ qi->free_head = qi->free_tail = 0;
+ qi->free_cnt = QI_LENGTH;
+
+ /*
+ * Set DW=1 and QS=1 in IQA_REG when Scalable Mode capability
+ * is present.
+ */
+ if (ecap_smts(iommu->ecap))
+ val |= (1 << 11) | 1;
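+ /*
+ * In the value above, bit 11 is DW (256-bit descriptors) and the low
+ * bits are QS; QS=1 selects an 8KB (two page) queue, matching the
+ * order-1 allocation in dmar_enable_qi().
+ */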
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ /* write zero to the tail reg */
+ writel(0, iommu->reg + DMAR_IQT_REG);
+
+ dmar_writeq(iommu->reg + DMAR_IQA_REG, val);
+
+ iommu->gcmd |= DMA_GCMD_QIE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+/*
+ * Enable Queued Invalidation interface. This is a must to support
+ * interrupt-remapping. Also used by DMA-remapping, which replaces
+ * register based IOTLB invalidation.
+ */
+int dmar_enable_qi(struct intel_iommu *iommu)
+{
+ struct q_inval *qi;
+ struct page *desc_page;
+
+ if (!ecap_qis(iommu->ecap))
+ return -ENOENT;
+
+ /*
+ * queued invalidation is already setup and enabled.
+ */
+ if (iommu->qi)
+ return 0;
+
+ iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC);
+ if (!iommu->qi)
+ return -ENOMEM;
+
+ qi = iommu->qi;
+
+ /*
+ * Need two pages to accommodate 256 descriptors of 256 bits each
+ * if the remapping hardware supports scalable mode translation.
+ */
+ desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO,
+ !!ecap_smts(iommu->ecap));
+ if (!desc_page) {
+ kfree(qi);
+ iommu->qi = NULL;
+ return -ENOMEM;
+ }
+
+ qi->desc = page_address(desc_page);
+
+ qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC);
+ if (!qi->desc_status) {
+ free_page((unsigned long) qi->desc);
+ kfree(qi);
+ iommu->qi = NULL;
+ return -ENOMEM;
+ }
+
+ raw_spin_lock_init(&qi->q_lock);
+
+ __dmar_enable_qi(iommu);
+
+ return 0;
+}
+
+/* iommu interrupt handling. Most stuff are MSI-like. */
+
+enum faulttype {
+ DMA_REMAP,
+ INTR_REMAP,
+ UNKNOWN,
+};
+
+static const char *dma_remap_fault_reasons[] =
+{
+ "Software",
+ "Present bit in root entry is clear",
+ "Present bit in context entry is clear",
+ "Invalid context entry",
+ "Access beyond MGAW",
+ "PTE Write access is not set",
+ "PTE Read access is not set",
+ "Next page table ptr is invalid",
+ "Root table address invalid",
+ "Context table ptr is invalid",
+ "non-zero reserved fields in RTP",
+ "non-zero reserved fields in CTP",
+ "non-zero reserved fields in PTE",
+ "PCE for translation request specifies blocking",
+};
+
+static const char * const dma_remap_sm_fault_reasons[] = {
+ "SM: Invalid Root Table Address",
+ "SM: TTM 0 for request with PASID",
+ "SM: TTM 0 for page group request",
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x33-0x37 */
+ "SM: Error attempting to access Root Entry",
+ "SM: Present bit in Root Entry is clear",
+ "SM: Non-zero reserved field set in Root Entry",
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x3B-0x3F */
+ "SM: Error attempting to access Context Entry",
+ "SM: Present bit in Context Entry is clear",
+ "SM: Non-zero reserved field set in the Context Entry",
+ "SM: Invalid Context Entry",
+ "SM: DTE field in Context Entry is clear",
+ "SM: PASID Enable field in Context Entry is clear",
+ "SM: PASID is larger than the max in Context Entry",
+ "SM: PRE field in Context-Entry is clear",
+ "SM: RID_PASID field error in Context-Entry",
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x49-0x4F */
+ "SM: Error attempting to access the PASID Directory Entry",
+ "SM: Present bit in Directory Entry is clear",
+ "SM: Non-zero reserved field set in PASID Directory Entry",
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x53-0x57 */
+ "SM: Error attempting to access PASID Table Entry",
+ "SM: Present bit in PASID Table Entry is clear",
+ "SM: Non-zero reserved field set in PASID Table Entry",
+ "SM: Invalid Scalable-Mode PASID Table Entry",
+ "SM: ERE field is clear in PASID Table Entry",
+ "SM: SRE field is clear in PASID Table Entry",
+ "Unknown", "Unknown",/* 0x5E-0x5F */
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x60-0x67 */
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x68-0x6F */
+ "SM: Error attempting to access first-level paging entry",
+ "SM: Present bit in first-level paging entry is clear",
+ "SM: Non-zero reserved field set in first-level paging entry",
+ "SM: Error attempting to access FL-PML4 entry",
+ "SM: First-level entry address beyond MGAW in Nested translation",
+ "SM: Read permission error in FL-PML4 entry in Nested translation",
+ "SM: Read permission error in first-level paging entry in Nested translation",
+ "SM: Write permission error in first-level paging entry in Nested translation",
+ "SM: Error attempting to access second-level paging entry",
+ "SM: Read/Write permission error in second-level paging entry",
+ "SM: Non-zero reserved field set in second-level paging entry",
+ "SM: Invalid second-level page table pointer",
+ "SM: A/D bit update needed in second-level entry when set up in no snoop",
+ "Unknown", "Unknown", "Unknown", /* 0x7D-0x7F */
+ "SM: Address in first-level translation is not canonical",
+ "SM: U/S set 0 for first-level translation with user privilege",
+ "SM: No execute permission for request with PASID and ER=1",
+ "SM: Address beyond the DMA hardware max",
+ "SM: Second-level entry address beyond the max",
+ "SM: No write permission for Write/AtomicOp request",
+ "SM: No read permission for Read/AtomicOp request",
+ "SM: Invalid address-interrupt address",
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x88-0x8F */
+ "SM: A/D bit update needed in first-level entry when set up in no snoop",
+};
+
+static const char *irq_remap_fault_reasons[] =
+{
+ "Detected reserved fields in the decoded interrupt-remapped request",
+ "Interrupt index exceeded the interrupt-remapping table size",
+ "Present field in the IRTE entry is clear",
+ "Error accessing interrupt-remapping table pointed by IRTA_REG",
+ "Detected reserved fields in the IRTE entry",
+ "Blocked a compatibility format interrupt request",
+ "Blocked an interrupt request due to source-id verification failure",
+};
+
+static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
+{
+ if (fault_reason >= 0x20 && (fault_reason - 0x20 <
+ ARRAY_SIZE(irq_remap_fault_reasons))) {
+ *fault_type = INTR_REMAP;
+ return irq_remap_fault_reasons[fault_reason - 0x20];
+ } else if (fault_reason >= 0x30 && (fault_reason - 0x30 <
+ ARRAY_SIZE(dma_remap_sm_fault_reasons))) {
+ *fault_type = DMA_REMAP;
+ return dma_remap_sm_fault_reasons[fault_reason - 0x30];
+ } else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
+ *fault_type = DMA_REMAP;
+ return dma_remap_fault_reasons[fault_reason];
+ } else {
+ *fault_type = UNKNOWN;
+ return "Unknown";
+ }
+}
+
+
+static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq)
+{
+ if (iommu->irq == irq)
+ return DMAR_FECTL_REG;
+ else if (iommu->pr_irq == irq)
+ return DMAR_PECTL_REG;
+ else
+ BUG();
+}
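+
+/*
+ * The corresponding data and address registers sit at fixed offsets
+ * (+4, +8 and +12) from the control register returned above, which is
+ * what dmar_msi_write() and dmar_msi_read() below rely on.
+ */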
+
+void dmar_msi_unmask(struct irq_data *data)
+{
+ struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
+ int reg = dmar_msi_reg(iommu, data->irq);
+ unsigned long flag;
+
+ /* unmask it */
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(0, iommu->reg + reg);
+ /* Read back the register to flush the posted write */
+ readl(iommu->reg + reg);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_mask(struct irq_data *data)
+{
+ struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
+ int reg = dmar_msi_reg(iommu, data->irq);
+ unsigned long flag;
+
+ /* mask it */
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(DMA_FECTL_IM, iommu->reg + reg);
+ /* Read back the register to flush the posted write */
+ readl(iommu->reg + reg);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_write(int irq, struct msi_msg *msg)
+{
+ struct intel_iommu *iommu = irq_get_handler_data(irq);
+ int reg = dmar_msi_reg(iommu, irq);
+ unsigned long flag;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(msg->data, iommu->reg + reg + 4);
+ writel(msg->address_lo, iommu->reg + reg + 8);
+ writel(msg->address_hi, iommu->reg + reg + 12);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_read(int irq, struct msi_msg *msg)
+{
+ struct intel_iommu *iommu = irq_get_handler_data(irq);
+ int reg = dmar_msi_reg(iommu, irq);
+ unsigned long flag;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ msg->data = readl(iommu->reg + reg + 4);
+ msg->address_lo = readl(iommu->reg + reg + 8);
+ msg->address_hi = readl(iommu->reg + reg + 12);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
+ u8 fault_reason, int pasid, u16 source_id,
+ unsigned long long addr)
+{
+ const char *reason;
+ int fault_type;
+
+ reason = dmar_get_fault_reason(fault_reason, &fault_type);
+
+ if (fault_type == INTR_REMAP)
+ pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index %llx [fault reason %02d] %s\n",
+ source_id >> 8, PCI_SLOT(source_id & 0xFF),
+ PCI_FUNC(source_id & 0xFF), addr >> 48,
+ fault_reason, reason);
+ else
+ pr_err("[%s] Request device [%02x:%02x.%d] PASID %x fault addr %llx [fault reason %02d] %s\n",
+ type ? "DMA Read" : "DMA Write",
+ source_id >> 8, PCI_SLOT(source_id & 0xFF),
+ PCI_FUNC(source_id & 0xFF), pasid, addr,
+ fault_reason, reason);
+ return 0;
+}
+
+#define PRIMARY_FAULT_REG_LEN (16)
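+/*
+ * Each 16-byte fault recording register is read below as: the low
+ * qword holds the faulting page address, the dword at offset 8 the
+ * source-id and PASID-present bits, and the dword at offset 12 the
+ * fault reason, request type, PASID value and the F (fault) bit.
+ */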
+irqreturn_t dmar_fault(int irq, void *dev_id)
+{
+ struct intel_iommu *iommu = dev_id;
+ int reg, fault_index;
+ u32 fault_status;
+ unsigned long flag;
+ static DEFINE_RATELIMIT_STATE(rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+ if (fault_status && __ratelimit(&rs))
+ pr_err("DRHD: handling fault status reg %x\n", fault_status);
+
+ /* TBD: ignore advanced fault log currently */
+ if (!(fault_status & DMA_FSTS_PPF))
+ goto unlock_exit;
+
+ fault_index = dma_fsts_fault_record_index(fault_status);
+ reg = cap_fault_reg_offset(iommu->cap);
+ while (1) {
+ /* Disable printing, simply clear the fault when ratelimited */
+ bool ratelimited = !__ratelimit(&rs);
+ u8 fault_reason;
+ u16 source_id;
+ u64 guest_addr;
+ int type, pasid;
+ u32 data;
+ bool pasid_present;
+
+ /* highest 32 bits */
+ data = readl(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 12);
+ if (!(data & DMA_FRCD_F))
+ break;
+
+ if (!ratelimited) {
+ fault_reason = dma_frcd_fault_reason(data);
+ type = dma_frcd_type(data);
+
+ pasid = dma_frcd_pasid_value(data);
+ data = readl(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 8);
+ source_id = dma_frcd_source_id(data);
+
+ pasid_present = dma_frcd_pasid_present(data);
+ guest_addr = dmar_readq(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN);
+ guest_addr = dma_frcd_page_addr(guest_addr);
+ }
+
+ /* clear the fault */
+ writel(DMA_FRCD_F, iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 12);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ if (!ratelimited)
+ /* Use pasid -1 if pasid is not present */
+ dmar_fault_do_one(iommu, type, fault_reason,
+ pasid_present ? pasid : -1,
+ source_id, guest_addr);
+
+ fault_index++;
+ if (fault_index >= cap_num_fault_regs(iommu->cap))
+ fault_index = 0;
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ }
+
+ writel(DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_PRO,
+ iommu->reg + DMAR_FSTS_REG);
+
+unlock_exit:
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ return IRQ_HANDLED;
+}
+
+int dmar_set_interrupt(struct intel_iommu *iommu)
+{
+ int irq, ret;
+
+ /*
+ * Check if the fault interrupt is already initialized.
+ */
+ if (iommu->irq)
+ return 0;
+
+ irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
+ if (irq > 0) {
+ iommu->irq = irq;
+ } else {
+ pr_err("No free IRQ vectors\n");
+ return -EINVAL;
+ }
+
+ ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu);
+ if (ret)
+ pr_err("Can't request irq\n");
+ return ret;
+}
+
+int __init enable_drhd_fault_handling(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+
+ /*
+ * Enable fault control interrupt.
+ */
+ for_each_iommu(iommu, drhd) {
+ u32 fault_status;
+ int ret = dmar_set_interrupt(iommu);
+
+ if (ret) {
+ pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n",
+ (unsigned long long)drhd->reg_base_addr, ret);
+ return -1;
+ }
+
+ /*
+ * Clear any previous faults.
+ */
+ dmar_fault(iommu->irq, iommu);
+ fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+ writel(fault_status, iommu->reg + DMAR_FSTS_REG);
+ }
+
+ return 0;
+}
+
+/*
+ * Re-enable Queued Invalidation interface.
+ */
+int dmar_reenable_qi(struct intel_iommu *iommu)
+{
+ if (!ecap_qis(iommu->ecap))
+ return -ENOENT;
+
+ if (!iommu->qi)
+ return -ENOENT;
+
+ /*
+ * First disable queued invalidation.
+ */
+ dmar_disable_qi(iommu);
+ /*
+ * Then enable queued invalidation again. Since there is no pending
+ * invalidation requests now, it's safe to re-enable queued
+ * invalidation.
+ */
+ __dmar_enable_qi(iommu);
+
+ return 0;
+}
+
+/*
+ * Check interrupt remapping support in DMAR table description.
+ */
+int __init dmar_ir_support(void)
+{
+ struct acpi_table_dmar *dmar;
+ dmar = (struct acpi_table_dmar *)dmar_tbl;
+ if (!dmar)
+ return 0;
+ return dmar->flags & 0x1;
+}
+
+/* Check whether DMAR units are in use */
+static inline bool dmar_in_use(void)
+{
+ return irq_remapping_enabled || intel_iommu_enabled;
+}
+
+static int __init dmar_free_unused_resources(void)
+{
+ struct dmar_drhd_unit *dmaru, *dmaru_n;
+
+ if (dmar_in_use())
+ return 0;
+
+ if (dmar_dev_scope_status != 1 && !list_empty(&dmar_drhd_units))
+ bus_unregister_notifier(&pci_bus_type, &dmar_pci_bus_nb);
+
+ down_write(&dmar_global_lock);
+ list_for_each_entry_safe(dmaru, dmaru_n, &dmar_drhd_units, list) {
+ list_del(&dmaru->list);
+ dmar_free_drhd(dmaru);
+ }
+ up_write(&dmar_global_lock);
+
+ return 0;
+}
+
+late_initcall(dmar_free_unused_resources);
+IOMMU_INIT_POST(detect_intel_iommu);
+
+/*
+ * DMAR Hotplug Support
+ * For more details, please refer to Intel(R) Virtualization Technology
+ * for Directed-IO Architecture Specification, Rev 2.2, Section 8.8
+ * "Remapping Hardware Unit Hot Plug".
+ */
+static guid_t dmar_hp_guid =
+ GUID_INIT(0xD8C1A3A6, 0xBE9B, 0x4C9B,
+ 0x91, 0xBF, 0xC3, 0xCB, 0x81, 0xFC, 0x5D, 0xAF);
+
+/*
+ * Currently there's only one revision and BIOS will not check the revision id,
+ * so use 0 for safety.
+ */
+#define DMAR_DSM_REV_ID 0
+#define DMAR_DSM_FUNC_DRHD 1
+#define DMAR_DSM_FUNC_ATSR 2
+#define DMAR_DSM_FUNC_RHSA 3
+
+static inline bool dmar_detect_dsm(acpi_handle handle, int func)
+{
+ return acpi_check_dsm(handle, &dmar_hp_guid, DMAR_DSM_REV_ID, 1 << func);
+}
+
+static int dmar_walk_dsm_resource(acpi_handle handle, int func,
+ dmar_res_handler_t handler, void *arg)
+{
+ int ret = -ENODEV;
+ union acpi_object *obj;
+ struct acpi_dmar_header *start;
+ struct dmar_res_callback callback;
+ static int res_type[] = {
+ [DMAR_DSM_FUNC_DRHD] = ACPI_DMAR_TYPE_HARDWARE_UNIT,
+ [DMAR_DSM_FUNC_ATSR] = ACPI_DMAR_TYPE_ROOT_ATS,
+ [DMAR_DSM_FUNC_RHSA] = ACPI_DMAR_TYPE_HARDWARE_AFFINITY,
+ };
+
+ if (!dmar_detect_dsm(handle, func))
+ return 0;
+
+ obj = acpi_evaluate_dsm_typed(handle, &dmar_hp_guid, DMAR_DSM_REV_ID,
+ func, NULL, ACPI_TYPE_BUFFER);
+ if (!obj)
+ return -ENODEV;
+
+ memset(&callback, 0, sizeof(callback));
+ callback.cb[res_type[func]] = handler;
+ callback.arg[res_type[func]] = arg;
+ start = (struct acpi_dmar_header *)obj->buffer.pointer;
+ ret = dmar_walk_remapping_entries(start, obj->buffer.length, &callback);
+
+ ACPI_FREE(obj);
+
+ return ret;
+}
+
+static int dmar_hp_add_drhd(struct acpi_dmar_header *header, void *arg)
+{
+ int ret;
+ struct dmar_drhd_unit *dmaru;
+
+ dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
+ if (!dmaru)
+ return -ENODEV;
+
+ ret = dmar_ir_hotplug(dmaru, true);
+ if (ret == 0)
+ ret = dmar_iommu_hotplug(dmaru, true);
+
+ return ret;
+}
+
+static int dmar_hp_remove_drhd(struct acpi_dmar_header *header, void *arg)
+{
+ int i, ret;
+ struct device *dev;
+ struct dmar_drhd_unit *dmaru;
+
+ dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
+ if (!dmaru)
+ return 0;
+
+ /*
+ * All PCI devices managed by this unit should have been destroyed.
+ */
+ if (!dmaru->include_all && dmaru->devices && dmaru->devices_cnt) {
+ for_each_active_dev_scope(dmaru->devices,
+ dmaru->devices_cnt, i, dev)
+ return -EBUSY;
+ }
+
+ ret = dmar_ir_hotplug(dmaru, false);
+ if (ret == 0)
+ ret = dmar_iommu_hotplug(dmaru, false);
+
+ return ret;
+}
+
+static int dmar_hp_release_drhd(struct acpi_dmar_header *header, void *arg)
+{
+ struct dmar_drhd_unit *dmaru;
+
+ dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
+ if (dmaru) {
+ list_del_rcu(&dmaru->list);
+ synchronize_rcu();
+ dmar_free_drhd(dmaru);
+ }
+
+ return 0;
+}
+
+static int dmar_hotplug_insert(acpi_handle handle)
+{
+ int ret;
+ int drhd_count = 0;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_validate_one_drhd, (void *)1);
+ if (ret)
+ goto out;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_parse_one_drhd, (void *)&drhd_count);
+ if (ret == 0 && drhd_count == 0) {
+ pr_warn(FW_BUG "No DRHD structures in buffer returned by _DSM method\n");
+ goto out;
+ } else if (ret) {
+ goto release_drhd;
+ }
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_RHSA,
+ &dmar_parse_one_rhsa, NULL);
+ if (ret)
+ goto release_drhd;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+ &dmar_parse_one_atsr, NULL);
+ if (ret)
+ goto release_atsr;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_add_drhd, NULL);
+ if (!ret)
+ return 0;
+
+ dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_remove_drhd, NULL);
+release_atsr:
+ dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+ &dmar_release_one_atsr, NULL);
+release_drhd:
+ dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_release_drhd, NULL);
+out:
+ return ret;
+}
+
+static int dmar_hotplug_remove(acpi_handle handle)
+{
+ int ret;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+ &dmar_check_one_atsr, NULL);
+ if (ret)
+ return ret;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_remove_drhd, NULL);
+ if (ret == 0) {
+ WARN_ON(dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+ &dmar_release_one_atsr, NULL));
+ WARN_ON(dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_release_drhd, NULL));
+ } else {
+ dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_add_drhd, NULL);
+ }
+
+ return ret;
+}
+
+static acpi_status dmar_get_dsm_handle(acpi_handle handle, u32 lvl,
+ void *context, void **retval)
+{
+ acpi_handle *phdl = retval;
+
+ if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) {
+ *phdl = handle;
+ return AE_CTRL_TERMINATE;
+ }
+
+ return AE_OK;
+}
+
+static int dmar_device_hotplug(acpi_handle handle, bool insert)
+{
+ int ret;
+ acpi_handle tmp = NULL;
+ acpi_status status;
+
+ if (!dmar_in_use())
+ return 0;
+
+ if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) {
+ tmp = handle;
+ } else {
+ status = acpi_walk_namespace(ACPI_TYPE_DEVICE, handle,
+ ACPI_UINT32_MAX,
+ dmar_get_dsm_handle,
+ NULL, NULL, &tmp);
+ if (ACPI_FAILURE(status)) {
+ pr_warn("Failed to locate _DSM method.\n");
+ return -ENXIO;
+ }
+ }
+ if (tmp == NULL)
+ return 0;
+
+ down_write(&dmar_global_lock);
+ if (insert)
+ ret = dmar_hotplug_insert(tmp);
+ else
+ ret = dmar_hotplug_remove(tmp);
+ up_write(&dmar_global_lock);
+
+ return ret;
+}
+
+int dmar_device_add(acpi_handle handle)
+{
+ return dmar_device_hotplug(handle, true);
+}
+
+int dmar_device_remove(acpi_handle handle)
+{
+ return dmar_device_hotplug(handle, false);
+}
+
+/*
+ * dmar_platform_optin - Is %DMA_CTRL_PLATFORM_OPT_IN_FLAG set in DMAR table
+ *
+ * Returns true if the platform has %DMA_CTRL_PLATFORM_OPT_IN_FLAG set in
+ * the ACPI DMAR table. This means that the platform boot firmware has made
+ * sure no device can issue DMA outside of RMRR regions.
+ */
+bool dmar_platform_optin(void)
+{
+ struct acpi_table_dmar *dmar;
+ acpi_status status;
+ bool ret;
+
+ status = acpi_get_table(ACPI_SIG_DMAR, 0,
+ (struct acpi_table_header **)&dmar);
+ if (ACPI_FAILURE(status))
+ return false;
+
+ ret = !!(dmar->flags & DMAR_PLATFORM_OPT_IN);
+ acpi_put_table((struct acpi_table_header *)dmar);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dmar_platform_optin);
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * intel-pasid.h - PASID idr, table and entry header
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#ifndef __INTEL_PASID_H
+#define __INTEL_PASID_H
+
+#define PASID_RID2PASID 0x0
+#define PASID_MIN 0x1
+#define PASID_MAX 0x100000
+#define PASID_PTE_MASK 0x3F
+#define PASID_PTE_PRESENT 1
+#define PASID_PTE_FPD 2
+#define PDE_PFN_MASK PAGE_MASK
+#define PASID_PDE_SHIFT 6
+#define MAX_NR_PASID_BITS 20
+#define PASID_TBL_ENTRIES BIT(PASID_PDE_SHIFT)
+
+#define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1)
+#define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7))
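+
+/*
+ * A PASID is resolved in two steps: pasid >> PASID_PDE_SHIFT indexes
+ * the PASID directory, and the low PASID_PDE_SHIFT bits (PASID_PTE_MASK)
+ * index the PASID table (PASID_TBL_ENTRIES entries) that the directory
+ * entry points to.
+ */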
+
+/* Virtual command interface for enlightened pasid management. */
+#define VCMD_CMD_ALLOC 0x1
+#define VCMD_CMD_FREE 0x2
+#define VCMD_VRSP_IP 0x1
+#define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3)
+#define VCMD_VRSP_SC_SUCCESS 0
+#define VCMD_VRSP_SC_NO_PASID_AVAIL 1
+#define VCMD_VRSP_SC_INVALID_PASID 1
+#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff)
+#define VCMD_CMD_OPERAND(e) ((e) << 8)
+/*
+ * Domain ID reserved for pasid entries programmed for first-level
+ * only and pass-through transfer modes.
+ */
+#define FLPT_DEFAULT_DID 1
+
+/*
+ * The SUPERVISOR_MODE flag indicates a first level translation which
+ * can be used for access to kernel addresses. It is valid only for
+ * access to the kernel's static 1:1 mapping of physical memory — not
+ * to vmalloc or even module mappings.
+ */
+#define PASID_FLAG_SUPERVISOR_MODE BIT(0)
+#define PASID_FLAG_NESTED BIT(1)
+
+/*
+ * The PASID_FLAG_FL5LP flag indicates that 5-level paging is used for
+ * first-level translation; otherwise, 4-level paging is used.
+ */
+#define PASID_FLAG_FL5LP BIT(1)
+
+struct pasid_dir_entry {
+ u64 val;
+};
+
+struct pasid_entry {
+ u64 val[8];
+};
+
+#define PASID_ENTRY_PGTT_FL_ONLY (1)
+#define PASID_ENTRY_PGTT_SL_ONLY (2)
+#define PASID_ENTRY_PGTT_NESTED (3)
+#define PASID_ENTRY_PGTT_PT (4)
+
+/* The representative of a PASID table */
+struct pasid_table {
+ void *table; /* pasid table pointer */
+ int order; /* page order of pasid table */
+ int max_pasid; /* max pasid */
+ struct list_head dev; /* device list */
+};
+
+/* Get PRESENT bit of a PASID directory entry. */
+static inline bool pasid_pde_is_present(struct pasid_dir_entry *pde)
+{
+ return READ_ONCE(pde->val) & PASID_PTE_PRESENT;
+}
+
+/* Get PASID table from a PASID directory entry. */
+static inline struct pasid_entry *
+get_pasid_table_from_pde(struct pasid_dir_entry *pde)
+{
+ if (!pasid_pde_is_present(pde))
+ return NULL;
+
+ return phys_to_virt(READ_ONCE(pde->val) & PDE_PFN_MASK);
+}
+
+/* Get PRESENT bit of a PASID table entry. */
+static inline bool pasid_pte_is_present(struct pasid_entry *pte)
+{
+ return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT;
+}
+
+extern u32 intel_pasid_max_id;
+int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp);
+void intel_pasid_free_id(int pasid);
+void *intel_pasid_lookup_id(int pasid);
+int intel_pasid_alloc_table(struct device *dev);
+void intel_pasid_free_table(struct device *dev);
+struct pasid_table *intel_pasid_get_table(struct device *dev);
+int intel_pasid_get_dev_max_id(struct device *dev);
+struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid);
+int intel_pasid_setup_first_level(struct intel_iommu *iommu,
+ struct device *dev, pgd_t *pgd,
+ int pasid, u16 did, int flags);
+int intel_pasid_setup_second_level(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, int pasid);
+int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, int pasid);
+int intel_pasid_setup_nested(struct intel_iommu *iommu,
+ struct device *dev, pgd_t *pgd, int pasid,
+ struct iommu_gpasid_bind_data_vtd *pasid_data,
+ struct dmar_domain *domain, int addr_width);
+void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
+ struct device *dev, int pasid,
+ bool fault_ignore);
+int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid);
+void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid);
+#endif /* __INTEL_PASID_H */
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2006-2014 Intel Corporation.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>,
+ * Ashok Raj <ashok.raj@intel.com>,
+ * Shaohua Li <shaohua.li@intel.com>,
+ * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "DMAR: " fmt
+#define dev_fmt(fmt) pr_fmt(fmt)
+
+#include <linux/init.h>
+#include <linux/bitmap.h>
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/dma-mapping.h>
+#include <linux/mempool.h>
+#include <linux/memory.h>
+#include <linux/cpu.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/iova.h>
+#include <linux/iommu.h>
+#include <linux/intel-iommu.h>
+#include <linux/syscore_ops.h>
+#include <linux/tboot.h>
+#include <linux/dmi.h>
+#include <linux/pci-ats.h>
+#include <linux/memblock.h>
+#include <linux/dma-contiguous.h>
+#include <linux/dma-direct.h>
+#include <linux/crash_dump.h>
+#include <linux/numa.h>
+#include <linux/swiotlb.h>
+#include <asm/irq_remapping.h>
+#include <asm/cacheflush.h>
+#include <asm/iommu.h>
+#include <trace/events/intel_iommu.h>
+
+#include "../irq_remapping.h"
+#include "intel-pasid.h"
+
+#define ROOT_SIZE VTD_PAGE_SIZE
+#define CONTEXT_SIZE VTD_PAGE_SIZE
+
+#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
+#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
+#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
+
+#define IOAPIC_RANGE_START (0xfee00000)
+#define IOAPIC_RANGE_END (0xfeefffff)
+#define IOVA_START_ADDR (0x1000)
+
+#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
+
+#define MAX_AGAW_WIDTH 64
+#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
+
+#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
+#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
+
+/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
+ to match. That way, we can use 'unsigned long' for PFNs with impunity. */
+#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
+ __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
+#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
+
+/* IO virtual address start page frame number */
+#define IOVA_START_PFN (1)
+
+#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
+
+/* page table handling */
+#define LEVEL_STRIDE (9)
+#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
+
+/*
+ * This bitmap is used to advertise the page sizes our hardware supports
+ * to the IOMMU core, which will then use this information to split
+ * physically contiguous memory regions it is mapping into page sizes
+ * that we support.
+ *
+ * Traditionally the IOMMU core just handed us the mappings directly,
+ * after making sure the size is an order of a 4KiB page and that the
+ * mapping has natural alignment.
+ *
+ * To retain this behavior, we currently advertise that we support
+ * all page sizes that are an order of 4KiB.
+ *
+ * If at some point we'd like to utilize the IOMMU core's new behavior,
+ * we could change this to advertise the real page sizes we support.
+ */
+#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
+
+static inline int agaw_to_level(int agaw)
+{
+ return agaw + 2;
+}
+
+static inline int agaw_to_width(int agaw)
+{
+ return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
+}
+
+static inline int width_to_agaw(int width)
+{
+ return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
+}
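+
+/*
+ * For example, a 48-bit address width gives agaw 2, which
+ * agaw_to_level() turns into a 4-level page table; the 57-bit
+ * DEFAULT_DOMAIN_ADDRESS_WIDTH gives agaw 3 and five levels.
+ */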
+
+static inline unsigned int level_to_offset_bits(int level)
+{
+ return (level - 1) * LEVEL_STRIDE;
+}
+
+static inline int pfn_level_offset(unsigned long pfn, int level)
+{
+ return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
+}
+
+static inline unsigned long level_mask(int level)
+{
+ return -1UL << level_to_offset_bits(level);
+}
+
+static inline unsigned long level_size(int level)
+{
+ return 1UL << level_to_offset_bits(level);
+}
+
+static inline unsigned long align_to_level(unsigned long pfn, int level)
+{
+ return (pfn + level_size(level) - 1) & level_mask(level);
+}
+
+static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
+{
+ return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
+}
+
+/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
+ are never going to work. */
+static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
+{
+ return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
+}
+
+static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
+{
+ return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
+}
+static inline unsigned long page_to_dma_pfn(struct page *pg)
+{
+ return mm_to_dma_pfn(page_to_pfn(pg));
+}
+static inline unsigned long virt_to_dma_pfn(void *p)
+{
+ return page_to_dma_pfn(virt_to_page(p));
+}
+
+/* global iommu list, set NULL for ignored DMAR units */
+static struct intel_iommu **g_iommus;
+
+static void __init check_tylersburg_isoch(void);
+static int rwbf_quirk;
+
+/*
+ * set to 1 to panic kernel if can't successfully enable VT-d
+ * (used when kernel is launched w/ TXT)
+ */
+static int force_on = 0;
+int intel_iommu_tboot_noforce;
+static int no_platform_optin;
+
+#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
+
+/*
+ * Take a root_entry and return the Lower Context Table Pointer (LCTP)
+ * if marked present.
+ */
+static phys_addr_t root_entry_lctp(struct root_entry *re)
+{
+ if (!(re->lo & 1))
+ return 0;
+
+ return re->lo & VTD_PAGE_MASK;
+}
+
+/*
+ * Take a root_entry and return the Upper Context Table Pointer (UCTP)
+ * if marked present.
+ */
+static phys_addr_t root_entry_uctp(struct root_entry *re)
+{
+ if (!(re->hi & 1))
+ return 0;
+
+ return re->hi & VTD_PAGE_MASK;
+}
+
+static inline void context_clear_pasid_enable(struct context_entry *context)
+{
+ context->lo &= ~(1ULL << 11);
+}
+
+static inline bool context_pasid_enabled(struct context_entry *context)
+{
+ return !!(context->lo & (1ULL << 11));
+}
+
+static inline void context_set_copied(struct context_entry *context)
+{
+ context->hi |= (1ull << 3);
+}
+
+static inline bool context_copied(struct context_entry *context)
+{
+ return !!(context->hi & (1ULL << 3));
+}
+
+static inline bool __context_present(struct context_entry *context)
+{
+ return (context->lo & 1);
+}
+
+bool context_present(struct context_entry *context)
+{
+ return context_pasid_enabled(context) ?
+ __context_present(context) :
+ __context_present(context) && !context_copied(context);
+}
+
+static inline void context_set_present(struct context_entry *context)
+{
+ context->lo |= 1;
+}
+
+static inline void context_set_fault_enable(struct context_entry *context)
+{
+ context->lo &= (((u64)-1) << 2) | 1;
+}
+
+static inline void context_set_translation_type(struct context_entry *context,
+ unsigned long value)
+{
+ context->lo &= (((u64)-1) << 4) | 3;
+ context->lo |= (value & 3) << 2;
+}
+
+static inline void context_set_address_root(struct context_entry *context,
+ unsigned long value)
+{
+ context->lo &= ~VTD_PAGE_MASK;
+ context->lo |= value & VTD_PAGE_MASK;
+}
+
+static inline void context_set_address_width(struct context_entry *context,
+ unsigned long value)
+{
+ context->hi |= value & 7;
+}
+
+static inline void context_set_domain_id(struct context_entry *context,
+ unsigned long value)
+{
+ context->hi |= (value & ((1 << 16) - 1)) << 8;
+}
+
+static inline int context_domain_id(struct context_entry *c)
+{
+ return((c->hi >> 8) & 0xffff);
+}
+
+static inline void context_clear_entry(struct context_entry *context)
+{
+ context->lo = 0;
+ context->hi = 0;
+}
+
+/*
+ * This domain is a static identity mapping domain.
+ * 1. This domain creates a static 1:1 mapping of all usable memory.
+ * 2. It maps to each iommu if successful.
+ * 3. Each iommu maps to this domain if successful.
+ */
+static struct dmar_domain *si_domain;
+static int hw_pass_through = 1;
+
+#define for_each_domain_iommu(idx, domain) \
+ for (idx = 0; idx < g_num_of_iommus; idx++) \
+ if (domain->iommu_refcnt[idx])
+
+struct dmar_rmrr_unit {
+ struct list_head list; /* list of rmrr units */
+ struct acpi_dmar_header *hdr; /* ACPI header */
+ u64 base_address; /* reserved base address*/
+ u64 end_address; /* reserved end address */
+ struct dmar_dev_scope *devices; /* target devices */
+ int devices_cnt; /* target device count */
+};
+
+struct dmar_atsr_unit {
+ struct list_head list; /* list of ATSR units */
+ struct acpi_dmar_header *hdr; /* ACPI header */
+ struct dmar_dev_scope *devices; /* target devices */
+ int devices_cnt; /* target device count */
+ u8 include_all:1; /* include all ports */
+};
+
+static LIST_HEAD(dmar_atsr_units);
+static LIST_HEAD(dmar_rmrr_units);
+
+#define for_each_rmrr_units(rmrr) \
+ list_for_each_entry(rmrr, &dmar_rmrr_units, list)
+
+/* bitmap for indexing intel_iommus */
+static int g_num_of_iommus;
+
+static void domain_exit(struct dmar_domain *domain);
+static void domain_remove_dev_info(struct dmar_domain *domain);
+static void dmar_remove_one_dev_info(struct device *dev);
+static void __dmar_remove_one_dev_info(struct device_domain_info *info);
+static int intel_iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev);
+static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova);
+
+#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
+int dmar_disabled = 0;
+#else
+int dmar_disabled = 1;
+#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
+
+#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
+int intel_iommu_sm = 1;
+#else
+int intel_iommu_sm;
+#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
+
+int intel_iommu_enabled = 0;
+EXPORT_SYMBOL_GPL(intel_iommu_enabled);
+
+static int dmar_map_gfx = 1;
+static int dmar_forcedac;
+static int intel_iommu_strict;
+static int intel_iommu_superpage = 1;
+static int iommu_identity_mapping;
+static int intel_no_bounce;
+
+#define IDENTMAP_GFX 2
+#define IDENTMAP_AZALIA 4
+
+int intel_iommu_gfx_mapped;
+EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
+
+#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
+#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
+struct device_domain_info *get_domain_info(struct device *dev)
+{
+ struct device_domain_info *info;
+
+ if (!dev)
+ return NULL;
+
+ info = dev->archdata.iommu;
+ if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
+ info == DEFER_DEVICE_DOMAIN_INFO))
+ return NULL;
+
+ return info;
+}
+
+DEFINE_SPINLOCK(device_domain_lock);
+static LIST_HEAD(device_domain_list);
+
+#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
+ to_pci_dev(d)->untrusted)
+
+/*
+ * Iterate over elements in device_domain_list and call the specified
+ * callback @fn against each element.
+ */
+int for_each_device_domain(int (*fn)(struct device_domain_info *info,
+ void *data), void *data)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct device_domain_info *info;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ list_for_each_entry(info, &device_domain_list, global) {
+ ret = fn(info, data);
+ if (ret) {
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ return ret;
+ }
+ }
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return 0;
+}
+
+const struct iommu_ops intel_iommu_ops;
+
+static bool translation_pre_enabled(struct intel_iommu *iommu)
+{
+ return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
+}
+
+static void clear_translation_pre_enabled(struct intel_iommu *iommu)
+{
+ iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
+}
+
+static void init_translation_status(struct intel_iommu *iommu)
+{
+ u32 gsts;
+
+ gsts = readl(iommu->reg + DMAR_GSTS_REG);
+ if (gsts & DMA_GSTS_TES)
+ iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
+}
+
+static int __init intel_iommu_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ while (*str) {
+ if (!strncmp(str, "on", 2)) {
+ dmar_disabled = 0;
+ pr_info("IOMMU enabled\n");
+ } else if (!strncmp(str, "off", 3)) {
+ dmar_disabled = 1;
+ no_platform_optin = 1;
+ pr_info("IOMMU disabled\n");
+ } else if (!strncmp(str, "igfx_off", 8)) {
+ dmar_map_gfx = 0;
+ pr_info("Disable GFX device mapping\n");
+ } else if (!strncmp(str, "forcedac", 8)) {
+ pr_info("Forcing DAC for PCI devices\n");
+ dmar_forcedac = 1;
+ } else if (!strncmp(str, "strict", 6)) {
+ pr_info("Disable batched IOTLB flush\n");
+ intel_iommu_strict = 1;
+ } else if (!strncmp(str, "sp_off", 6)) {
+ pr_info("Disable supported super page\n");
+ intel_iommu_superpage = 0;
+ } else if (!strncmp(str, "sm_on", 5)) {
+ pr_info("Intel-IOMMU: scalable mode supported\n");
+ intel_iommu_sm = 1;
+ } else if (!strncmp(str, "tboot_noforce", 13)) {
+ pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
+ intel_iommu_tboot_noforce = 1;
+ } else if (!strncmp(str, "nobounce", 8)) {
+ pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
+ intel_no_bounce = 1;
+ }
+
+ str += strcspn(str, ",");
+ while (*str == ',')
+ str++;
+ }
+ return 0;
+}
+__setup("intel_iommu=", intel_iommu_setup);
+
+static struct kmem_cache *iommu_domain_cache;
+static struct kmem_cache *iommu_devinfo_cache;
+
+static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
+{
+ struct dmar_domain **domains;
+ int idx = did >> 8;
+
+ domains = iommu->domains[idx];
+ if (!domains)
+ return NULL;
+
+ return domains[did & 0xff];
+}
+
+static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
+ struct dmar_domain *domain)
+{
+ struct dmar_domain **domains;
+ int idx = did >> 8;
+
+ if (!iommu->domains[idx]) {
+ size_t size = 256 * sizeof(struct dmar_domain *);
+ iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
+ }
+
+ domains = iommu->domains[idx];
+ if (WARN_ON(!domains))
+ return;
+ else
+ domains[did & 0xff] = domain;
+}
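+
+/*
+ * The domain-ID lookup above is a lazily allocated two-level table:
+ * did >> 8 selects one of 256 second-level arrays and did & 0xff
+ * indexes within it.
+ */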
+
+void *alloc_pgtable_page(int node)
+{
+ struct page *page;
+ void *vaddr = NULL;
+
+ page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
+ if (page)
+ vaddr = page_address(page);
+ return vaddr;
+}
+
+void free_pgtable_page(void *vaddr)
+{
+ free_page((unsigned long)vaddr);
+}
+
+static inline void *alloc_domain_mem(void)
+{
+ return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
+}
+
+static void free_domain_mem(void *vaddr)
+{
+ kmem_cache_free(iommu_domain_cache, vaddr);
+}
+
+static inline void * alloc_devinfo_mem(void)
+{
+ return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
+}
+
+static inline void free_devinfo_mem(void *vaddr)
+{
+ kmem_cache_free(iommu_devinfo_cache, vaddr);
+}
+
+static inline int domain_type_is_si(struct dmar_domain *domain)
+{
+ return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
+}
+
+static inline bool domain_use_first_level(struct dmar_domain *domain)
+{
+ return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
+}
+
+static inline int domain_pfn_supported(struct dmar_domain *domain,
+ unsigned long pfn)
+{
+ int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+
+ return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
+}
+
+static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
+{
+ unsigned long sagaw;
+ int agaw = -1;
+
+ sagaw = cap_sagaw(iommu->cap);
+ for (agaw = width_to_agaw(max_gaw);
+ agaw >= 0; agaw--) {
+ if (test_bit(agaw, &sagaw))
+ break;
+ }
+
+ return agaw;
+}
+
+/*
+ * Calculate max SAGAW for each iommu.
+ */
+int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
+{
+ return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
+}
+
+/*
+ * Calculate the agaw for each iommu.
+ * "SAGAW" may differ across iommus, so use a default agaw and fall
+ * back to a smaller supported agaw for iommus that don't support the
+ * default.
+ */
+int iommu_calculate_agaw(struct intel_iommu *iommu)
+{
+ return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+}
+
+/* This function only returns a single iommu in a domain */
+struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
+{
+ int iommu_id;
+
+ /* si_domain and vm domain should not get here. */
+ if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
+ return NULL;
+
+ for_each_domain_iommu(iommu_id, domain)
+ break;
+
+ if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
+ return NULL;
+
+ return g_iommus[iommu_id];
+}
+
+static void domain_update_iommu_coherency(struct dmar_domain *domain)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool found = false;
+ int i;
+
+ domain->iommu_coherency = 1;
+
+ for_each_domain_iommu(i, domain) {
+ found = true;
+ if (!ecap_coherent(g_iommus[i]->ecap)) {
+ domain->iommu_coherency = 0;
+ break;
+ }
+ }
+ if (found)
+ return;
+
+ /* No hardware attached; use lowest common denominator */
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!ecap_coherent(iommu->ecap)) {
+ domain->iommu_coherency = 0;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int domain_update_iommu_snooping(struct intel_iommu *skip)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ int ret = 1;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (iommu != skip) {
+ if (!ecap_sc_support(iommu->ecap)) {
+ ret = 0;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int domain_update_iommu_superpage(struct dmar_domain *domain,
+ struct intel_iommu *skip)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ int mask = 0x3;
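+ /*
+ * In the SLLPS-style mask above, bit 0 stands for 2MiB and bit 1 for
+ * 1GiB pages; fls() below turns the surviving bits into the
+ * iommu_superpage level count.
+ */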
+
+ if (!intel_iommu_superpage) {
+ return 0;
+ }
+
+ /* set iommu_superpage to the smallest common denominator */
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (iommu != skip) {
+ if (domain && domain_use_first_level(domain)) {
+ if (!cap_fl1gp_support(iommu->cap))
+ mask = 0x1;
+ } else {
+ mask &= cap_super_page_val(iommu->cap);
+ }
+
+ if (!mask)
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return fls(mask);
+}
+
+/* Some capabilities may be different across iommus */
+static void domain_update_iommu_cap(struct dmar_domain *domain)
+{
+ domain_update_iommu_coherency(domain);
+ domain->iommu_snooping = domain_update_iommu_snooping(NULL);
+ domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
+}
+
+struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
+ u8 devfn, int alloc)
+{
+ struct root_entry *root = &iommu->root_entry[bus];
+ struct context_entry *context;
+ u64 *entry;
+
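+ /*
+ * In scalable mode the lower and upper halves of the root entry each
+ * cover 128 devfns, and a scalable-mode context entry is 256 bits
+ * (two legacy-sized slots), hence the devfn adjustment below.
+ */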
+ entry = &root->lo;
+ if (sm_supported(iommu)) {
+ if (devfn >= 0x80) {
+ devfn -= 0x80;
+ entry = &root->hi;
+ }
+ devfn *= 2;
+ }
+ if (*entry & 1)
+ context = phys_to_virt(*entry & VTD_PAGE_MASK);
+ else {
+ unsigned long phy_addr;
+ if (!alloc)
+ return NULL;
+
+ context = alloc_pgtable_page(iommu->node);
+ if (!context)
+ return NULL;
+
+ __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
+ phy_addr = virt_to_phys((void *)context);
+ *entry = phy_addr | 1;
+ __iommu_flush_cache(iommu, entry, sizeof(*entry));
+ }
+ return &context[devfn];
+}
+
+static int iommu_dummy(struct device *dev)
+{
+ return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
+}
+
+static bool attach_deferred(struct device *dev)
+{
+ return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
+}
+
+/**
+ * is_downstream_to_pci_bridge - test if a device belongs to the PCI
+ * sub-hierarchy of a candidate PCI-PCI bridge
+ * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
+ * @bridge: the candidate PCI-PCI bridge
+ *
+ * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
+ */
+static bool
+is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
+{
+ struct pci_dev *pdev, *pbridge;
+
+ if (!dev_is_pci(dev) || !dev_is_pci(bridge))
+ return false;
+
+ pdev = to_pci_dev(dev);
+ pbridge = to_pci_dev(bridge);
+
+ if (pbridge->subordinate &&
+ pbridge->subordinate->number <= pdev->bus->number &&
+ pbridge->subordinate->busn_res.end >= pdev->bus->number)
+ return true;
+
+ return false;
+}
+
+static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
+{
+ struct dmar_drhd_unit *drhd = NULL;
+ struct intel_iommu *iommu;
+ struct device *tmp;
+ struct pci_dev *pdev = NULL;
+ u16 segment = 0;
+ int i;
+
+ if (iommu_dummy(dev))
+ return NULL;
+
+ if (dev_is_pci(dev)) {
+ struct pci_dev *pf_pdev;
+
+ pdev = pci_real_dma_dev(to_pci_dev(dev));
+
+ /* VFs aren't listed in scope tables; we need to look up
+ * the PF instead to find the IOMMU. */
+ pf_pdev = pci_physfn(pdev);
+ dev = &pf_pdev->dev;
+ segment = pci_domain_nr(pdev->bus);
+ } else if (has_acpi_companion(dev))
+ dev = &ACPI_COMPANION(dev)->dev;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (pdev && segment != drhd->segment)
+ continue;
+
+ for_each_active_dev_scope(drhd->devices,
+ drhd->devices_cnt, i, tmp) {
+ if (tmp == dev) {
+ /* For a VF use its original BDF# not that of the PF
+ * which we used for the IOMMU lookup. Strictly speaking
+ * we could do this for all PCI devices; we only need to
+ * get the BDF# from the scope table for ACPI matches. */
+ if (pdev && pdev->is_virtfn)
+ goto got_pdev;
+
+ *bus = drhd->devices[i].bus;
+ *devfn = drhd->devices[i].devfn;
+ goto out;
+ }
+
+ if (is_downstream_to_pci_bridge(dev, tmp))
+ goto got_pdev;
+ }
+
+ if (pdev && drhd->include_all) {
+ got_pdev:
+ *bus = pdev->bus->number;
+ *devfn = pdev->devfn;
+ goto out;
+ }
+ }
+ iommu = NULL;
+ out:
+ rcu_read_unlock();
+
+ return iommu;
+}
+
+static void domain_flush_cache(struct dmar_domain *domain,
+ void *addr, int size)
+{
+ if (!domain->iommu_coherency)
+ clflush_cache_range(addr, size);
+}
+
+static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+ struct context_entry *context;
+ int ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ context = iommu_context_addr(iommu, bus, devfn, 0);
+ if (context)
+ ret = context_present(context);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return ret;
+}
+
+static void free_context_table(struct intel_iommu *iommu)
+{
+ int i;
+ unsigned long flags;
+ struct context_entry *context;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ if (!iommu->root_entry) {
+ goto out;
+ }
+ for (i = 0; i < ROOT_ENTRY_NR; i++) {
+ context = iommu_context_addr(iommu, i, 0, 0);
+ if (context)
+ free_pgtable_page(context);
+
+ if (!sm_supported(iommu))
+ continue;
+
+ context = iommu_context_addr(iommu, i, 0x80, 0);
+ if (context)
+ free_pgtable_page(context);
+
+ }
+ free_pgtable_page(iommu->root_entry);
+ iommu->root_entry = NULL;
+out:
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
+ unsigned long pfn, int *target_level)
+{
+ struct dma_pte *parent, *pte;
+ int level = agaw_to_level(domain->agaw);
+ int offset;
+
+ BUG_ON(!domain->pgd);
+
+ if (!domain_pfn_supported(domain, pfn))
+ /* Address beyond IOMMU's addressing capabilities. */
+ return NULL;
+
+ parent = domain->pgd;
+
+ while (1) {
+ void *tmp_page;
+
+ offset = pfn_level_offset(pfn, level);
+ pte = &parent[offset];
+ if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
+ break;
+ if (level == *target_level)
+ break;
+
+ if (!dma_pte_present(pte)) {
+ uint64_t pteval;
+
+ tmp_page = alloc_pgtable_page(domain->nid);
+
+ if (!tmp_page)
+ return NULL;
+
+ domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
+ pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
+ if (domain_use_first_level(domain))
+ pteval |= DMA_FL_PTE_XD;
+ if (cmpxchg64(&pte->val, 0ULL, pteval))
+ /* Someone else set it while we were thinking; use theirs. */
+ free_pgtable_page(tmp_page);
+ else
+ domain_flush_cache(domain, pte, sizeof(*pte));
+ }
+ if (level == 1)
+ break;
+
+ parent = phys_to_virt(dma_pte_addr(pte));
+ level--;
+ }
+
+ if (!*target_level)
+ *target_level = level;
+
+ return pte;
+}
+
+/* return address's pte at specific level */
+static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
+ unsigned long pfn,
+ int level, int *large_page)
+{
+ struct dma_pte *parent, *pte;
+ int total = agaw_to_level(domain->agaw);
+ int offset;
+
+ parent = domain->pgd;
+ while (level <= total) {
+ offset = pfn_level_offset(pfn, total);
+ pte = &parent[offset];
+ if (level == total)
+ return pte;
+
+ if (!dma_pte_present(pte)) {
+ *large_page = total;
+ break;
+ }
+
+ if (dma_pte_superpage(pte)) {
+ *large_page = total;
+ return pte;
+ }
+
+ parent = phys_to_virt(dma_pte_addr(pte));
+ total--;
+ }
+ return NULL;
+}
+
+/* clear last level pte, a tlb flush should be followed */
+static void dma_pte_clear_range(struct dmar_domain *domain,
+ unsigned long start_pfn,
+ unsigned long last_pfn)
+{
+ unsigned int large_page;
+ struct dma_pte *first_pte, *pte;
+
+ BUG_ON(!domain_pfn_supported(domain, start_pfn));
+ BUG_ON(!domain_pfn_supported(domain, last_pfn));
+ BUG_ON(start_pfn > last_pfn);
+
+ /* we don't need lock here; nobody else touches the iova range */
+ do {
+ large_page = 1;
+ first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
+ if (!pte) {
+ start_pfn = align_to_level(start_pfn + 1, large_page + 1);
+ continue;
+ }
+ do {
+ dma_clear_pte(pte);
+ start_pfn += lvl_to_nr_pages(large_page);
+ pte++;
+ } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
+
+ domain_flush_cache(domain, first_pte,
+ (void *)pte - (void *)first_pte);
+
+ } while (start_pfn && start_pfn <= last_pfn);
+}
+
+static void dma_pte_free_level(struct dmar_domain *domain, int level,
+ int retain_level, struct dma_pte *pte,
+ unsigned long pfn, unsigned long start_pfn,
+ unsigned long last_pfn)
+{
+ pfn = max(start_pfn, pfn);
+ pte = &pte[pfn_level_offset(pfn, level)];
+
+ do {
+ unsigned long level_pfn;
+ struct dma_pte *level_pte;
+
+ if (!dma_pte_present(pte) || dma_pte_superpage(pte))
+ goto next;
+
+ level_pfn = pfn & level_mask(level);
+ level_pte = phys_to_virt(dma_pte_addr(pte));
+
+ if (level > 2) {
+ dma_pte_free_level(domain, level - 1, retain_level,
+ level_pte, level_pfn, start_pfn,
+ last_pfn);
+ }
+
+ /*
+ * Free the page table if we're below the level we want to
+ * retain and the range covers the entire table.
+ */
+ if (level < retain_level && !(start_pfn > level_pfn ||
+ last_pfn < level_pfn + level_size(level) - 1)) {
+ dma_clear_pte(pte);
+ domain_flush_cache(domain, pte, sizeof(*pte));
+ free_pgtable_page(level_pte);
+ }
+next:
+ pfn += level_size(level);
+ } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+}
+
+/*
+ * clear last level (leaf) ptes and free page table pages below the
+ * level we wish to keep intact.
+ */
+static void dma_pte_free_pagetable(struct dmar_domain *domain,
+ unsigned long start_pfn,
+ unsigned long last_pfn,
+ int retain_level)
+{
+ BUG_ON(!domain_pfn_supported(domain, start_pfn));
+ BUG_ON(!domain_pfn_supported(domain, last_pfn));
+ BUG_ON(start_pfn > last_pfn);
+
+ dma_pte_clear_range(domain, start_pfn, last_pfn);
+
+ /* We don't need lock here; nobody else touches the iova range */
+ dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
+ domain->pgd, 0, start_pfn, last_pfn);
+
+ /* free pgd */
+ if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
+ free_pgtable_page(domain->pgd);
+ domain->pgd = NULL;
+ }
+}
+
+/* When a page at a given level is being unlinked from its parent, we don't
+ need to *modify* it at all. All we need to do is make a list of all the
+ pages which can be freed just as soon as we've flushed the IOTLB and we
+ know the hardware page-walk will no longer touch them.
+ The 'pte' argument is the *parent* PTE, pointing to the page that is to
+ be freed. */
+static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
+ int level, struct dma_pte *pte,
+ struct page *freelist)
+{
+ struct page *pg;
+
+ pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
+ pg->freelist = freelist;
+ freelist = pg;
+
+ if (level == 1)
+ return freelist;
+
+ pte = page_address(pg);
+ do {
+ if (dma_pte_present(pte) && !dma_pte_superpage(pte))
+ freelist = dma_pte_list_pagetables(domain, level - 1,
+ pte, freelist);
+ pte++;
+ } while (!first_pte_in_page(pte));
+
+ return freelist;
+}
+
+static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
+ struct dma_pte *pte, unsigned long pfn,
+ unsigned long start_pfn,
+ unsigned long last_pfn,
+ struct page *freelist)
+{
+ struct dma_pte *first_pte = NULL, *last_pte = NULL;
+
+ pfn = max(start_pfn, pfn);
+ pte = &pte[pfn_level_offset(pfn, level)];
+
+ do {
+ unsigned long level_pfn;
+
+ if (!dma_pte_present(pte))
+ goto next;
+
+ level_pfn = pfn & level_mask(level);
+
+ /* If range covers entire pagetable, free it */
+ if (start_pfn <= level_pfn &&
+ last_pfn >= level_pfn + level_size(level) - 1) {
+ /* These subordinate page tables are going away entirely. Don't
+ bother to clear them; we're just going to *free* them. */
+ if (level > 1 && !dma_pte_superpage(pte))
+ freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+
+ dma_clear_pte(pte);
+ if (!first_pte)
+ first_pte = pte;
+ last_pte = pte;
+ } else if (level > 1) {
+ /* Recurse down into a level that isn't *entirely* obsolete */
+ freelist = dma_pte_clear_level(domain, level - 1,
+ phys_to_virt(dma_pte_addr(pte)),
+ level_pfn, start_pfn, last_pfn,
+ freelist);
+ }
+next:
+ pfn += level_size(level);
+ } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+
+ if (first_pte)
+ domain_flush_cache(domain, first_pte,
+ (void *)++last_pte - (void *)first_pte);
+
+ return freelist;
+}
+
+/* We can't just free the pages because the IOMMU may still be walking
+ the page tables, and may have cached the intermediate levels. The
+ pages can only be freed after the IOTLB flush has been done. */
+static struct page *domain_unmap(struct dmar_domain *domain,
+ unsigned long start_pfn,
+ unsigned long last_pfn)
+{
+ struct page *freelist;
+
+ BUG_ON(!domain_pfn_supported(domain, start_pfn));
+ BUG_ON(!domain_pfn_supported(domain, last_pfn));
+ BUG_ON(start_pfn > last_pfn);
+
+ /* we don't need lock here; nobody else touches the iova range */
+ freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
+ domain->pgd, 0, start_pfn, last_pfn, NULL);
+
+ /* free pgd */
+ if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
+ struct page *pgd_page = virt_to_page(domain->pgd);
+ pgd_page->freelist = freelist;
+ freelist = pgd_page;
+
+ domain->pgd = NULL;
+ }
+
+ return freelist;
+}
+
+static void dma_free_pagelist(struct page *freelist)
+{
+ struct page *pg;
+
+ while ((pg = freelist)) {
+ freelist = pg->freelist;
+ free_pgtable_page(page_address(pg));
+ }
+}
+
+static void iova_entry_free(unsigned long data)
+{
+ struct page *freelist = (struct page *)data;
+
+ dma_free_pagelist(freelist);
+}
+
+/* iommu handling */
+static int iommu_alloc_root_entry(struct intel_iommu *iommu)
+{
+ struct root_entry *root;
+ unsigned long flags;
+
+ root = (struct root_entry *)alloc_pgtable_page(iommu->node);
+ if (!root) {
+ pr_err("Allocating root entry for %s failed\n",
+ iommu->name);
+ return -ENOMEM;
+ }
+
+ __iommu_flush_cache(iommu, root, ROOT_SIZE);
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ iommu->root_entry = root;
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return 0;
+}
+
+static void iommu_set_root_entry(struct intel_iommu *iommu)
+{
+ u64 addr;
+ u32 sts;
+ unsigned long flag;
+
+ addr = virt_to_phys(iommu->root_entry);
+ if (sm_supported(iommu))
+ addr |= DMA_RTADDR_SMT;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
+
+ writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware completes it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_RTPS), sts);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void iommu_flush_write_buffer(struct intel_iommu *iommu)
+{
+ u32 val;
+ unsigned long flag;
+
+ if (!rwbf_quirk && !cap_rwbf(iommu->cap))
+ return;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware completes it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (!(val & DMA_GSTS_WBFS)), val);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* return value determines if we need a write buffer flush */
+static void __iommu_flush_context(struct intel_iommu *iommu,
+ u16 did, u16 source_id, u8 function_mask,
+ u64 type)
+{
+ u64 val = 0;
+ unsigned long flag;
+
+ switch (type) {
+ case DMA_CCMD_GLOBAL_INVL:
+ val = DMA_CCMD_GLOBAL_INVL;
+ break;
+ case DMA_CCMD_DOMAIN_INVL:
+ val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
+ break;
+ case DMA_CCMD_DEVICE_INVL:
+ val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
+ | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
+ break;
+ default:
+ BUG();
+ }
+ val |= DMA_CCMD_ICC;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
+
+	/* Make sure hardware completes it */
+ IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
+ dmar_readq, (!(val & DMA_CCMD_ICC)), val);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* return value determines if we need a write buffer flush */
+static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
+ u64 addr, unsigned int size_order, u64 type)
+{
+ int tlb_offset = ecap_iotlb_offset(iommu->ecap);
+ u64 val = 0, val_iva = 0;
+ unsigned long flag;
+
+ switch (type) {
+ case DMA_TLB_GLOBAL_FLUSH:
+		/* global flush doesn't need to set IVA_REG */
+ val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
+ break;
+ case DMA_TLB_DSI_FLUSH:
+ val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+ break;
+ case DMA_TLB_PSI_FLUSH:
+ val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+ /* IH bit is passed in as part of address */
+ val_iva = size_order | addr;
+ break;
+ default:
+ BUG();
+ }
+ /* Note: set drain read/write */
+#if 0
+ /*
+	 * This is probably only for extra safety; it looks like we can
+	 * ignore it without any impact.
+ */
+ if (cap_read_drain(iommu->cap))
+ val |= DMA_TLB_READ_DRAIN;
+#endif
+ if (cap_write_drain(iommu->cap))
+ val |= DMA_TLB_WRITE_DRAIN;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ /* Note: Only uses first TLB reg currently */
+ if (val_iva)
+ dmar_writeq(iommu->reg + tlb_offset, val_iva);
+ dmar_writeq(iommu->reg + tlb_offset + 8, val);
+
+	/* Make sure hardware completes it */
+ IOMMU_WAIT_OP(iommu, tlb_offset + 8,
+ dmar_readq, (!(val & DMA_TLB_IVT)), val);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ /* check IOTLB invalidation granularity */
+ if (DMA_TLB_IAIG(val) == 0)
+ pr_err("Flush IOTLB failed\n");
+ if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
+ pr_debug("TLB flush request %Lx, actual %Lx\n",
+ (unsigned long long)DMA_TLB_IIRG(type),
+ (unsigned long long)DMA_TLB_IAIG(val));
+}
+
+static struct device_domain_info *
+iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
+ u8 bus, u8 devfn)
+{
+ struct device_domain_info *info;
+
+ assert_spin_locked(&device_domain_lock);
+
+ if (!iommu->qi)
+ return NULL;
+
+ list_for_each_entry(info, &domain->devices, link)
+ if (info->iommu == iommu && info->bus == bus &&
+ info->devfn == devfn) {
+ if (info->ats_supported && info->dev)
+ return info;
+ break;
+ }
+
+ return NULL;
+}
+
+static void domain_update_iotlb(struct dmar_domain *domain)
+{
+ struct device_domain_info *info;
+ bool has_iotlb_device = false;
+
+ assert_spin_locked(&device_domain_lock);
+
+ list_for_each_entry(info, &domain->devices, link) {
+ struct pci_dev *pdev;
+
+ if (!info->dev || !dev_is_pci(info->dev))
+ continue;
+
+ pdev = to_pci_dev(info->dev);
+ if (pdev->ats_enabled) {
+ has_iotlb_device = true;
+ break;
+ }
+ }
+
+ domain->has_iotlb_device = has_iotlb_device;
+}
+
+static void iommu_enable_dev_iotlb(struct device_domain_info *info)
+{
+ struct pci_dev *pdev;
+
+ assert_spin_locked(&device_domain_lock);
+
+ if (!info || !dev_is_pci(info->dev))
+ return;
+
+ pdev = to_pci_dev(info->dev);
+	/* For an IOMMU that supports device IOTLB throttling (DIT), we
+	 * assign the PFSID to the invalidation descriptor of a VF so that
+	 * the IOMMU HW can gauge queue depth at the PF level. If DIT is not
+	 * set, PFSID is treated as reserved and should be set to 0.
+ */
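+	/*
+	 * In the DIT case, pci_dev_id() below packs the PF's bus and devfn
+	 * into a single requester ID ((bus << 8) | devfn), which is used as
+	 * the PFSID.
+	 */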
+ if (!ecap_dit(info->iommu->ecap))
+ info->pfsid = 0;
+ else {
+ struct pci_dev *pf_pdev;
+
+		/* pdev itself is returned if the device is not a VF */
+ pf_pdev = pci_physfn(pdev);
+ info->pfsid = pci_dev_id(pf_pdev);
+ }
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ /* The PCIe spec, in its wisdom, declares that the behaviour of
+ the device if you enable PASID support after ATS support is
+ undefined. So always enable PASID support on devices which
+ have it, even if we can't yet know if we're ever going to
+ use it. */
+ if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
+ info->pasid_enabled = 1;
+
+ if (info->pri_supported &&
+ (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
+ !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
+ info->pri_enabled = 1;
+#endif
+ if (info->ats_supported && pci_ats_page_aligned(pdev) &&
+ !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
+ info->ats_enabled = 1;
+ domain_update_iotlb(info->domain);
+ info->ats_qdep = pci_ats_queue_depth(pdev);
+ }
+}
+
+static void iommu_disable_dev_iotlb(struct device_domain_info *info)
+{
+ struct pci_dev *pdev;
+
+ assert_spin_locked(&device_domain_lock);
+
+ if (!dev_is_pci(info->dev))
+ return;
+
+ pdev = to_pci_dev(info->dev);
+
+ if (info->ats_enabled) {
+ pci_disable_ats(pdev);
+ info->ats_enabled = 0;
+ domain_update_iotlb(info->domain);
+ }
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ if (info->pri_enabled) {
+ pci_disable_pri(pdev);
+ info->pri_enabled = 0;
+ }
+ if (info->pasid_enabled) {
+ pci_disable_pasid(pdev);
+ info->pasid_enabled = 0;
+ }
+#endif
+}
+
+static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
+ u64 addr, unsigned mask)
+{
+ u16 sid, qdep;
+ unsigned long flags;
+ struct device_domain_info *info;
+
+ if (!domain->has_iotlb_device)
+ return;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ list_for_each_entry(info, &domain->devices, link) {
+ if (!info->ats_enabled)
+ continue;
+
+ sid = info->bus << 8 | info->devfn;
+ qdep = info->ats_qdep;
+ qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
+ qdep, addr, mask);
+ }
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static void domain_flush_piotlb(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ u64 addr, unsigned long npages, bool ih)
+{
+ u16 did = domain->iommu_did[iommu->seq_id];
+
+ if (domain->default_pasid)
+ qi_flush_piotlb(iommu, did, domain->default_pasid,
+ addr, npages, ih);
+
+ if (!list_empty(&domain->devices))
+ qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
+}
+
+static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ unsigned long pfn, unsigned int pages,
+ int ih, int map)
+{
+ unsigned int mask = ilog2(__roundup_pow_of_two(pages));
+ uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
+ u16 did = domain->iommu_did[iommu->seq_id];
+
+ BUG_ON(pages == 0);
+
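+	/*
+	 * PSI invalidates a power-of-two number of pages, so e.g. a 3-page
+	 * request is rounded up to mask == 2, i.e. a 4-page invalidation.
+	 */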
+ if (ih)
+ ih = 1 << 6;
+
+ if (domain_use_first_level(domain)) {
+ domain_flush_piotlb(iommu, domain, addr, pages, ih);
+ } else {
+ /*
+ * Fallback to domain selective flush if no PSI support or
+ * the size is too big. PSI requires page size to be 2 ^ x,
+ * and the base address is naturally aligned to the size.
+ */
+ if (!cap_pgsel_inv(iommu->cap) ||
+ mask > cap_max_amask_val(iommu->cap))
+ iommu->flush.flush_iotlb(iommu, did, 0, 0,
+ DMA_TLB_DSI_FLUSH);
+ else
+ iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
+ DMA_TLB_PSI_FLUSH);
+ }
+
+ /*
+	 * In caching mode, changing a page from non-present to present
+	 * requires a flush. However, the device IOTLB does not need to be
+	 * flushed in this case.
+ */
+ if (!cap_caching_mode(iommu->cap) || !map)
+ iommu_flush_dev_iotlb(domain, addr, mask);
+}
+
+/* Notification for newly created mappings */
+static inline void __mapping_notify_one(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ unsigned long pfn, unsigned int pages)
+{
+ /*
+ * It's a non-present to present mapping. Only flush if caching mode
+ * and second level.
+ */
+ if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
+ iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
+ else
+ iommu_flush_write_buffer(iommu);
+}
+
+static void iommu_flush_iova(struct iova_domain *iovad)
+{
+ struct dmar_domain *domain;
+ int idx;
+
+ domain = container_of(iovad, struct dmar_domain, iovad);
+
+ for_each_domain_iommu(idx, domain) {
+ struct intel_iommu *iommu = g_iommus[idx];
+ u16 did = domain->iommu_did[iommu->seq_id];
+
+ if (domain_use_first_level(domain))
+ domain_flush_piotlb(iommu, domain, 0, -1, 0);
+ else
+ iommu->flush.flush_iotlb(iommu, did, 0, 0,
+ DMA_TLB_DSI_FLUSH);
+
+ if (!cap_caching_mode(iommu->cap))
+ iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
+ 0, MAX_AGAW_PFN_WIDTH);
+ }
+}
+
+static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
+{
+ u32 pmen;
+ unsigned long flags;
+
+ if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
+ return;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+ pmen = readl(iommu->reg + DMAR_PMEN_REG);
+ pmen &= ~DMA_PMEN_EPM;
+ writel(pmen, iommu->reg + DMAR_PMEN_REG);
+
+ /* wait for the protected region status bit to clear */
+ IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
+ readl, !(pmen & DMA_PMEN_PRS), pmen);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void iommu_enable_translation(struct intel_iommu *iommu)
+{
+ u32 sts;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+ iommu->gcmd |= DMA_GCMD_TE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware completes it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_TES), sts);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void iommu_disable_translation(struct intel_iommu *iommu)
+{
+ u32 sts;
+ unsigned long flag;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ iommu->gcmd &= ~DMA_GCMD_TE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware completes it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (!(sts & DMA_GSTS_TES)), sts);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static int iommu_init_domains(struct intel_iommu *iommu)
+{
+ u32 ndomains, nlongs;
+ size_t size;
+
+ ndomains = cap_ndoms(iommu->cap);
+ pr_debug("%s: Number of Domains supported <%d>\n",
+ iommu->name, ndomains);
+ nlongs = BITS_TO_LONGS(ndomains);
+
+ spin_lock_init(&iommu->lock);
+
+ iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
+ if (!iommu->domain_ids) {
+ pr_err("%s: Allocating domain id array failed\n",
+ iommu->name);
+ return -ENOMEM;
+ }
+
+ size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
+ iommu->domains = kzalloc(size, GFP_KERNEL);
+
+ if (iommu->domains) {
+ size = 256 * sizeof(struct dmar_domain *);
+ iommu->domains[0] = kzalloc(size, GFP_KERNEL);
+ }
+
+ if (!iommu->domains || !iommu->domains[0]) {
+ pr_err("%s: Allocating domain array failed\n",
+ iommu->name);
+ kfree(iommu->domain_ids);
+ kfree(iommu->domains);
+ iommu->domain_ids = NULL;
+ iommu->domains = NULL;
+ return -ENOMEM;
+ }
+
+ /*
+ * If Caching mode is set, then invalid translations are tagged
+ * with domain-id 0, hence we need to pre-allocate it. We also
+ * use domain-id 0 as a marker for non-allocated domain-id, so
+ * make sure it is not used for a real domain.
+ */
+ set_bit(0, iommu->domain_ids);
+
+ /*
+	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
+ * entry for first-level or pass-through translation modes should
+ * be programmed with a domain id different from those used for
+ * second-level or nested translation. We reserve a domain id for
+ * this purpose.
+ */
+ if (sm_supported(iommu))
+ set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
+
+ return 0;
+}
+
+static void disable_dmar_iommu(struct intel_iommu *iommu)
+{
+ struct device_domain_info *info, *tmp;
+ unsigned long flags;
+
+ if (!iommu->domains || !iommu->domain_ids)
+ return;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
+ if (info->iommu != iommu)
+ continue;
+
+ if (!info->dev || !info->domain)
+ continue;
+
+ __dmar_remove_one_dev_info(info);
+ }
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ if (iommu->gcmd & DMA_GCMD_TE)
+ iommu_disable_translation(iommu);
+}
+
+static void free_dmar_iommu(struct intel_iommu *iommu)
+{
+ if ((iommu->domains) && (iommu->domain_ids)) {
+ int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
+ int i;
+
+ for (i = 0; i < elems; i++)
+ kfree(iommu->domains[i]);
+ kfree(iommu->domains);
+ kfree(iommu->domain_ids);
+ iommu->domains = NULL;
+ iommu->domain_ids = NULL;
+ }
+
+ g_iommus[iommu->seq_id] = NULL;
+
+ /* free context mapping */
+ free_context_table(iommu);
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ if (pasid_supported(iommu)) {
+ if (ecap_prs(iommu->ecap))
+ intel_svm_finish_prq(iommu);
+ }
+ if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
+ ioasid_unregister_allocator(&iommu->pasid_allocator);
+
+#endif
+}
+
+/*
+ * Check and return whether first level is used by default for
+ * DMA translation.
+ */
+static bool first_level_by_default(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ static int first_level_support = -1;
+
+ if (likely(first_level_support != -1))
+ return first_level_support;
+
+ first_level_support = 1;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
+ first_level_support = 0;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return first_level_support;
+}
+
+static struct dmar_domain *alloc_domain(int flags)
+{
+ struct dmar_domain *domain;
+
+ domain = alloc_domain_mem();
+ if (!domain)
+ return NULL;
+
+ memset(domain, 0, sizeof(*domain));
+ domain->nid = NUMA_NO_NODE;
+ domain->flags = flags;
+ if (first_level_by_default())
+ domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
+ domain->has_iotlb_device = false;
+ INIT_LIST_HEAD(&domain->devices);
+
+ return domain;
+}
+
+/* Must be called with iommu->lock */
+static int domain_attach_iommu(struct dmar_domain *domain,
+ struct intel_iommu *iommu)
+{
+ unsigned long ndomains;
+ int num;
+
+ assert_spin_locked(&device_domain_lock);
+ assert_spin_locked(&iommu->lock);
+
+ domain->iommu_refcnt[iommu->seq_id] += 1;
+ domain->iommu_count += 1;
+ if (domain->iommu_refcnt[iommu->seq_id] == 1) {
+ ndomains = cap_ndoms(iommu->cap);
+ num = find_first_zero_bit(iommu->domain_ids, ndomains);
+
+ if (num >= ndomains) {
+ pr_err("%s: No free domain ids\n", iommu->name);
+ domain->iommu_refcnt[iommu->seq_id] -= 1;
+ domain->iommu_count -= 1;
+ return -ENOSPC;
+ }
+
+ set_bit(num, iommu->domain_ids);
+ set_iommu_domain(iommu, num, domain);
+
+ domain->iommu_did[iommu->seq_id] = num;
+ domain->nid = iommu->node;
+
+ domain_update_iommu_cap(domain);
+ }
+
+ return 0;
+}
+
+static int domain_detach_iommu(struct dmar_domain *domain,
+ struct intel_iommu *iommu)
+{
+ int num, count;
+
+ assert_spin_locked(&device_domain_lock);
+ assert_spin_locked(&iommu->lock);
+
+ domain->iommu_refcnt[iommu->seq_id] -= 1;
+ count = --domain->iommu_count;
+ if (domain->iommu_refcnt[iommu->seq_id] == 0) {
+ num = domain->iommu_did[iommu->seq_id];
+ clear_bit(num, iommu->domain_ids);
+ set_iommu_domain(iommu, num, NULL);
+
+ domain_update_iommu_cap(domain);
+ domain->iommu_did[iommu->seq_id] = 0;
+ }
+
+ return count;
+}
+
+static struct iova_domain reserved_iova_list;
+static struct lock_class_key reserved_rbtree_key;
+
+static int dmar_init_reserved_ranges(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iova *iova;
+ int i;
+
+ init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
+
+ lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
+ &reserved_rbtree_key);
+
+ /* IOAPIC ranges shouldn't be accessed by DMA */
+ iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
+ IOVA_PFN(IOAPIC_RANGE_END));
+ if (!iova) {
+ pr_err("Reserve IOAPIC range failed\n");
+ return -ENODEV;
+ }
+
+ /* Reserve all PCI MMIO to avoid peer-to-peer access */
+ for_each_pci_dev(pdev) {
+ struct resource *r;
+
+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+ r = &pdev->resource[i];
+ if (!r->flags || !(r->flags & IORESOURCE_MEM))
+ continue;
+ iova = reserve_iova(&reserved_iova_list,
+ IOVA_PFN(r->start),
+ IOVA_PFN(r->end));
+ if (!iova) {
+ pci_err(pdev, "Reserve iova for %pR failed\n", r);
+ return -ENODEV;
+ }
+ }
+ }
+ return 0;
+}
+
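+/*
+ * Round a guest address width up to the next adjusted guest address width
+ * supported by the page-table layout (a 12-bit page offset plus 9 bits per
+ * level), capped at 64.  E.g. gaw == 39 is kept as-is, while gaw == 40 is
+ * rounded up to 48.
+ */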
+static inline int guestwidth_to_adjustwidth(int gaw)
+{
+ int agaw;
+ int r = (gaw - 12) % 9;
+
+ if (r == 0)
+ agaw = gaw;
+ else
+ agaw = gaw + 9 - r;
+ if (agaw > 64)
+ agaw = 64;
+ return agaw;
+}
+
+static void domain_exit(struct dmar_domain *domain)
+{
+
+ /* Remove associated devices and clear attached or cached domains */
+ domain_remove_dev_info(domain);
+
+ /* destroy iovas */
+ if (domain->domain.type == IOMMU_DOMAIN_DMA)
+ put_iova_domain(&domain->iovad);
+
+ if (domain->pgd) {
+ struct page *freelist;
+
+ freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+ dma_free_pagelist(freelist);
+ }
+
+ free_domain_mem(domain);
+}
+
+/*
+ * Get the PASID directory size for scalable mode context entry.
+ * Value of X in the PDTS field of a scalable mode context entry
+ * indicates PASID directory with 2^(X + 7) entries.
+ */
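+/*
+ * Worked example (assuming the usual 64-entry PASID tables, i.e.
+ * PASID_PDE_SHIFT == 6): for table->max_pasid == 1 << 20, max_pde is
+ * 1 << 14, so pds is 14 and this returns 7, encoding a PASID directory
+ * of 2^(7 + 7) = 2^14 entries.
+ */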
+static inline unsigned long context_get_sm_pds(struct pasid_table *table)
+{
+ int pds, max_pde;
+
+ max_pde = table->max_pasid >> PASID_PDE_SHIFT;
+ pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
+ if (pds < 7)
+ return 0;
+
+ return pds - 7;
+}
+
+/*
+ * Set the RID_PASID field of a scalable mode context entry. The
+ * IOMMU hardware will use the PASID value set in this field for
+ * DMA translations of DMA requests without PASID.
+ */
+static inline void
+context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
+{
+ context->hi |= pasid & ((1 << 20) - 1);
+ context->hi |= (1 << 20);
+}
+
+/*
+ * Set the DTE(Device-TLB Enable) field of a scalable mode context
+ * entry.
+ */
+static inline void context_set_sm_dte(struct context_entry *context)
+{
+ context->lo |= (1 << 2);
+}
+
+/*
+ * Set the PRE(Page Request Enable) field of a scalable mode context
+ * entry.
+ */
+static inline void context_set_sm_pre(struct context_entry *context)
+{
+ context->lo |= (1 << 4);
+}
+
+/* Convert value to context PASID directory size field coding. */
+#define context_pdts(pds) (((pds) & 0x7) << 9)
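+/*
+ * PDTS is a 3-bit field at bits 11:9 of the low qword of a scalable mode
+ * context entry; the 4KiB-aligned PASID directory pointer leaves those
+ * bits free (see domain_context_mapping_one() below).
+ */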
+
+static int domain_context_mapping_one(struct dmar_domain *domain,
+ struct intel_iommu *iommu,
+ struct pasid_table *table,
+ u8 bus, u8 devfn)
+{
+ u16 did = domain->iommu_did[iommu->seq_id];
+ int translation = CONTEXT_TT_MULTI_LEVEL;
+ struct device_domain_info *info = NULL;
+ struct context_entry *context;
+ unsigned long flags;
+ int ret;
+
+ WARN_ON(did == 0);
+
+ if (hw_pass_through && domain_type_is_si(domain))
+ translation = CONTEXT_TT_PASS_THROUGH;
+
+ pr_debug("Set context mapping for %02x:%02x.%d\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+ BUG_ON(!domain->pgd);
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ spin_lock(&iommu->lock);
+
+ ret = -ENOMEM;
+ context = iommu_context_addr(iommu, bus, devfn, 1);
+ if (!context)
+ goto out_unlock;
+
+ ret = 0;
+ if (context_present(context))
+ goto out_unlock;
+
+ /*
+ * For kdump cases, old valid entries may be cached due to the
+ * in-flight DMA and copied pgtable, but there is no unmapping
+ * behaviour for them, thus we need an explicit cache flush for
+ * the newly-mapped device. For kdump, at this point, the device
+	 * is supposed to have finished its reset at the driver probe stage,
+	 * so no in-flight DMA will exist, and we don't need to worry about
+	 * it hereafter.
+ */
+ if (context_copied(context)) {
+ u16 did_old = context_domain_id(context);
+
+ if (did_old < cap_ndoms(iommu->cap)) {
+ iommu->flush.flush_context(iommu, did_old,
+ (((u16)bus) << 8) | devfn,
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
+ DMA_TLB_DSI_FLUSH);
+ }
+ }
+
+ context_clear_entry(context);
+
+ if (sm_supported(iommu)) {
+ unsigned long pds;
+
+ WARN_ON(!table);
+
+ /* Setup the PASID DIR pointer: */
+ pds = context_get_sm_pds(table);
+ context->lo = (u64)virt_to_phys(table->table) |
+ context_pdts(pds);
+
+ /* Setup the RID_PASID field: */
+ context_set_sm_rid2pasid(context, PASID_RID2PASID);
+
+ /*
+ * Setup the Device-TLB enable bit and Page request
+ * Enable bit:
+ */
+ info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
+ if (info && info->ats_supported)
+ context_set_sm_dte(context);
+ if (info && info->pri_supported)
+ context_set_sm_pre(context);
+ } else {
+ struct dma_pte *pgd = domain->pgd;
+ int agaw;
+
+ context_set_domain_id(context, did);
+
+ if (translation != CONTEXT_TT_PASS_THROUGH) {
+ /*
+ * Skip top levels of page tables for iommu which has
+ * less agaw than default. Unnecessary for PT mode.
+ */
+ for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
+ ret = -ENOMEM;
+ pgd = phys_to_virt(dma_pte_addr(pgd));
+ if (!dma_pte_present(pgd))
+ goto out_unlock;
+ }
+
+ info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
+ if (info && info->ats_supported)
+ translation = CONTEXT_TT_DEV_IOTLB;
+ else
+ translation = CONTEXT_TT_MULTI_LEVEL;
+
+ context_set_address_root(context, virt_to_phys(pgd));
+ context_set_address_width(context, agaw);
+ } else {
+ /*
+ * In pass through mode, AW must be programmed to
+ * indicate the largest AGAW value supported by
+ * hardware. And ASR is ignored by hardware.
+ */
+ context_set_address_width(context, iommu->msagaw);
+ }
+
+ context_set_translation_type(context, translation);
+ }
+
+ context_set_fault_enable(context);
+ context_set_present(context);
+ domain_flush_cache(domain, context, sizeof(*context));
+
+ /*
+ * It's a non-present to present mapping. If hardware doesn't cache
+	 * non-present entries we only need to flush the write-buffer. If it
+	 * _does_ cache non-present entries, then it does so in the special
+ * domain #0, which we have to flush:
+ */
+ if (cap_caching_mode(iommu->cap)) {
+ iommu->flush.flush_context(iommu, 0,
+ (((u16)bus) << 8) | devfn,
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+ } else {
+ iommu_flush_write_buffer(iommu);
+ }
+ iommu_enable_dev_iotlb(info);
+
+ ret = 0;
+
+out_unlock:
+ spin_unlock(&iommu->lock);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return ret;
+}
+
+struct domain_context_mapping_data {
+ struct dmar_domain *domain;
+ struct intel_iommu *iommu;
+ struct pasid_table *table;
+};
+
+static int domain_context_mapping_cb(struct pci_dev *pdev,
+ u16 alias, void *opaque)
+{
+ struct domain_context_mapping_data *data = opaque;
+
+ return domain_context_mapping_one(data->domain, data->iommu,
+ data->table, PCI_BUS_NUM(alias),
+ alias & 0xff);
+}
+
+static int
+domain_context_mapping(struct dmar_domain *domain, struct device *dev)
+{
+ struct domain_context_mapping_data data;
+ struct pasid_table *table;
+ struct intel_iommu *iommu;
+ u8 bus, devfn;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ table = intel_pasid_get_table(dev);
+
+ if (!dev_is_pci(dev))
+ return domain_context_mapping_one(domain, iommu, table,
+ bus, devfn);
+
+ data.domain = domain;
+ data.iommu = iommu;
+ data.table = table;
+
+ return pci_for_each_dma_alias(to_pci_dev(dev),
+ &domain_context_mapping_cb, &data);
+}
+
+static int domain_context_mapped_cb(struct pci_dev *pdev,
+ u16 alias, void *opaque)
+{
+ struct intel_iommu *iommu = opaque;
+
+ return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
+}
+
+static int domain_context_mapped(struct device *dev)
+{
+ struct intel_iommu *iommu;
+ u8 bus, devfn;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ if (!dev_is_pci(dev))
+ return device_context_mapped(iommu, bus, devfn);
+
+ return !pci_for_each_dma_alias(to_pci_dev(dev),
+ domain_context_mapped_cb, iommu);
+}
+
+/* Returns a number of VTD pages, but aligned to MM page size */
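+/*
+ * E.g. with 4KiB pages, host_addr == 0x1800 and size == 0x1000 touch two
+ * MM pages, so this returns 2 (VTD pages).
+ */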
+static inline unsigned long aligned_nrpages(unsigned long host_addr,
+ size_t size)
+{
+ host_addr &= ~PAGE_MASK;
+ return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
+}
+
+/* Return largest possible superpage level for a given mapping */
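+/*
+ * E.g. when both iov_pfn and phy_pfn are 512-page (2MiB) aligned, the
+ * mapping covers at least 512 pages and domain->iommu_superpage is at
+ * least 1, this returns 2 and 2MiB PTEs can be used for the mapping.
+ */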
+static inline int hardware_largepage_caps(struct dmar_domain *domain,
+ unsigned long iov_pfn,
+ unsigned long phy_pfn,
+ unsigned long pages)
+{
+ int support, level = 1;
+ unsigned long pfnmerge;
+
+ support = domain->iommu_superpage;
+
+ /* To use a large page, the virtual *and* physical addresses
+ must be aligned to 2MiB/1GiB/etc. Lower bits set in either
+ of them will mean we have to use smaller pages. So just
+ merge them and check both at once. */
+ pfnmerge = iov_pfn | phy_pfn;
+
+ while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
+ pages >>= VTD_STRIDE_SHIFT;
+ if (!pages)
+ break;
+ pfnmerge >>= VTD_STRIDE_SHIFT;
+ level++;
+ support--;
+ }
+ return level;
+}
+
+static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+ struct scatterlist *sg, unsigned long phys_pfn,
+ unsigned long nr_pages, int prot)
+{
+ struct dma_pte *first_pte = NULL, *pte = NULL;
+ phys_addr_t uninitialized_var(pteval);
+ unsigned long sg_res = 0;
+ unsigned int largepage_lvl = 0;
+ unsigned long lvl_pages = 0;
+ u64 attr;
+
+ BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
+
+ if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
+ return -EINVAL;
+
+ attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
+ if (domain_use_first_level(domain))
+ attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
+
+ if (!sg) {
+ sg_res = nr_pages;
+ pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
+ }
+
+ while (nr_pages > 0) {
+ uint64_t tmp;
+
+ if (!sg_res) {
+ unsigned int pgoff = sg->offset & ~PAGE_MASK;
+
+ sg_res = aligned_nrpages(sg->offset, sg->length);
+ sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
+ sg->dma_length = sg->length;
+ pteval = (sg_phys(sg) - pgoff) | attr;
+ phys_pfn = pteval >> VTD_PAGE_SHIFT;
+ }
+
+ if (!pte) {
+ largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
+
+ first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
+ if (!pte)
+ return -ENOMEM;
+			/* It is a large page */
+ if (largepage_lvl > 1) {
+ unsigned long nr_superpages, end_pfn;
+
+ pteval |= DMA_PTE_LARGE_PAGE;
+ lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+ nr_superpages = sg_res / lvl_pages;
+ end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
+
+ /*
+ * Ensure that old small page tables are
+ * removed to make room for superpage(s).
+ * We're adding new large pages, so make sure
+ * we don't remove their parent tables.
+ */
+ dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
+ largepage_lvl + 1);
+ } else {
+ pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
+ }
+
+ }
+ /* We don't need lock here, nobody else
+ * touches the iova range
+ */
+ tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
+ if (tmp) {
+ static int dumps = 5;
+ pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
+ iov_pfn, tmp, (unsigned long long)pteval);
+ if (dumps) {
+ dumps--;
+ debug_dma_dump_mappings(NULL);
+ }
+ WARN_ON(1);
+ }
+
+ lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+ BUG_ON(nr_pages < lvl_pages);
+ BUG_ON(sg_res < lvl_pages);
+
+ nr_pages -= lvl_pages;
+ iov_pfn += lvl_pages;
+ phys_pfn += lvl_pages;
+ pteval += lvl_pages * VTD_PAGE_SIZE;
+ sg_res -= lvl_pages;
+
+ /* If the next PTE would be the first in a new page, then we
+ need to flush the cache on the entries we've just written.
+ And then we'll need to recalculate 'pte', so clear it and
+ let it get set again in the if (!pte) block above.
+
+ If we're done (!nr_pages) we need to flush the cache too.
+
+ Also if we've been setting superpages, we may need to
+ recalculate 'pte' and switch back to smaller pages for the
+ end of the mapping, if the trailing size is not enough to
+ use another superpage (i.e. sg_res < lvl_pages). */
+ pte++;
+ if (!nr_pages || first_pte_in_page(pte) ||
+ (largepage_lvl > 1 && sg_res < lvl_pages)) {
+ domain_flush_cache(domain, first_pte,
+ (void *)pte - (void *)first_pte);
+ pte = NULL;
+ }
+
+ if (!sg_res && nr_pages)
+ sg = sg_next(sg);
+ }
+ return 0;
+}
+
+static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+ struct scatterlist *sg, unsigned long phys_pfn,
+ unsigned long nr_pages, int prot)
+{
+ int iommu_id, ret;
+ struct intel_iommu *iommu;
+
+ /* Do the real mapping first */
+ ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
+ if (ret)
+ return ret;
+
+ for_each_domain_iommu(iommu_id, domain) {
+ iommu = g_iommus[iommu_id];
+ __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
+ }
+
+ return 0;
+}
+
+static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+ struct scatterlist *sg, unsigned long nr_pages,
+ int prot)
+{
+ return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
+}
+
+static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+ unsigned long phys_pfn, unsigned long nr_pages,
+ int prot)
+{
+ return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
+}
+
+static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+ unsigned long flags;
+ struct context_entry *context;
+ u16 did_old;
+
+ if (!iommu)
+ return;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ context = iommu_context_addr(iommu, bus, devfn, 0);
+ if (!context) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return;
+ }
+ did_old = context_domain_id(context);
+ context_clear_entry(context);
+ __iommu_flush_cache(iommu, context, sizeof(*context));
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ iommu->flush.flush_context(iommu,
+ did_old,
+ (((u16)bus) << 8) | devfn,
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ iommu->flush.flush_iotlb(iommu,
+ did_old,
+ 0,
+ 0,
+ DMA_TLB_DSI_FLUSH);
+}
+
+static inline void unlink_domain_info(struct device_domain_info *info)
+{
+ assert_spin_locked(&device_domain_lock);
+ list_del(&info->link);
+ list_del(&info->global);
+ if (info->dev)
+ info->dev->archdata.iommu = NULL;
+}
+
+static void domain_remove_dev_info(struct dmar_domain *domain)
+{
+ struct device_domain_info *info, *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ list_for_each_entry_safe(info, tmp, &domain->devices, link)
+ __dmar_remove_one_dev_info(info);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+struct dmar_domain *find_domain(struct device *dev)
+{
+ struct device_domain_info *info;
+
+ if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
+ return NULL;
+
+ /* No lock here, assumes no domain exit in normal case */
+ info = get_domain_info(dev);
+ if (likely(info))
+ return info->domain;
+
+ return NULL;
+}
+
+static void do_deferred_attach(struct device *dev)
+{
+ struct iommu_domain *domain;
+
+ dev->archdata.iommu = NULL;
+ domain = iommu_get_domain_for_dev(dev);
+ if (domain)
+ intel_iommu_attach_device(domain, dev);
+}
+
+static inline struct device_domain_info *
+dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
+{
+ struct device_domain_info *info;
+
+ list_for_each_entry(info, &device_domain_list, global)
+ if (info->segment == segment && info->bus == bus &&
+ info->devfn == devfn)
+ return info;
+
+ return NULL;
+}
+
+static int domain_setup_first_level(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev,
+ int pasid)
+{
+ int flags = PASID_FLAG_SUPERVISOR_MODE;
+ struct dma_pte *pgd = domain->pgd;
+ int agaw, level;
+
+ /*
+ * Skip top levels of page tables for iommu which has
+ * less agaw than default. Unnecessary for PT mode.
+ */
+ for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
+ pgd = phys_to_virt(dma_pte_addr(pgd));
+ if (!dma_pte_present(pgd))
+ return -ENOMEM;
+ }
+
+ level = agaw_to_level(agaw);
+ if (level != 4 && level != 5)
+ return -EINVAL;
+
+ flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
+
+ return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
+ domain->iommu_did[iommu->seq_id],
+ flags);
+}
+
+static bool dev_is_real_dma_subdevice(struct device *dev)
+{
+ return dev && dev_is_pci(dev) &&
+ pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
+}
+
+static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
+ int bus, int devfn,
+ struct device *dev,
+ struct dmar_domain *domain)
+{
+ struct dmar_domain *found = NULL;
+ struct device_domain_info *info;
+ unsigned long flags;
+ int ret;
+
+ info = alloc_devinfo_mem();
+ if (!info)
+ return NULL;
+
+ if (!dev_is_real_dma_subdevice(dev)) {
+ info->bus = bus;
+ info->devfn = devfn;
+ info->segment = iommu->segment;
+ } else {
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ info->bus = pdev->bus->number;
+ info->devfn = pdev->devfn;
+ info->segment = pci_domain_nr(pdev->bus);
+ }
+
+ info->ats_supported = info->pasid_supported = info->pri_supported = 0;
+ info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
+ info->ats_qdep = 0;
+ info->dev = dev;
+ info->domain = domain;
+ info->iommu = iommu;
+ info->pasid_table = NULL;
+ info->auxd_enabled = 0;
+ INIT_LIST_HEAD(&info->auxiliary_domains);
+
+ if (dev && dev_is_pci(dev)) {
+ struct pci_dev *pdev = to_pci_dev(info->dev);
+
+ if (ecap_dev_iotlb_support(iommu->ecap) &&
+ pci_ats_supported(pdev) &&
+ dmar_find_matched_atsr_unit(pdev))
+ info->ats_supported = 1;
+
+ if (sm_supported(iommu)) {
+ if (pasid_supported(iommu)) {
+ int features = pci_pasid_features(pdev);
+ if (features >= 0)
+ info->pasid_supported = features | 1;
+ }
+
+ if (info->ats_supported && ecap_prs(iommu->ecap) &&
+ pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
+ info->pri_supported = 1;
+ }
+ }
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ if (dev)
+ found = find_domain(dev);
+
+ if (!found) {
+ struct device_domain_info *info2;
+ info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
+ info->devfn);
+ if (info2) {
+ found = info2->domain;
+ info2->dev = dev;
+ }
+ }
+
+ if (found) {
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ free_devinfo_mem(info);
+ /* Caller must free the original domain */
+ return found;
+ }
+
+ spin_lock(&iommu->lock);
+ ret = domain_attach_iommu(domain, iommu);
+ spin_unlock(&iommu->lock);
+
+ if (ret) {
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ free_devinfo_mem(info);
+ return NULL;
+ }
+
+ list_add(&info->link, &domain->devices);
+ list_add(&info->global, &device_domain_list);
+ if (dev)
+ dev->archdata.iommu = info;
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ /* PASID table is mandatory for a PCI device in scalable mode. */
+ if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
+ ret = intel_pasid_alloc_table(dev);
+ if (ret) {
+ dev_err(dev, "PASID table allocation failed\n");
+ dmar_remove_one_dev_info(dev);
+ return NULL;
+ }
+
+ /* Setup the PASID entry for requests without PASID: */
+ spin_lock(&iommu->lock);
+ if (hw_pass_through && domain_type_is_si(domain))
+ ret = intel_pasid_setup_pass_through(iommu, domain,
+ dev, PASID_RID2PASID);
+ else if (domain_use_first_level(domain))
+ ret = domain_setup_first_level(iommu, domain, dev,
+ PASID_RID2PASID);
+ else
+ ret = intel_pasid_setup_second_level(iommu, domain,
+ dev, PASID_RID2PASID);
+ spin_unlock(&iommu->lock);
+ if (ret) {
+ dev_err(dev, "Setup RID2PASID failed\n");
+ dmar_remove_one_dev_info(dev);
+ return NULL;
+ }
+ }
+
+ if (dev && domain_context_mapping(domain, dev)) {
+ dev_err(dev, "Domain context map failed\n");
+ dmar_remove_one_dev_info(dev);
+ return NULL;
+ }
+
+ return domain;
+}
+
+static int iommu_domain_identity_map(struct dmar_domain *domain,
+ unsigned long first_vpfn,
+ unsigned long last_vpfn)
+{
+ /*
+	 * The RMRR range might overlap with the physical memory range;
+	 * clear it first.
+ */
+ dma_pte_clear_range(domain, first_vpfn, last_vpfn);
+
+ return __domain_mapping(domain, first_vpfn, NULL,
+ first_vpfn, last_vpfn - first_vpfn + 1,
+ DMA_PTE_READ|DMA_PTE_WRITE);
+}
+
+static int md_domain_init(struct dmar_domain *domain, int guest_width);
+
+static int __init si_domain_init(int hw)
+{
+ struct dmar_rmrr_unit *rmrr;
+ struct device *dev;
+ int i, nid, ret;
+
+ si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
+ if (!si_domain)
+ return -EFAULT;
+
+ if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+ domain_exit(si_domain);
+ return -EFAULT;
+ }
+
+ if (hw)
+ return 0;
+
+ for_each_online_node(nid) {
+ unsigned long start_pfn, end_pfn;
+ int i;
+
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ ret = iommu_domain_identity_map(si_domain,
+ mm_to_dma_pfn(start_pfn),
+ mm_to_dma_pfn(end_pfn));
+ if (ret)
+ return ret;
+ }
+ }
+
+ /*
+	 * Identity map the RMRRs so that devices with RMRRs can also use
+ * the si_domain.
+ */
+ for_each_rmrr_units(rmrr) {
+ for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
+ i, dev) {
+ unsigned long long start = rmrr->base_address;
+ unsigned long long end = rmrr->end_address;
+
+ if (WARN_ON(end < start ||
+ end >> agaw_to_width(si_domain->agaw)))
+ continue;
+
+ ret = iommu_domain_identity_map(si_domain, start, end);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
+{
+ struct dmar_domain *ndomain;
+ struct intel_iommu *iommu;
+ u8 bus, devfn;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
+ if (ndomain != domain)
+ return -EBUSY;
+
+ return 0;
+}
+
+static bool device_has_rmrr(struct device *dev)
+{
+ struct dmar_rmrr_unit *rmrr;
+ struct device *tmp;
+ int i;
+
+ rcu_read_lock();
+ for_each_rmrr_units(rmrr) {
+ /*
+ * Return TRUE if this RMRR contains the device that
+ * is passed in.
+ */
+ for_each_active_dev_scope(rmrr->devices,
+ rmrr->devices_cnt, i, tmp)
+ if (tmp == dev ||
+ is_downstream_to_pci_bridge(dev, tmp)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+/**
+ * device_rmrr_is_relaxable - Test whether the RMRR of this device
+ * is relaxable (i.e. is allowed to be left unenforced under some conditions)
+ * @dev: device handle
+ *
+ * We assume that PCI USB devices with RMRRs have them largely
+ * for historical reasons and that the RMRR space is not actively used post
+ * boot. This exclusion may change if vendors begin to abuse it.
+ *
+ * The same exception is made for graphics devices, with the requirement that
+ * any use of the RMRR regions will be torn down before assigning the device
+ * to a guest.
+ *
+ * Return: true if the RMRR is relaxable, false otherwise
+ */
+static bool device_rmrr_is_relaxable(struct device *dev)
+{
+ struct pci_dev *pdev;
+
+ if (!dev_is_pci(dev))
+ return false;
+
+ pdev = to_pci_dev(dev);
+ if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
+ return true;
+ else
+ return false;
+}
+
+/*
+ * There are a couple of cases where we need to restrict the functionality of
+ * devices associated with RMRRs. The first is when evaluating a device for
+ * identity mapping because problems exist when devices are moved in and out
+ * of domains and their respective RMRR information is lost. This means that
+ * a device with associated RMRRs will never be in a "passthrough" domain.
+ * The second is use of the device through the IOMMU API. This interface
+ * expects to have full control of the IOVA space for the device. We cannot
+ * satisfy both the requirement that RMRR access is maintained and have an
+ * unencumbered IOVA space. We also have no ability to quiesce the device's
+ * use of the RMRR space or even inform the IOMMU API user of the restriction.
+ * We therefore prevent devices associated with an RMRR from participating in
+ * the IOMMU API, which eliminates them from device assignment.
+ *
+ * In both cases, devices which have relaxable RMRRs are not concerned by this
+ * restriction. See device_rmrr_is_relaxable comment.
+ */
+static bool device_is_rmrr_locked(struct device *dev)
+{
+ if (!device_has_rmrr(dev))
+ return false;
+
+ if (device_rmrr_is_relaxable(dev))
+ return false;
+
+ return true;
+}
+
+/*
+ * Return the required default domain type for a specific device.
+ *
+ * @dev: the device in query
+ *
+ * Returns:
+ * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
+ * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
+ * - 0: both identity and dynamic domains work for this device
+ */
+static int device_def_domain_type(struct device *dev)
+{
+ if (dev_is_pci(dev)) {
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ /*
+ * Prevent any device marked as untrusted from getting
+		 * placed into the static identity mapping domain.
+ */
+ if (pdev->untrusted)
+ return IOMMU_DOMAIN_DMA;
+
+ if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
+ return IOMMU_DOMAIN_IDENTITY;
+
+ if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
+ return IOMMU_DOMAIN_IDENTITY;
+ }
+
+ return 0;
+}
+
+static void intel_iommu_init_qi(struct intel_iommu *iommu)
+{
+ /*
+	 * Start from a sane iommu hardware state.
+	 * If queued invalidation has already been initialized by us
+	 * (for example, while enabling interrupt-remapping) then
+	 * things are already rolling from a sane state.
+ */
+ if (!iommu->qi) {
+ /*
+ * Clear any previous faults.
+ */
+ dmar_fault(-1, iommu);
+ /*
+ * Disable queued invalidation if supported and already enabled
+ * before OS handover.
+ */
+ dmar_disable_qi(iommu);
+ }
+
+ if (dmar_enable_qi(iommu)) {
+ /*
+ * Queued Invalidate not enabled, use Register Based Invalidate
+ */
+ iommu->flush.flush_context = __iommu_flush_context;
+ iommu->flush.flush_iotlb = __iommu_flush_iotlb;
+ pr_info("%s: Using Register based invalidation\n",
+ iommu->name);
+ } else {
+ iommu->flush.flush_context = qi_flush_context;
+ iommu->flush.flush_iotlb = qi_flush_iotlb;
+ pr_info("%s: Using Queued invalidation\n", iommu->name);
+ }
+}
+
+static int copy_context_table(struct intel_iommu *iommu,
+ struct root_entry *old_re,
+ struct context_entry **tbl,
+ int bus, bool ext)
+{
+ int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
+ struct context_entry *new_ce = NULL, ce;
+ struct context_entry *old_ce = NULL;
+ struct root_entry re;
+ phys_addr_t old_ce_phys;
+
+ tbl_idx = ext ? bus * 2 : bus;
+ memcpy(&re, old_re, sizeof(re));
+
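+	/*
+	 * With the extended root-entry format each bus has two context
+	 * tables: devfn 0x00-0x7f is reached through the lower context
+	 * table pointer and devfn 0x80-0xff through the upper one, hence
+	 * two table slots per bus when ext is set.
+	 */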
+ for (devfn = 0; devfn < 256; devfn++) {
+ /* First calculate the correct index */
+ idx = (ext ? devfn * 2 : devfn) % 256;
+
+ if (idx == 0) {
+ /* First save what we may have and clean up */
+ if (new_ce) {
+ tbl[tbl_idx] = new_ce;
+ __iommu_flush_cache(iommu, new_ce,
+ VTD_PAGE_SIZE);
+ pos = 1;
+ }
+
+ if (old_ce)
+ memunmap(old_ce);
+
+ ret = 0;
+ if (devfn < 0x80)
+ old_ce_phys = root_entry_lctp(&re);
+ else
+ old_ce_phys = root_entry_uctp(&re);
+
+ if (!old_ce_phys) {
+ if (ext && devfn == 0) {
+ /* No LCTP, try UCTP */
+ devfn = 0x7f;
+ continue;
+ } else {
+ goto out;
+ }
+ }
+
+ ret = -ENOMEM;
+ old_ce = memremap(old_ce_phys, PAGE_SIZE,
+ MEMREMAP_WB);
+ if (!old_ce)
+ goto out;
+
+ new_ce = alloc_pgtable_page(iommu->node);
+ if (!new_ce)
+ goto out_unmap;
+
+ ret = 0;
+ }
+
+ /* Now copy the context entry */
+ memcpy(&ce, old_ce + idx, sizeof(ce));
+
+ if (!__context_present(&ce))
+ continue;
+
+ did = context_domain_id(&ce);
+ if (did >= 0 && did < cap_ndoms(iommu->cap))
+ set_bit(did, iommu->domain_ids);
+
+ /*
+ * We need a marker for copied context entries. This
+ * marker needs to work for the old format as well as
+ * for extended context entries.
+ *
+ * Bit 67 of the context entry is used. In the old
+ * format this bit is available to software, in the
+ * extended format it is the PGE bit, but PGE is ignored
+ * by HW if PASIDs are disabled (and thus still
+ * available).
+ *
+ * So disable PASIDs first and then mark the entry
+ * copied. This means that we don't copy PASID
+ * translations from the old kernel, but this is fine as
+ * faults there are not fatal.
+ */
+ context_clear_pasid_enable(&ce);
+ context_set_copied(&ce);
+
+ new_ce[idx] = ce;
+ }
+
+ tbl[tbl_idx + pos] = new_ce;
+
+ __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
+
+out_unmap:
+ memunmap(old_ce);
+
+out:
+ return ret;
+}
+
+static int copy_translation_tables(struct intel_iommu *iommu)
+{
+ struct context_entry **ctxt_tbls;
+ struct root_entry *old_rt;
+ phys_addr_t old_rt_phys;
+ int ctxt_table_entries;
+ unsigned long flags;
+ u64 rtaddr_reg;
+ int bus, ret;
+ bool new_ext, ext;
+
+ rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
+ ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
+ new_ext = !!ecap_ecs(iommu->ecap);
+
+ /*
+ * The RTT bit can only be changed when translation is disabled,
+	 * but disabling translation means opening a window for data
+	 * corruption. So bail out and don't copy anything if we would
+ * have to change the bit.
+ */
+ if (new_ext != ext)
+ return -EINVAL;
+
+ old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
+ if (!old_rt_phys)
+ return -EINVAL;
+
+ old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
+ if (!old_rt)
+ return -ENOMEM;
+
+ /* This is too big for the stack - allocate it from slab */
+ ctxt_table_entries = ext ? 512 : 256;
+ ret = -ENOMEM;
+ ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
+ if (!ctxt_tbls)
+ goto out_unmap;
+
+ for (bus = 0; bus < 256; bus++) {
+ ret = copy_context_table(iommu, &old_rt[bus],
+ ctxt_tbls, bus, ext);
+ if (ret) {
+ pr_err("%s: Failed to copy context table for bus %d\n",
+ iommu->name, bus);
+ continue;
+ }
+ }
+
+ spin_lock_irqsave(&iommu->lock, flags);
+
+ /* Context tables are copied, now write them to the root_entry table */
+ for (bus = 0; bus < 256; bus++) {
+ int idx = ext ? bus * 2 : bus;
+ u64 val;
+
+ if (ctxt_tbls[idx]) {
+ val = virt_to_phys(ctxt_tbls[idx]) | 1;
+ iommu->root_entry[bus].lo = val;
+ }
+
+ if (!ext || !ctxt_tbls[idx + 1])
+ continue;
+
+ val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
+ iommu->root_entry[bus].hi = val;
+ }
+
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ kfree(ctxt_tbls);
+
+ __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
+
+ ret = 0;
+
+out_unmap:
+ memunmap(old_rt);
+
+ return ret;
+}
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
+{
+ struct intel_iommu *iommu = data;
+ ioasid_t ioasid;
+
+ if (!iommu)
+ return INVALID_IOASID;
+ /*
+ * VT-d virtual command interface always uses the full 20 bit
+	 * PASID range. The host can partition the guest PASID range based
+	 * on policies, but this is out of the guest's control.
+ */
+ if (min < PASID_MIN || max > intel_pasid_max_id)
+ return INVALID_IOASID;
+
+ if (vcmd_alloc_pasid(iommu, &ioasid))
+ return INVALID_IOASID;
+
+ return ioasid;
+}
+
+static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
+{
+ struct intel_iommu *iommu = data;
+
+ if (!iommu)
+ return;
+ /*
+	 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
+ * We can only free the PASID when all the devices are unbound.
+ */
+ if (ioasid_find(NULL, ioasid, NULL)) {
+ pr_alert("Cannot free active IOASID %d\n", ioasid);
+ return;
+ }
+ vcmd_free_pasid(iommu, ioasid);
+}
+
+static void register_pasid_allocator(struct intel_iommu *iommu)
+{
+ /*
+	 * If we are running in the host, there is no need for a custom
+	 * allocator since PASIDs are allocated host system-wide.
+ */
+ if (!cap_caching_mode(iommu->cap))
+ return;
+
+ if (!sm_supported(iommu)) {
+ pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
+ return;
+ }
+
+ /*
+	 * Register a custom PASID allocator if we are running in a guest;
+	 * guest PASIDs must be obtained via the virtual command interface.
+ * There can be multiple vIOMMUs in each guest but only one allocator
+ * is active. All vIOMMU allocators will eventually be calling the same
+ * host allocator.
+ */
+ if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
+ return;
+
+ pr_info("Register custom PASID allocator\n");
+ iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
+ iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
+ iommu->pasid_allocator.pdata = (void *)iommu;
+ if (ioasid_register_allocator(&iommu->pasid_allocator)) {
+ pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
+ /*
+ * Disable scalable mode on this IOMMU if there
+ * is no custom allocator. Mixing SM capable vIOMMU
+		 * and non-SM vIOMMU is not supported.
+ */
+ intel_iommu_sm = 0;
+ }
+}
+#endif
+
+static int __init init_dmars(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ int ret;
+
+ /*
+ * for each drhd
+ * allocate root
+ * initialize and program root entry to not present
+ * endfor
+ */
+ for_each_drhd_unit(drhd) {
+ /*
+		 * Lock not needed as this is only incremented in the
+		 * single-threaded kernel __init code path; all other
+		 * accesses are read-only.
+ */
+ if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
+ g_num_of_iommus++;
+ continue;
+ }
+ pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
+ }
+
+ /* Preallocate enough resources for IOMMU hot-addition */
+ if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
+ g_num_of_iommus = DMAR_UNITS_SUPPORTED;
+
+ g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
+ GFP_KERNEL);
+ if (!g_iommus) {
+ pr_err("Allocating global iommu array failed\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ for_each_iommu(iommu, drhd) {
+ if (drhd->ignored) {
+ iommu_disable_translation(iommu);
+ continue;
+ }
+
+ /*
+		 * Find the max pasid size of all IOMMUs in the system.
+ * We need to ensure the system pasid table is no bigger
+ * than the smallest supported.
+ */
+ if (pasid_supported(iommu)) {
+ u32 temp = 2 << ecap_pss(iommu->ecap);
+
+ intel_pasid_max_id = min_t(u32, temp,
+ intel_pasid_max_id);
+ }
+
+ g_iommus[iommu->seq_id] = iommu;
+
+ intel_iommu_init_qi(iommu);
+
+ ret = iommu_init_domains(iommu);
+ if (ret)
+ goto free_iommu;
+
+ init_translation_status(iommu);
+
+ if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
+ iommu_disable_translation(iommu);
+ clear_translation_pre_enabled(iommu);
+ pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
+ iommu->name);
+ }
+
+ /*
+ * TBD:
+ * we could share the same root & context tables
+		 * among all IOMMUs. Need to split it later.
+ */
+ ret = iommu_alloc_root_entry(iommu);
+ if (ret)
+ goto free_iommu;
+
+ if (translation_pre_enabled(iommu)) {
+ pr_info("Translation already enabled - trying to copy translation structures\n");
+
+ ret = copy_translation_tables(iommu);
+ if (ret) {
+ /*
+ * We found the IOMMU with translation
+ * enabled - but failed to copy over the
+ * old root-entry table. Try to proceed
+ * by disabling translation now and
+ * allocating a clean root-entry table.
+ * This might cause DMAR faults, but
+ * probably the dump will still succeed.
+ */
+ pr_err("Failed to copy translation tables from previous kernel for %s\n",
+ iommu->name);
+ iommu_disable_translation(iommu);
+ clear_translation_pre_enabled(iommu);
+ } else {
+ pr_info("Copied translation tables from previous kernel for %s\n",
+ iommu->name);
+ }
+ }
+
+ if (!ecap_pass_through(iommu->ecap))
+ hw_pass_through = 0;
+ intel_svm_check(iommu);
+ }
+
+ /*
+ * Now that qi is enabled on all iommus, set the root entry and flush
+ * caches. This is required on some Intel X58 chipsets, otherwise the
+ * flush_context function will loop forever and the boot hangs.
+ */
+ for_each_active_iommu(iommu, drhd) {
+ iommu_flush_write_buffer(iommu);
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ register_pasid_allocator(iommu);
+#endif
+ iommu_set_root_entry(iommu);
+ iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
+ iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+ }
+
+#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
+ dmar_map_gfx = 0;
+#endif
+
+ if (!dmar_map_gfx)
+ iommu_identity_mapping |= IDENTMAP_GFX;
+
+ check_tylersburg_isoch();
+
+ ret = si_domain_init(hw_pass_through);
+ if (ret)
+ goto free_iommu;
+
+ /*
+ * for each drhd
+ * enable fault log
+ * global invalidate context cache
+ * global invalidate iotlb
+ * enable translation
+ */
+ for_each_iommu(iommu, drhd) {
+ if (drhd->ignored) {
+ /*
+ * we always have to disable PMRs or DMA may fail on
+ * this device
+ */
+ if (force_on)
+ iommu_disable_protect_mem_regions(iommu);
+ continue;
+ }
+
+ iommu_flush_write_buffer(iommu);
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
+ /*
+			 * Calling dmar_alloc_hwirq() with dmar_global_lock
+			 * held could cause a lock race condition.
+ */
+ up_write(&dmar_global_lock);
+ ret = intel_svm_enable_prq(iommu);
+ down_write(&dmar_global_lock);
+ if (ret)
+ goto free_iommu;
+ }
+#endif
+ ret = dmar_set_interrupt(iommu);
+ if (ret)
+ goto free_iommu;
+ }
+
+ return 0;
+
+free_iommu:
+ for_each_active_iommu(iommu, drhd) {
+ disable_dmar_iommu(iommu);
+ free_dmar_iommu(iommu);
+ }
+
+ kfree(g_iommus);
+
+error:
+ return ret;
+}
+
+/* This takes a number of _MM_ pages, not VTD pages */
+static unsigned long intel_alloc_iova(struct device *dev,
+ struct dmar_domain *domain,
+ unsigned long nrpages, uint64_t dma_mask)
+{
+ unsigned long iova_pfn;
+
+ /*
+ * Restrict dma_mask to the width that the iommu can handle.
+ * First-level translation restricts the input-address to a
+ * canonical address (i.e., address bits 63:N have the same
+ * value as address bit [N-1], where N is 48-bits with 4-level
+ * paging and 57-bits with 5-level paging). Hence, skip bit
+ * [N-1].
+ */
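+	/*
+	 * E.g. with 4-level first-level paging (gaw == 48) the mask is
+	 * clamped to DOMAIN_MAX_ADDR(47), keeping bit 47 and above clear
+	 * so that the IOVA remains canonical.
+	 */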
+ if (domain_use_first_level(domain))
+ dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
+ dma_mask);
+ else
+ dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
+ dma_mask);
+
+ /* Ensure we reserve the whole size-aligned region */
+ nrpages = __roundup_pow_of_two(nrpages);
+
+ if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
+ /*
+ * First try to allocate an I/O virtual address within
+ * DMA_BIT_MASK(32); if that fails, fall back to allocating
+ * from the higher range.
+ */
+ iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
+ IOVA_PFN(DMA_BIT_MASK(32)), false);
+ if (iova_pfn)
+ return iova_pfn;
+ }
+ iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
+ IOVA_PFN(dma_mask), true);
+ if (unlikely(!iova_pfn)) {
+ dev_err_once(dev, "Allocating %ld-page iova failed\n",
+ nrpages);
+ return 0;
+ }
+
+ return iova_pfn;
+}
+
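+/*
+ * Map a single physically contiguous buffer for DMA: allocate an IOVA
+ * range covering whole VT-d pages, derive read/write permissions from
+ * the DMA direction and install the page-table entries.
+ */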
+static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
+ size_t size, int dir, u64 dma_mask)
+{
+ struct dmar_domain *domain;
+ phys_addr_t start_paddr;
+ unsigned long iova_pfn;
+ int prot = 0;
+ int ret;
+ struct intel_iommu *iommu;
+ unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
+
+ BUG_ON(dir == DMA_NONE);
+
+ if (unlikely(attach_deferred(dev)))
+ do_deferred_attach(dev);
+
+ domain = find_domain(dev);
+ if (!domain)
+ return DMA_MAPPING_ERROR;
+
+ iommu = domain_get_iommu(domain);
+ size = aligned_nrpages(paddr, size);
+
+ iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
+ if (!iova_pfn)
+ goto error;
+
+ /*
+ * Check if DMAR supports zero-length reads on write-only
+ * mappings.
+ */
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+ !cap_zlr(iommu->cap))
+ prot |= DMA_PTE_READ;
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ prot |= DMA_PTE_WRITE;
+ /*
+ * paddr ~ (paddr + size) might span a partial page, so map the whole
+ * page. Note: if two parts of one page are mapped separately, we
+ * might have two guest addresses mapping to the same host paddr, but
+ * this is not a big problem.
+ */
+ ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
+ mm_to_dma_pfn(paddr_pfn), size, prot);
+ if (ret)
+ goto error;
+
+ start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
+ start_paddr += paddr & ~PAGE_MASK;
+
+ trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
+
+ return start_paddr;
+
+error:
+ if (iova_pfn)
+ free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
+ dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
+ size, (unsigned long long)paddr, dir);
+ return DMA_MAPPING_ERROR;
+}
+
+static dma_addr_t intel_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return __intel_map_single(dev, page_to_phys(page) + offset,
+ size, dir, *dev->dma_mask);
+}
+
+static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
+}
+
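+/*
+ * Tear down a DMA mapping: clear the page tables for the IOVA range and
+ * either flush the IOTLB synchronously (strict mode, untrusted devices,
+ * or no flush queue) or defer the flush via the IOVA flush queue.
+ */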
+static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
+{
+ struct dmar_domain *domain;
+ unsigned long start_pfn, last_pfn;
+ unsigned long nrpages;
+ unsigned long iova_pfn;
+ struct intel_iommu *iommu;
+ struct page *freelist;
+ struct pci_dev *pdev = NULL;
+
+ domain = find_domain(dev);
+ BUG_ON(!domain);
+
+ iommu = domain_get_iommu(domain);
+
+ iova_pfn = IOVA_PFN(dev_addr);
+
+ nrpages = aligned_nrpages(dev_addr, size);
+ start_pfn = mm_to_dma_pfn(iova_pfn);
+ last_pfn = start_pfn + nrpages - 1;
+
+ if (dev_is_pci(dev))
+ pdev = to_pci_dev(dev);
+
+ freelist = domain_unmap(domain, start_pfn, last_pfn);
+ if (intel_iommu_strict || (pdev && pdev->untrusted) ||
+ !has_iova_flush_queue(&domain->iovad)) {
+ iommu_flush_iotlb_psi(iommu, domain, start_pfn,
+ nrpages, !freelist, 0);
+ /* free iova */
+ free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
+ dma_free_pagelist(freelist);
+ } else {
+ queue_iova(&domain->iovad, iova_pfn, nrpages,
+ (unsigned long)freelist);
+ /*
+ * Queue up the release of the unmap to save roughly 1/6th of
+ * the CPU time used up by the IOTLB flush operation.
+ */
+ }
+
+ trace_unmap_single(dev, dev_addr, size);
+}
+
+static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ intel_unmap(dev, dev_addr, size);
+}
+
+static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ intel_unmap(dev, dev_addr, size);
+}
+
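+/*
+ * Allocate zeroed, page-aligned memory (from CMA when blocking is
+ * allowed) and map it bidirectionally through the IOMMU.
+ */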
+static void *intel_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags,
+ unsigned long attrs)
+{
+ struct page *page = NULL;
+ int order;
+
+ if (unlikely(attach_deferred(dev)))
+ do_deferred_attach(dev);
+
+ size = PAGE_ALIGN(size);
+ order = get_order(size);
+
+ if (gfpflags_allow_blocking(flags)) {
+ unsigned int count = size >> PAGE_SHIFT;
+
+ page = dma_alloc_from_contiguous(dev, count, order,
+ flags & __GFP_NOWARN);
+ }
+
+ if (!page)
+ page = alloc_pages(flags, order);
+ if (!page)
+ return NULL;
+ memset(page_address(page), 0, size);
+
+ *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
+ DMA_BIDIRECTIONAL,
+ dev->coherent_dma_mask);
+ if (*dma_handle != DMA_MAPPING_ERROR)
+ return page_address(page);
+ if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
+ __free_pages(page, order);
+
+ return NULL;
+}
+
+static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle, unsigned long attrs)
+{
+ int order;
+ struct page *page = virt_to_page(vaddr);
+
+ size = PAGE_ALIGN(size);
+ order = get_order(size);
+
+ intel_unmap(dev, dma_handle, size);
+ if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
+ __free_pages(page, order);
+}
+
+static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
+ unsigned long nrpages = 0;
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sglist, sg, nelems, i) {
+ nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
+ }
+
+ intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
+
+ trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
+}
+
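+/*
+ * Map a scatterlist into one contiguous IOVA range. On failure the
+ * partially built page tables and the IOVA allocation are released
+ * and 0 is returned, as the DMA API expects.
+ */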
+static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ int i;
+ struct dmar_domain *domain;
+ size_t size = 0;
+ int prot = 0;
+ unsigned long iova_pfn;
+ int ret;
+ struct scatterlist *sg;
+ unsigned long start_vpfn;
+ struct intel_iommu *iommu;
+
+ BUG_ON(dir == DMA_NONE);
+
+ if (unlikely(attach_deferred(dev)))
+ do_deferred_attach(dev);
+
+ domain = find_domain(dev);
+ if (!domain)
+ return 0;
+
+ iommu = domain_get_iommu(domain);
+
+ for_each_sg(sglist, sg, nelems, i)
+ size += aligned_nrpages(sg->offset, sg->length);
+
+ iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
+ *dev->dma_mask);
+ if (!iova_pfn) {
+ sglist->dma_length = 0;
+ return 0;
+ }
+
+ /*
+ * Check if DMAR supports zero-length reads on write-only
+ * mappings.
+ */
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+ !cap_zlr(iommu->cap))
+ prot |= DMA_PTE_READ;
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ prot |= DMA_PTE_WRITE;
+
+ start_vpfn = mm_to_dma_pfn(iova_pfn);
+
+ ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
+ if (unlikely(ret)) {
+ dma_pte_free_pagetable(domain, start_vpfn,
+ start_vpfn + size - 1,
+ agaw_to_level(domain->agaw) + 1);
+ free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
+ return 0;
+ }
+
+ for_each_sg(sglist, sg, nelems, i)
+ trace_map_sg(dev, i + 1, nelems, sg);
+
+ return nelems;
+}
+
+static u64 intel_get_required_mask(struct device *dev)
+{
+ return DMA_BIT_MASK(32);
+}
+
+static const struct dma_map_ops intel_dma_ops = {
+ .alloc = intel_alloc_coherent,
+ .free = intel_free_coherent,
+ .map_sg = intel_map_sg,
+ .unmap_sg = intel_unmap_sg,
+ .map_page = intel_map_page,
+ .unmap_page = intel_unmap_page,
+ .map_resource = intel_map_resource,
+ .unmap_resource = intel_unmap_resource,
+ .dma_supported = dma_direct_supported,
+ .mmap = dma_common_mmap,
+ .get_sgtable = dma_common_get_sgtable,
+ .get_required_mask = intel_get_required_mask,
+};
+
+static void
+bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, enum dma_sync_target target)
+{
+ struct dmar_domain *domain;
+ phys_addr_t tlb_addr;
+
+ domain = find_domain(dev);
+ if (WARN_ON(!domain))
+ return;
+
+ tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
+ if (is_swiotlb_buffer(tlb_addr))
+ swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
+}
+
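+/*
+ * Bounce-buffer mapping path, used for devices that need bounced DMA
+ * (see intel_iommu_probe_finalize()). Buffers that are not VT-d page
+ * aligned are bounced through swiotlb so that no unrelated data shares
+ * an IOMMU-mapped page; aligned buffers are mapped in place.
+ */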
+static dma_addr_t
+bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs,
+ u64 dma_mask)
+{
+ size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
+ struct dmar_domain *domain;
+ struct intel_iommu *iommu;
+ unsigned long iova_pfn;
+ unsigned long nrpages;
+ phys_addr_t tlb_addr;
+ int prot = 0;
+ int ret;
+
+ if (unlikely(attach_deferred(dev)))
+ do_deferred_attach(dev);
+
+ domain = find_domain(dev);
+
+ if (WARN_ON(dir == DMA_NONE || !domain))
+ return DMA_MAPPING_ERROR;
+
+ iommu = domain_get_iommu(domain);
+ if (WARN_ON(!iommu))
+ return DMA_MAPPING_ERROR;
+
+ nrpages = aligned_nrpages(0, size);
+ iova_pfn = intel_alloc_iova(dev, domain,
+ dma_to_mm_pfn(nrpages), dma_mask);
+ if (!iova_pfn)
+ return DMA_MAPPING_ERROR;
+
+ /*
+ * Check if DMAR supports zero-length reads on write-only
+ * mappings.
+ */
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
+ !cap_zlr(iommu->cap))
+ prot |= DMA_PTE_READ;
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ prot |= DMA_PTE_WRITE;
+
+ /*
+ * If both the physical buffer start address and size are
+ * page aligned, we don't need to use a bounce page.
+ */
+ if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
+ tlb_addr = swiotlb_tbl_map_single(dev,
+ __phys_to_dma(dev, io_tlb_start),
+ paddr, size, aligned_size, dir, attrs);
+ if (tlb_addr == DMA_MAPPING_ERROR) {
+ goto swiotlb_error;
+ } else {
+ /* Cleanup the padding area. */
+ void *padding_start = phys_to_virt(tlb_addr);
+ size_t padding_size = aligned_size;
+
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ (dir == DMA_TO_DEVICE ||
+ dir == DMA_BIDIRECTIONAL)) {
+ padding_start += size;
+ padding_size -= size;
+ }
+
+ memset(padding_start, 0, padding_size);
+ }
+ } else {
+ tlb_addr = paddr;
+ }
+
+ ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
+ tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
+ if (ret)
+ goto mapping_error;
+
+ trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
+
+ return (phys_addr_t)iova_pfn << PAGE_SHIFT;
+
+mapping_error:
+ if (is_swiotlb_buffer(tlb_addr))
+ swiotlb_tbl_unmap_single(dev, tlb_addr, size,
+ aligned_size, dir, attrs);
+swiotlb_error:
+ free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
+ dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
+ size, (unsigned long long)paddr, dir);
+
+ return DMA_MAPPING_ERROR;
+}
+
+static void
+bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
+ struct dmar_domain *domain;
+ phys_addr_t tlb_addr;
+
+ domain = find_domain(dev);
+ if (WARN_ON(!domain))
+ return;
+
+ tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
+ if (WARN_ON(!tlb_addr))
+ return;
+
+ intel_unmap(dev, dev_addr, size);
+ if (is_swiotlb_buffer(tlb_addr))
+ swiotlb_tbl_unmap_single(dev, tlb_addr, size,
+ aligned_size, dir, attrs);
+
+ trace_bounce_unmap_single(dev, dev_addr, size);
+}
+
+static dma_addr_t
+bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ return bounce_map_single(dev, page_to_phys(page) + offset,
+ size, dir, attrs, *dev->dma_mask);
+}
+
+static dma_addr_t
+bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ return bounce_map_single(dev, phys_addr, size,
+ dir, attrs, *dev->dma_mask);
+}
+
+static void
+bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ bounce_unmap_single(dev, dev_addr, size, dir, attrs);
+}
+
+static void
+bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ bounce_unmap_single(dev, dev_addr, size, dir, attrs);
+}
+
+static void
+bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sglist, sg, nelems, i)
+ bounce_unmap_page(dev, sg->dma_address,
+ sg_dma_len(sg), dir, attrs);
+}
+
+static int
+bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ int i;
+ struct scatterlist *sg;
+
+ for_each_sg(sglist, sg, nelems, i) {
+ sg->dma_address = bounce_map_page(dev, sg_page(sg),
+ sg->offset, sg->length,
+ dir, attrs);
+ if (sg->dma_address == DMA_MAPPING_ERROR)
+ goto out_unmap;
+ sg_dma_len(sg) = sg->length;
+ }
+
+ for_each_sg(sglist, sg, nelems, i)
+ trace_bounce_map_sg(dev, i + 1, nelems, sg);
+
+ return nelems;
+
+out_unmap:
+ bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
+ return 0;
+}
+
+static void
+bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
+}
+
+static void
+bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
+}
+
+static void
+bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sglist, sg, nelems, i)
+ bounce_sync_single(dev, sg_dma_address(sg),
+ sg_dma_len(sg), dir, SYNC_FOR_CPU);
+}
+
+static void
+bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sglist, sg, nelems, i)
+ bounce_sync_single(dev, sg_dma_address(sg),
+ sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
+}
+
+static const struct dma_map_ops bounce_dma_ops = {
+ .alloc = intel_alloc_coherent,
+ .free = intel_free_coherent,
+ .map_sg = bounce_map_sg,
+ .unmap_sg = bounce_unmap_sg,
+ .map_page = bounce_map_page,
+ .unmap_page = bounce_unmap_page,
+ .sync_single_for_cpu = bounce_sync_single_for_cpu,
+ .sync_single_for_device = bounce_sync_single_for_device,
+ .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
+ .sync_sg_for_device = bounce_sync_sg_for_device,
+ .map_resource = bounce_map_resource,
+ .unmap_resource = bounce_unmap_resource,
+ .dma_supported = dma_direct_supported,
+};
+
+static inline int iommu_domain_cache_init(void)
+{
+ int ret = 0;
+
+ iommu_domain_cache = kmem_cache_create("iommu_domain",
+ sizeof(struct dmar_domain),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!iommu_domain_cache) {
+ pr_err("Couldn't create iommu_domain cache\n");
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
+static inline int iommu_devinfo_cache_init(void)
+{
+ int ret = 0;
+
+ iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
+ sizeof(struct device_domain_info),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!iommu_devinfo_cache) {
+ pr_err("Couldn't create devinfo cache\n");
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
+static int __init iommu_init_mempool(void)
+{
+ int ret;
+
+ ret = iova_cache_get();
+ if (ret)
+ return ret;
+
+ ret = iommu_domain_cache_init();
+ if (ret)
+ goto domain_error;
+
+ ret = iommu_devinfo_cache_init();
+ if (!ret)
+ return ret;
+
+ kmem_cache_destroy(iommu_domain_cache);
+domain_error:
+ iova_cache_put();
+
+ return -ENOMEM;
+}
+
+static void __init iommu_exit_mempool(void)
+{
+ kmem_cache_destroy(iommu_devinfo_cache);
+ kmem_cache_destroy(iommu_domain_cache);
+ iova_cache_put();
+}
+
+static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
+{
+ struct dmar_drhd_unit *drhd;
+ u32 vtbar;
+ int rc;
+
+ /* We know that this device on this chipset has its own IOMMU.
+ * If we find it under a different IOMMU, then the BIOS is lying
+ * to us. Hope that the IOMMU for this device is actually
+ * disabled, and it needs no translation...
+ */
+ rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
+ if (rc) {
+ /* "can't" happen */
+ dev_info(&pdev->dev, "failed to run vt-d quirk\n");
+ return;
+ }
+ vtbar &= 0xffff0000;
+
+ /* We know that this IOMMU should be at offset 0xa000 from vtbar */
+ drhd = dmar_find_matched_drhd_unit(pdev);
+ if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
+ pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+ }
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
+
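+/*
+ * Mark DMAR units that manage no devices as ignored, and, when graphics
+ * mapping is disabled, also bypass units that cover only GFX devices by
+ * pointing those devices at the dummy domain info.
+ */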
+static void __init init_no_remapping_devices(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct device *dev;
+ int i;
+
+ for_each_drhd_unit(drhd) {
+ if (!drhd->include_all) {
+ for_each_active_dev_scope(drhd->devices,
+ drhd->devices_cnt, i, dev)
+ break;
+ /* ignore DMAR unit if no devices exist */
+ if (i == drhd->devices_cnt)
+ drhd->ignored = 1;
+ }
+ }
+
+ for_each_active_drhd_unit(drhd) {
+ if (drhd->include_all)
+ continue;
+
+ for_each_active_dev_scope(drhd->devices,
+ drhd->devices_cnt, i, dev)
+ if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
+ break;
+ if (i < drhd->devices_cnt)
+ continue;
+
+ /* This IOMMU has *only* gfx devices. Either bypass it or
+ set the gfx_mapped flag, as appropriate */
+ if (!dmar_map_gfx) {
+ drhd->ignored = 1;
+ for_each_active_dev_scope(drhd->devices,
+ drhd->devices_cnt, i, dev)
+ dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+ }
+ }
+}
+
+#ifdef CONFIG_SUSPEND
+static int init_iommu_hw(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu = NULL;
+
+ for_each_active_iommu(iommu, drhd)
+ if (iommu->qi)
+ dmar_reenable_qi(iommu);
+
+ for_each_iommu(iommu, drhd) {
+ if (drhd->ignored) {
+ /*
+ * we always have to disable PMRs or DMA may fail on
+ * this device
+ */
+ if (force_on)
+ iommu_disable_protect_mem_regions(iommu);
+ continue;
+ }
+
+ iommu_flush_write_buffer(iommu);
+
+ iommu_set_root_entry(iommu);
+
+ iommu->flush.flush_context(iommu, 0, 0, 0,
+ DMA_CCMD_GLOBAL_INVL);
+ iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+ iommu_enable_translation(iommu);
+ iommu_disable_protect_mem_regions(iommu);
+ }
+
+ return 0;
+}
+
+static void iommu_flush_all(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+
+ for_each_active_iommu(iommu, drhd) {
+ iommu->flush.flush_context(iommu, 0, 0, 0,
+ DMA_CCMD_GLOBAL_INVL);
+ iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+ DMA_TLB_GLOBAL_FLUSH);
+ }
+}
+
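+/*
+ * Suspend: flush all caches, disable translation and save the fault
+ * event registers so they can be restored on resume.
+ */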
+static int iommu_suspend(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu = NULL;
+ unsigned long flag;
+
+ for_each_active_iommu(iommu, drhd) {
+ iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
+ GFP_ATOMIC);
+ if (!iommu->iommu_state)
+ goto nomem;
+ }
+
+ iommu_flush_all();
+
+ for_each_active_iommu(iommu, drhd) {
+ iommu_disable_translation(iommu);
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+
+ iommu->iommu_state[SR_DMAR_FECTL_REG] =
+ readl(iommu->reg + DMAR_FECTL_REG);
+ iommu->iommu_state[SR_DMAR_FEDATA_REG] =
+ readl(iommu->reg + DMAR_FEDATA_REG);
+ iommu->iommu_state[SR_DMAR_FEADDR_REG] =
+ readl(iommu->reg + DMAR_FEADDR_REG);
+ iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
+ readl(iommu->reg + DMAR_FEUADDR_REG);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ }
+ return 0;
+
+nomem:
+ for_each_active_iommu(iommu, drhd)
+ kfree(iommu->iommu_state);
+
+ return -ENOMEM;
+}
+
+static void iommu_resume(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu = NULL;
+ unsigned long flag;
+
+ if (init_iommu_hw()) {
+ if (force_on)
+ panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
+ else
+ WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
+ return;
+ }
+
+ for_each_active_iommu(iommu, drhd) {
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+
+ writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
+ iommu->reg + DMAR_FECTL_REG);
+ writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
+ iommu->reg + DMAR_FEDATA_REG);
+ writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
+ iommu->reg + DMAR_FEADDR_REG);
+ writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
+ iommu->reg + DMAR_FEUADDR_REG);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ }
+
+ for_each_active_iommu(iommu, drhd)
+ kfree(iommu->iommu_state);
+}
+
+static struct syscore_ops iommu_syscore_ops = {
+ .resume = iommu_resume,
+ .suspend = iommu_suspend,
+};
+
+static void __init init_iommu_pm_ops(void)
+{
+ register_syscore_ops(&iommu_syscore_ops);
+}
+
+#else
+static inline void init_iommu_pm_ops(void) {}
+#endif /* CONFIG_SUSPEND */
+
+static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
+{
+ if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
+ !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
+ rmrr->end_address <= rmrr->base_address ||
+ arch_rmrr_sanity_check(rmrr))
+ return -EINVAL;
+
+ return 0;
+}
+
+int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
+{
+ struct acpi_dmar_reserved_memory *rmrr;
+ struct dmar_rmrr_unit *rmrru;
+
+ rmrr = (struct acpi_dmar_reserved_memory *)header;
+ if (rmrr_sanity_check(rmrr)) {
+ pr_warn(FW_BUG
+ "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
+ "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+ rmrr->base_address, rmrr->end_address,
+ dmi_get_system_info(DMI_BIOS_VENDOR),
+ dmi_get_system_info(DMI_BIOS_VERSION),
+ dmi_get_system_info(DMI_PRODUCT_VERSION));
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ }
+
+ rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
+ if (!rmrru)
+ goto out;
+
+ rmrru->hdr = header;
+
+ rmrru->base_address = rmrr->base_address;
+ rmrru->end_address = rmrr->end_address;
+
+ rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
+ ((void *)rmrr) + rmrr->header.length,
+ &rmrru->devices_cnt);
+ if (rmrru->devices_cnt && rmrru->devices == NULL)
+ goto free_rmrru;
+
+ list_add(&rmrru->list, &dmar_rmrr_units);
+
+ return 0;
+free_rmrru:
+ kfree(rmrru);
+out:
+ return -ENOMEM;
+}
+
+static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
+{
+ struct dmar_atsr_unit *atsru;
+ struct acpi_dmar_atsr *tmp;
+
+ list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
+ dmar_rcu_check()) {
+ tmp = (struct acpi_dmar_atsr *)atsru->hdr;
+ if (atsr->segment != tmp->segment)
+ continue;
+ if (atsr->header.length != tmp->header.length)
+ continue;
+ if (memcmp(atsr, tmp, atsr->header.length) == 0)
+ return atsru;
+ }
+
+ return NULL;
+}
+
+int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
+{
+ struct acpi_dmar_atsr *atsr;
+ struct dmar_atsr_unit *atsru;
+
+ if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
+ return 0;
+
+ atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+ atsru = dmar_find_atsr(atsr);
+ if (atsru)
+ return 0;
+
+ atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
+ if (!atsru)
+ return -ENOMEM;
+
+ /*
+ * If memory is allocated from slab by ACPI _DSM method, we need to
+ * copy the memory content because the memory buffer will be freed
+ * on return.
+ */
+ atsru->hdr = (void *)(atsru + 1);
+ memcpy(atsru->hdr, hdr, hdr->length);
+ atsru->include_all = atsr->flags & 0x1;
+ if (!atsru->include_all) {
+ atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
+ (void *)atsr + atsr->header.length,
+ &atsru->devices_cnt);
+ if (atsru->devices_cnt && atsru->devices == NULL) {
+ kfree(atsru);
+ return -ENOMEM;
+ }
+ }
+
+ list_add_rcu(&atsru->list, &dmar_atsr_units);
+
+ return 0;
+}
+
+static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
+{
+ dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
+ kfree(atsru);
+}
+
+int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
+{
+ struct acpi_dmar_atsr *atsr;
+ struct dmar_atsr_unit *atsru;
+
+ atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+ atsru = dmar_find_atsr(atsr);
+ if (atsru) {
+ list_del_rcu(&atsru->list);
+ synchronize_rcu();
+ intel_iommu_free_atsr(atsru);
+ }
+
+ return 0;
+}
+
+int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
+{
+ int i;
+ struct device *dev;
+ struct acpi_dmar_atsr *atsr;
+ struct dmar_atsr_unit *atsru;
+
+ atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+ atsru = dmar_find_atsr(atsr);
+ if (!atsru)
+ return 0;
+
+ if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
+ for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
+ i, dev)
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
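+/*
+ * Bring up a hot-added DMAR unit: check that its capabilities are
+ * compatible with the current global settings, then set up domains,
+ * the root entry, queued invalidation, interrupts and translation.
+ */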
+static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
+{
+ int sp, ret;
+ struct intel_iommu *iommu = dmaru->iommu;
+
+ if (g_iommus[iommu->seq_id])
+ return 0;
+
+ if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
+ pr_warn("%s: Doesn't support hardware pass through.\n",
+ iommu->name);
+ return -ENXIO;
+ }
+ if (!ecap_sc_support(iommu->ecap) &&
+ domain_update_iommu_snooping(iommu)) {
+ pr_warn("%s: Doesn't support snooping.\n",
+ iommu->name);
+ return -ENXIO;
+ }
+ sp = domain_update_iommu_superpage(NULL, iommu) - 1;
+ if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
+ pr_warn("%s: Doesn't support large page.\n",
+ iommu->name);
+ return -ENXIO;
+ }
+
+ /*
+ * Disable translation if already enabled prior to OS handover.
+ */
+ if (iommu->gcmd & DMA_GCMD_TE)
+ iommu_disable_translation(iommu);
+
+ g_iommus[iommu->seq_id] = iommu;
+ ret = iommu_init_domains(iommu);
+ if (ret == 0)
+ ret = iommu_alloc_root_entry(iommu);
+ if (ret)
+ goto out;
+
+ intel_svm_check(iommu);
+
+ if (dmaru->ignored) {
+ /*
+ * we always have to disable PMRs or DMA may fail on this device
+ */
+ if (force_on)
+ iommu_disable_protect_mem_regions(iommu);
+ return 0;
+ }
+
+ intel_iommu_init_qi(iommu);
+ iommu_flush_write_buffer(iommu);
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
+ ret = intel_svm_enable_prq(iommu);
+ if (ret)
+ goto disable_iommu;
+ }
+#endif
+ ret = dmar_set_interrupt(iommu);
+ if (ret)
+ goto disable_iommu;
+
+ iommu_set_root_entry(iommu);
+ iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
+ iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+ iommu_enable_translation(iommu);
+
+ iommu_disable_protect_mem_regions(iommu);
+ return 0;
+
+disable_iommu:
+ disable_dmar_iommu(iommu);
+out:
+ free_dmar_iommu(iommu);
+ return ret;
+}
+
+int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
+{
+ int ret = 0;
+ struct intel_iommu *iommu = dmaru->iommu;
+
+ if (!intel_iommu_enabled)
+ return 0;
+ if (iommu == NULL)
+ return -EINVAL;
+
+ if (insert) {
+ ret = intel_iommu_add(dmaru);
+ } else {
+ disable_dmar_iommu(iommu);
+ free_dmar_iommu(iommu);
+ }
+
+ return ret;
+}
+
+static void intel_iommu_free_dmars(void)
+{
+ struct dmar_rmrr_unit *rmrru, *rmrr_n;
+ struct dmar_atsr_unit *atsru, *atsr_n;
+
+ list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
+ list_del(&rmrru->list);
+ dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
+ kfree(rmrru);
+ }
+
+ list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
+ list_del(&atsru->list);
+ intel_iommu_free_atsr(atsru);
+ }
+}
+
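+/*
+ * Decide whether ATS is allowed for @dev: walk up to the PCIe root
+ * port and check whether it is covered by an ATSR entry. Returns 1 if
+ * ATS may be used, 0 otherwise.
+ */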
+int dmar_find_matched_atsr_unit(struct pci_dev *dev)
+{
+ int i, ret = 1;
+ struct pci_bus *bus;
+ struct pci_dev *bridge = NULL;
+ struct device *tmp;
+ struct acpi_dmar_atsr *atsr;
+ struct dmar_atsr_unit *atsru;
+
+ dev = pci_physfn(dev);
+ for (bus = dev->bus; bus; bus = bus->parent) {
+ bridge = bus->self;
+ /* If it's an integrated device, allow ATS */
+ if (!bridge)
+ return 1;
+ /* Connected via non-PCIe: no ATS */
+ if (!pci_is_pcie(bridge) ||
+ pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
+ return 0;
+ /* If we found the root port, look it up in the ATSR */
+ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
+ break;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
+ atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+ if (atsr->segment != pci_domain_nr(dev->bus))
+ continue;
+
+ for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
+ if (tmp == &bridge->dev)
+ goto out;
+
+ if (atsru->include_all)
+ goto out;
+ }
+ ret = 0;
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
+
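+/*
+ * PCI bus notification handler: keep the RMRR and ATSR device scope
+ * lists in sync when devices are added to or removed from the bus.
+ */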
+int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
+{
+ int ret;
+ struct dmar_rmrr_unit *rmrru;
+ struct dmar_atsr_unit *atsru;
+ struct acpi_dmar_atsr *atsr;
+ struct acpi_dmar_reserved_memory *rmrr;
+
+ if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
+ return 0;
+
+ list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
+ rmrr = container_of(rmrru->hdr,
+ struct acpi_dmar_reserved_memory, header);
+ if (info->event == BUS_NOTIFY_ADD_DEVICE) {
+ ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
+ ((void *)rmrr) + rmrr->header.length,
+ rmrr->segment, rmrru->devices,
+ rmrru->devices_cnt);
+ if (ret < 0)
+ return ret;
+ } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
+ dmar_remove_dev_scope(info, rmrr->segment,
+ rmrru->devices, rmrru->devices_cnt);
+ }
+ }
+
+ list_for_each_entry(atsru, &dmar_atsr_units, list) {
+ if (atsru->include_all)
+ continue;
+
+ atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+ if (info->event == BUS_NOTIFY_ADD_DEVICE) {
+ ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
+ (void *)atsr + atsr->header.length,
+ atsr->segment, atsru->devices,
+ atsru->devices_cnt);
+ if (ret > 0)
+ break;
+ else if (ret < 0)
+ return ret;
+ } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
+ if (dmar_remove_dev_scope(info, atsr->segment,
+ atsru->devices, atsru->devices_cnt))
+ break;
+ }
+ }
+
+ return 0;
+}
+
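+/*
+ * Memory hotplug notifier: extend the si_domain identity map when
+ * memory goes online and unmap/flush it again when memory is offlined.
+ */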
+static int intel_iommu_memory_notifier(struct notifier_block *nb,
+ unsigned long val, void *v)
+{
+ struct memory_notify *mhp = v;
+ unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
+ unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
+ mhp->nr_pages - 1);
+
+ switch (val) {
+ case MEM_GOING_ONLINE:
+ if (iommu_domain_identity_map(si_domain,
+ start_vpfn, last_vpfn)) {
+ pr_warn("Failed to build identity map for [%lx-%lx]\n",
+ start_vpfn, last_vpfn);
+ return NOTIFY_BAD;
+ }
+ break;
+
+ case MEM_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ {
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ struct page *freelist;
+
+ freelist = domain_unmap(si_domain,
+ start_vpfn, last_vpfn);
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd)
+ iommu_flush_iotlb_psi(iommu, si_domain,
+ start_vpfn, mhp->nr_pages,
+ !freelist, 0);
+ rcu_read_unlock();
+ dma_free_pagelist(freelist);
+ }
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block intel_iommu_memory_nb = {
+ .notifier_call = intel_iommu_memory_notifier,
+ .priority = 0
+};
+
+static void free_all_cpu_cached_iovas(unsigned int cpu)
+{
+ int i;
+
+ for (i = 0; i < g_num_of_iommus; i++) {
+ struct intel_iommu *iommu = g_iommus[i];
+ struct dmar_domain *domain;
+ int did;
+
+ if (!iommu)
+ continue;
+
+ for (did = 0; did < cap_ndoms(iommu->cap); did++) {
+ domain = get_iommu_domain(iommu, (u16)did);
+
+ if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
+ continue;
+
+ free_cpu_cached_iovas(cpu, &domain->iovad);
+ }
+ }
+}
+
+static int intel_iommu_cpu_dead(unsigned int cpu)
+{
+ free_all_cpu_cached_iovas(cpu);
+ return 0;
+}
+
+static void intel_disable_iommus(void)
+{
+ struct intel_iommu *iommu = NULL;
+ struct dmar_drhd_unit *drhd;
+
+ for_each_iommu(iommu, drhd)
+ iommu_disable_translation(iommu);
+}
+
+void intel_iommu_shutdown(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu = NULL;
+
+ if (no_iommu || dmar_disabled)
+ return;
+
+ down_write(&dmar_global_lock);
+
+ /* Disable PMRs explicitly here. */
+ for_each_iommu(iommu, drhd)
+ iommu_disable_protect_mem_regions(iommu);
+
+ /* Make sure the IOMMUs are switched off */
+ intel_disable_iommus();
+
+ up_write(&dmar_global_lock);
+}
+
+static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
+{
+ struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
+
+ return container_of(iommu_dev, struct intel_iommu, iommu);
+}
+
+static ssize_t intel_iommu_show_version(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ u32 ver = readl(iommu->reg + DMAR_VER_REG);
+ return sprintf(buf, "%d:%d\n",
+ DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
+}
+static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
+
+static ssize_t intel_iommu_show_address(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ return sprintf(buf, "%llx\n", iommu->reg_phys);
+}
+static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
+
+static ssize_t intel_iommu_show_cap(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ return sprintf(buf, "%llx\n", iommu->cap);
+}
+static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
+
+static ssize_t intel_iommu_show_ecap(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ return sprintf(buf, "%llx\n", iommu->ecap);
+}
+static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
+
+static ssize_t intel_iommu_show_ndoms(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
+}
+static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
+
+static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
+ cap_ndoms(iommu->cap)));
+}
+static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
+
+static struct attribute *intel_iommu_attrs[] = {
+ &dev_attr_version.attr,
+ &dev_attr_address.attr,
+ &dev_attr_cap.attr,
+ &dev_attr_ecap.attr,
+ &dev_attr_domains_supported.attr,
+ &dev_attr_domains_used.attr,
+ NULL,
+};
+
+static struct attribute_group intel_iommu_group = {
+ .name = "intel-iommu",
+ .attrs = intel_iommu_attrs,
+};
+
+const struct attribute_group *intel_iommu_groups[] = {
+ &intel_iommu_group,
+ NULL,
+};
+
+static inline bool has_untrusted_dev(void)
+{
+ struct pci_dev *pdev = NULL;
+
+ for_each_pci_dev(pdev)
+ if (pdev->untrusted)
+ return true;
+
+ return false;
+}
+
+static int __init platform_optin_force_iommu(void)
+{
+ if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
+ return 0;
+
+ if (no_iommu || dmar_disabled)
+ pr_info("Intel-IOMMU force enabled due to platform opt in\n");
+
+ /*
+ * If Intel-IOMMU is disabled by default, we will apply identity
+ * map for all devices except those marked as being untrusted.
+ */
+ if (dmar_disabled)
+ iommu_set_default_passthrough(false);
+
+ dmar_disabled = 0;
+ no_iommu = 0;
+
+ return 1;
+}
+
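+/*
+ * Probe ACPI namespace devices listed in the DRHD device scopes so
+ * that their physical nodes are attached to the Intel IOMMU ops.
+ */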
+static int __init probe_acpi_namespace_devices(void)
+{
+ struct dmar_drhd_unit *drhd;
+ /* To avoid a -Wunused-but-set-variable warning. */
+ struct intel_iommu *iommu __maybe_unused;
+ struct device *dev;
+ int i, ret = 0;
+
+ for_each_active_iommu(iommu, drhd) {
+ for_each_active_dev_scope(drhd->devices,
+ drhd->devices_cnt, i, dev) {
+ struct acpi_device_physical_node *pn;
+ struct iommu_group *group;
+ struct acpi_device *adev;
+
+ if (dev->bus != &acpi_bus_type)
+ continue;
+
+ adev = to_acpi_device(dev);
+ mutex_lock(&adev->physical_node_lock);
+ list_for_each_entry(pn,
+ &adev->physical_node_list, node) {
+ group = iommu_group_get(pn->dev);
+ if (group) {
+ iommu_group_put(group);
+ continue;
+ }
+
+ pn->dev->bus->iommu_ops = &intel_iommu_ops;
+ ret = iommu_probe_device(pn->dev);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&adev->physical_node_lock);
+
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int __init intel_iommu_init(void)
+{
+ int ret = -ENODEV;
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+
+ /*
+ * Intel IOMMU is required for a TXT/tboot launch or platform
+ * opt in, so enforce that.
+ */
+ force_on = tboot_force_iommu() || platform_optin_force_iommu();
+
+ if (iommu_init_mempool()) {
+ if (force_on)
+ panic("tboot: Failed to initialize iommu memory\n");
+ return -ENOMEM;
+ }
+
+ down_write(&dmar_global_lock);
+ if (dmar_table_init()) {
+ if (force_on)
+ panic("tboot: Failed to initialize DMAR table\n");
+ goto out_free_dmar;
+ }
+
+ if (dmar_dev_scope_init() < 0) {
+ if (force_on)
+ panic("tboot: Failed to initialize DMAR device scope\n");
+ goto out_free_dmar;
+ }
+
+ up_write(&dmar_global_lock);
+
+ /*
+ * The bus notifier takes the dmar_global_lock, so lockdep will
+ * complain later when we register it under the lock.
+ */
+ dmar_register_bus_notifier();
+
+ down_write(&dmar_global_lock);
+
+ if (!no_iommu)
+ intel_iommu_debugfs_init();
+
+ if (no_iommu || dmar_disabled) {
+ /*
+ * We exit the function here to ensure the IOMMU's remapping and
+ * mempool aren't set up, which means that the IOMMU's PMRs
+ * won't be disabled via the call to init_dmars(). So disable
+ * it explicitly here. The PMRs were setup by tboot prior to
+ * calling SENTER, but the kernel is expected to reset/tear
+ * down the PMRs.
+ */
+ if (intel_iommu_tboot_noforce) {
+ for_each_iommu(iommu, drhd)
+ iommu_disable_protect_mem_regions(iommu);
+ }
+
+ /*
+ * Make sure the IOMMUs are switched off, even when we
+ * boot into a kexec kernel and the previous kernel left
+ * them enabled
+ */
+ intel_disable_iommus();
+ goto out_free_dmar;
+ }
+
+ if (list_empty(&dmar_rmrr_units))
+ pr_info("No RMRR found\n");
+
+ if (list_empty(&dmar_atsr_units))
+ pr_info("No ATSR found\n");
+
+ if (dmar_init_reserved_ranges()) {
+ if (force_on)
+ panic("tboot: Failed to reserve iommu ranges\n");
+ goto out_free_reserved_range;
+ }
+
+ if (dmar_map_gfx)
+ intel_iommu_gfx_mapped = 1;
+
+ init_no_remapping_devices();
+
+ ret = init_dmars();
+ if (ret) {
+ if (force_on)
+ panic("tboot: Failed to initialize DMARs\n");
+ pr_err("Initialization failed\n");
+ goto out_free_reserved_range;
+ }
+ up_write(&dmar_global_lock);
+
+ init_iommu_pm_ops();
+
+ down_read(&dmar_global_lock);
+ for_each_active_iommu(iommu, drhd) {
+ iommu_device_sysfs_add(&iommu->iommu, NULL,
+ intel_iommu_groups,
+ "%s", iommu->name);
+ iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
+ iommu_device_register(&iommu->iommu);
+ }
+ up_read(&dmar_global_lock);
+
+ bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
+ if (si_domain && !hw_pass_through)
+ register_memory_notifier(&intel_iommu_memory_nb);
+ cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
+ intel_iommu_cpu_dead);
+
+ down_read(&dmar_global_lock);
+ if (probe_acpi_namespace_devices())
+ pr_warn("ACPI name space devices didn't probe correctly\n");
+
+ /* Finally, we enable the DMA remapping hardware. */
+ for_each_iommu(iommu, drhd) {
+ if (!drhd->ignored && !translation_pre_enabled(iommu))
+ iommu_enable_translation(iommu);
+
+ iommu_disable_protect_mem_regions(iommu);
+ }
+ up_read(&dmar_global_lock);
+
+ pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
+
+ intel_iommu_enabled = 1;
+
+ return 0;
+
+out_free_reserved_range:
+ put_iova_domain(&reserved_iova_list);
+out_free_dmar:
+ intel_iommu_free_dmars();
+ up_write(&dmar_global_lock);
+ iommu_exit_mempool();
+ return ret;
+}
+
+static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+ struct intel_iommu *iommu = opaque;
+
+ domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
+ return 0;
+}
+
+/*
+ * NB - intel-iommu lacks any sort of reference counting for the users of
+ * dependent devices. If multiple endpoints have intersecting dependent
+ * devices, unbinding the driver from any one of them will possibly leave
+ * the others unable to operate.
+ */
+static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
+{
+ if (!iommu || !dev || !dev_is_pci(dev))
+ return;
+
+ pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
+}
+
+static void __dmar_remove_one_dev_info(struct device_domain_info *info)
+{
+ struct dmar_domain *domain;
+ struct intel_iommu *iommu;
+ unsigned long flags;
+
+ assert_spin_locked(&device_domain_lock);
+
+ if (WARN_ON(!info))
+ return;
+
+ iommu = info->iommu;
+ domain = info->domain;
+
+ if (info->dev) {
+ if (dev_is_pci(info->dev) && sm_supported(iommu))
+ intel_pasid_tear_down_entry(iommu, info->dev,
+ PASID_RID2PASID, false);
+
+ iommu_disable_dev_iotlb(info);
+ if (!dev_is_real_dma_subdevice(info->dev))
+ domain_context_clear(iommu, info->dev);
+ intel_pasid_free_table(info->dev);
+ }
+
+ unlink_domain_info(info);
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ domain_detach_iommu(domain, iommu);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ free_devinfo_mem(info);
+}
+
+static void dmar_remove_one_dev_info(struct device *dev)
+{
+ struct device_domain_info *info;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ info = get_domain_info(dev);
+ if (info)
+ __dmar_remove_one_dev_info(info);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
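+/*
+ * Initialize a domain created through the IOMMU API: derive the AGAW
+ * from the requested guest address width and allocate the top-level
+ * page directory.
+ */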
+static int md_domain_init(struct dmar_domain *domain, int guest_width)
+{
+ int adjust_width;
+
+ /* calculate AGAW */
+ domain->gaw = guest_width;
+ adjust_width = guestwidth_to_adjustwidth(guest_width);
+ domain->agaw = width_to_agaw(adjust_width);
+
+ domain->iommu_coherency = 0;
+ domain->iommu_snooping = 0;
+ domain->iommu_superpage = 0;
+ domain->max_addr = 0;
+
+ /* always allocate the top pgd */
+ domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
+ if (!domain->pgd)
+ return -ENOMEM;
+ domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
+ return 0;
+}
+
+static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
+{
+ init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
+ copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
+
+ if (!intel_iommu_strict &&
+ init_iova_flush_queue(&dmar_domain->iovad,
+ iommu_flush_iova, iova_entry_free))
+ pr_info("iova flush queue initialization failed\n");
+}
+
+static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
+{
+ struct dmar_domain *dmar_domain;
+ struct iommu_domain *domain;
+
+ switch (type) {
+ case IOMMU_DOMAIN_DMA:
+ /* fallthrough */
+ case IOMMU_DOMAIN_UNMANAGED:
+ dmar_domain = alloc_domain(0);
+ if (!dmar_domain) {
+ pr_err("Can't allocate dmar_domain\n");
+ return NULL;
+ }
+ if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+ pr_err("Domain initialization failed\n");
+ domain_exit(dmar_domain);
+ return NULL;
+ }
+
+ if (type == IOMMU_DOMAIN_DMA)
+ intel_init_iova_domain(dmar_domain);
+
+ domain_update_iommu_cap(dmar_domain);
+
+ domain = &dmar_domain->domain;
+ domain->geometry.aperture_start = 0;
+ domain->geometry.aperture_end =
+ __DOMAIN_MAX_ADDR(dmar_domain->gaw);
+ domain->geometry.force_aperture = true;
+
+ return domain;
+ case IOMMU_DOMAIN_IDENTITY:
+ return &si_domain->domain;
+ default:
+ return NULL;
+ }
+
+ return NULL;
+}
+
+static void intel_iommu_domain_free(struct iommu_domain *domain)
+{
+ if (domain != &si_domain->domain)
+ domain_exit(to_dmar_domain(domain));
+}
+
+/*
+ * Check whether a @domain could be attached to the @dev through the
+ * aux-domain attach/detach APIs.
+ */
+static inline bool
+is_aux_domain(struct device *dev, struct iommu_domain *domain)
+{
+ struct device_domain_info *info = get_domain_info(dev);
+
+ return info && info->auxd_enabled &&
+ domain->type == IOMMU_DOMAIN_UNMANAGED;
+}
+
+static void auxiliary_link_device(struct dmar_domain *domain,
+ struct device *dev)
+{
+ struct device_domain_info *info = get_domain_info(dev);
+
+ assert_spin_locked(&device_domain_lock);
+ if (WARN_ON(!info))
+ return;
+
+ domain->auxd_refcnt++;
+ list_add(&domain->auxd, &info->auxiliary_domains);
+}
+
+static void auxiliary_unlink_device(struct dmar_domain *domain,
+ struct device *dev)
+{
+ struct device_domain_info *info = get_domain_info(dev);
+
+ assert_spin_locked(&device_domain_lock);
+ if (WARN_ON(!info))
+ return;
+
+ list_del(&domain->auxd);
+ domain->auxd_refcnt--;
+
+ if (!domain->auxd_refcnt && domain->default_pasid > 0)
+ ioasid_free(domain->default_pasid);
+}
+
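+/*
+ * Attach an aux-domain to @dev: allocate a default PASID for the
+ * domain if necessary, program a first- or second-level PASID entry
+ * for it, then link the domain to the device.
+ */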
+static int aux_domain_add_dev(struct dmar_domain *domain,
+ struct device *dev)
+{
+ int ret;
+ u8 bus, devfn;
+ unsigned long flags;
+ struct intel_iommu *iommu;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ if (domain->default_pasid <= 0) {
+ int pasid;
+
+ /* No private data needed for the default pasid */
+ pasid = ioasid_alloc(NULL, PASID_MIN,
+ pci_max_pasids(to_pci_dev(dev)) - 1,
+ NULL);
+ if (pasid == INVALID_IOASID) {
+ pr_err("Can't allocate default pasid\n");
+ return -ENODEV;
+ }
+ domain->default_pasid = pasid;
+ }
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ /*
+ * iommu->lock must be held to attach domain to iommu and setup the
+ * pasid entry for second level translation.
+ */
+ spin_lock(&iommu->lock);
+ ret = domain_attach_iommu(domain, iommu);
+ if (ret)
+ goto attach_failed;
+
+ /* Set up the PASID entry for mediated devices: */
+ if (domain_use_first_level(domain))
+ ret = domain_setup_first_level(iommu, domain, dev,
+ domain->default_pasid);
+ else
+ ret = intel_pasid_setup_second_level(iommu, domain, dev,
+ domain->default_pasid);
+ if (ret)
+ goto table_failed;
+ spin_unlock(&iommu->lock);
+
+ auxiliary_link_device(domain, dev);
+
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return 0;
+
+table_failed:
+ domain_detach_iommu(domain, iommu);
+attach_failed:
+ spin_unlock(&iommu->lock);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ if (!domain->auxd_refcnt && domain->default_pasid > 0)
+ ioasid_free(domain->default_pasid);
+
+ return ret;
+}
+
+static void aux_domain_remove_dev(struct dmar_domain *domain,
+ struct device *dev)
+{
+ struct device_domain_info *info;
+ struct intel_iommu *iommu;
+ unsigned long flags;
+
+ if (!is_aux_domain(dev, &domain->domain))
+ return;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ info = get_domain_info(dev);
+ iommu = info->iommu;
+
+ auxiliary_unlink_device(domain, dev);
+
+ spin_lock(&iommu->lock);
+ intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
+ domain_detach_iommu(domain, iommu);
+ spin_unlock(&iommu->lock);
+
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
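+/*
+ * Common checks before attaching @dev to @domain: make sure the IOMMU
+ * address width covers the domain's highest mapped address and trim
+ * extra page-table levels if the domain AGAW exceeds the IOMMU AGAW.
+ */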
+static int prepare_domain_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct intel_iommu *iommu;
+ int addr_width;
+ u8 bus, devfn;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ /* check if this iommu agaw is sufficient for max mapped address */
+ addr_width = agaw_to_width(iommu->agaw);
+ if (addr_width > cap_mgaw(iommu->cap))
+ addr_width = cap_mgaw(iommu->cap);
+
+ if (dmar_domain->max_addr > (1LL << addr_width)) {
+ dev_err(dev, "%s: iommu width (%d) is not "
+ "sufficient for the mapped address (%llx)\n",
+ __func__, addr_width, dmar_domain->max_addr);
+ return -EFAULT;
+ }
+ dmar_domain->gaw = addr_width;
+
+ /*
+ * Knock out extra levels of page tables if necessary
+ */
+ while (iommu->agaw < dmar_domain->agaw) {
+ struct dma_pte *pte;
+
+ pte = dmar_domain->pgd;
+ if (dma_pte_present(pte)) {
+ dmar_domain->pgd = (struct dma_pte *)
+ phys_to_virt(dma_pte_addr(pte));
+ free_pgtable_page(pte);
+ }
+ dmar_domain->agaw--;
+ }
+
+ return 0;
+}
+
+static int intel_iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ int ret;
+
+ if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
+ device_is_rmrr_locked(dev)) {
+ dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
+ return -EPERM;
+ }
+
+ if (is_aux_domain(dev, domain))
+ return -EPERM;
+
+ /* normally dev is not mapped */
+ if (unlikely(domain_context_mapped(dev))) {
+ struct dmar_domain *old_domain;
+
+ old_domain = find_domain(dev);
+ if (old_domain)
+ dmar_remove_one_dev_info(dev);
+ }
+
+ ret = prepare_domain_attach_device(domain, dev);
+ if (ret)
+ return ret;
+
+ return domain_add_dev_info(to_dmar_domain(domain), dev);
+}
+
+static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ int ret;
+
+ if (!is_aux_domain(dev, domain))
+ return -EPERM;
+
+ ret = prepare_domain_attach_device(domain, dev);
+ if (ret)
+ return ret;
+
+ return aux_domain_add_dev(to_dmar_domain(domain), dev);
+}
+
+static void intel_iommu_detach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ dmar_remove_one_dev_info(dev);
+}
+
+static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ aux_domain_remove_dev(to_dmar_domain(domain), dev);
+}
+
+/*
+ * 2D array for converting and sanitizing IOMMU generic TLB granularity to
+ * VT-d granularity. Invalidation is typically included in the unmap operation
+ * as a result of a DMA or VFIO unmap. However, for assigned devices the
+ * guest owns the first-level page tables; invalidations of translation
+ * caches in the guest are trapped and passed down to the host.
+ *
+ * The vIOMMU in the guest will only expose first-level page tables, so
+ * we do not support IOTLB granularity for requests without PASID (second level).
+ *
+ * For example, to find the VT-d granularity encoding for IOTLB
+ * type and page selective granularity within PASID:
+ * X: indexed by iommu cache type
+ * Y: indexed by enum iommu_inv_granularity
+ * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
+ */
+
+static const int
+inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
+ /*
+ * PASID based IOTLB invalidation: PASID selective (per PASID),
+ * page selective (address granularity)
+ */
+ {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
+ /* PASID based dev TLBs */
+ {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
+ /* PASID cache */
+ {-EINVAL, -EINVAL, -EINVAL}
+};
+
+static inline int to_vtd_granularity(int type, int granu)
+{
+ return inv_type_granu_table[type][granu];
+}
+
+static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
+{
+ u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
+
+ /*
+ * VT-d size is encoded as 2^size of 4K pages: 0 for 4KiB, 9 for 2MiB, etc.
+ * The IOMMU cache invalidate API passes granu_size in bytes and the
+ * number of granules of that size in contiguous memory.
+ */
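+ /*
+ * For example, granu_size = 4KiB and nr_granules = 512 covers 512
+ * 4KiB pages, so the returned size order is 9 (a 2MiB range).
+ */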
+ return order_base_2(nr_pages);
+}
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+static int
+intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
+ struct iommu_cache_invalidate_info *inv_info)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ struct intel_iommu *iommu;
+ unsigned long flags;
+ int cache_type;
+ u8 bus, devfn;
+ u16 did, sid;
+ int ret = 0;
+ u64 size = 0;
+
+ if (!inv_info || !dmar_domain ||
+ inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
+ return -EINVAL;
+
+ if (!dev || !dev_is_pci(dev))
+ return -ENODEV;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
+ return -EINVAL;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ spin_lock(&iommu->lock);
+ info = get_domain_info(dev);
+ if (!info) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ did = dmar_domain->iommu_did[iommu->seq_id];
+ sid = PCI_DEVID(bus, devfn);
+
+ /* Size is only valid in address selective invalidation */
+ if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
+ size = to_vtd_size(inv_info->addr_info.granule_size,
+ inv_info->addr_info.nb_granules);
+
+ for_each_set_bit(cache_type,
+ (unsigned long *)&inv_info->cache,
+ IOMMU_CACHE_INV_TYPE_NR) {
+ int granu = 0;
+ u64 pasid = 0;
+
+ granu = to_vtd_granularity(cache_type, inv_info->granularity);
+ if (granu == -EINVAL) {
+ pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
+ cache_type, inv_info->granularity);
+ break;
+ }
+
+ /*
+ * PASID is stored in different locations based on the
+ * granularity.
+ */
+ if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
+ (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
+ pasid = inv_info->pasid_info.pasid;
+ else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
+ (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
+ pasid = inv_info->addr_info.pasid;
+
+ switch (BIT(cache_type)) {
+ case IOMMU_CACHE_INV_TYPE_IOTLB:
+ if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
+ size &&
+ (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
+ pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n",
+ inv_info->addr_info.addr, size);
+ ret = -ERANGE;
+ goto out_unlock;
+ }
+
+ /*
+ * If granu is PASID-selective, address is ignored.
+ * We use npages = -1 to indicate that.
+ */
+ qi_flush_piotlb(iommu, did, pasid,
+ mm_to_dma_pfn(inv_info->addr_info.addr),
+ (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
+ inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
+
+ /*
+ * Always flush device IOTLB if ATS is enabled. vIOMMU
+ * in the guest may assume IOTLB flush is inclusive,
+ * which is more efficient.
+ */
+ if (info->ats_enabled)
+ qi_flush_dev_iotlb_pasid(iommu, sid,
+ info->pfsid, pasid,
+ info->ats_qdep,
+ inv_info->addr_info.addr,
+ size, granu);
+ break;
+ case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
+ if (info->ats_enabled)
+ qi_flush_dev_iotlb_pasid(iommu, sid,
+ info->pfsid, pasid,
+ info->ats_qdep,
+ inv_info->addr_info.addr,
+ size, granu);
+ else
+ pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
+ break;
+ default:
+ dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
+ cache_type);
+ ret = -EINVAL;
+ }
+ }
+out_unlock:
+ spin_unlock(&iommu->lock);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return ret;
+}
+#endif
+
+static int intel_iommu_map(struct iommu_domain *domain,
+ unsigned long iova, phys_addr_t hpa,
+ size_t size, int iommu_prot, gfp_t gfp)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ u64 max_addr;
+ int prot = 0;
+ int ret;
+
+ if (iommu_prot & IOMMU_READ)
+ prot |= DMA_PTE_READ;
+ if (iommu_prot & IOMMU_WRITE)
+ prot |= DMA_PTE_WRITE;
+ if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
+ prot |= DMA_PTE_SNP;
+
+ max_addr = iova + size;
+ if (dmar_domain->max_addr < max_addr) {
+ u64 end;
+
+ /* check if minimum agaw is sufficient for mapped address */
+ end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
+ if (end < max_addr) {
+ pr_err("%s: iommu width (%d) is not "
+ "sufficient for the mapped address (%llx)\n",
+ __func__, dmar_domain->gaw, max_addr);
+ return -EFAULT;
+ }
+ dmar_domain->max_addr = max_addr;
+ }
+ /*
+ * Round up the size to the next multiple of PAGE_SIZE if it and
+ * the low bits of hpa would take us onto the next page.
+ */
+ size = aligned_nrpages(hpa, size);
+ ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
+ hpa >> VTD_PAGE_SHIFT, size, prot);
+ return ret;
+}
+
+static size_t intel_iommu_unmap(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ struct iommu_iotlb_gather *gather)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct page *freelist = NULL;
+ unsigned long start_pfn, last_pfn;
+ unsigned int npages;
+ int iommu_id, level = 0;
+
+ /* Cope with horrid API which requires us to unmap more than the
+ size argument if it happens to be a large-page mapping. */
+ BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
+
+ if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
+ size = VTD_PAGE_SIZE << level_to_offset_bits(level);
+
+ start_pfn = iova >> VTD_PAGE_SHIFT;
+ last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
+
+ freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
+
+ npages = last_pfn - start_pfn + 1;
+
+ for_each_domain_iommu(iommu_id, dmar_domain)
+ iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
+ start_pfn, npages, !freelist, 0);
+
+ dma_free_pagelist(freelist);
+
+ if (dmar_domain->max_addr == iova + size)
+ dmar_domain->max_addr = iova;
+
+ return size;
+}
+
+static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct dma_pte *pte;
+ int level = 0;
+ u64 phys = 0;
+
+ pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
+ if (pte && dma_pte_present(pte))
+ phys = dma_pte_addr(pte) +
+ (iova & (BIT_MASK(level_to_offset_bits(level) +
+ VTD_PAGE_SHIFT) - 1));
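+	/*
+	 * Worked example (illustrative): if the walk stops at a 2MiB
+	 * superpage, level is 2 and level_to_offset_bits() is 9, so the
+	 * mask keeps the low 21 bits of the IOVA, which are added to the
+	 * page-frame address taken from the PTE.
+	 */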
+
+ return phys;
+}
+
+static inline bool scalable_mode_support(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool ret = true;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!sm_supported(iommu)) {
+ ret = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool iommu_pasid_support(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool ret = true;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!pasid_supported(iommu)) {
+ ret = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool nested_mode_support(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool ret = true;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
+ ret = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool intel_iommu_capable(enum iommu_cap cap)
+{
+ if (cap == IOMMU_CAP_CACHE_COHERENCY)
+ return domain_update_iommu_snooping(NULL) == 1;
+ if (cap == IOMMU_CAP_INTR_REMAP)
+ return irq_remapping_enabled == 1;
+
+ return false;
+}
+
+static struct iommu_device *intel_iommu_probe_device(struct device *dev)
+{
+ struct intel_iommu *iommu;
+ u8 bus, devfn;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return ERR_PTR(-ENODEV);
+
+ if (translation_pre_enabled(iommu))
+ dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
+
+ return &iommu->iommu;
+}
+
+static void intel_iommu_release_device(struct device *dev)
+{
+ struct intel_iommu *iommu;
+ u8 bus, devfn;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return;
+
+ dmar_remove_one_dev_info(dev);
+
+ set_dma_ops(dev, NULL);
+}
+
+static void intel_iommu_probe_finalize(struct device *dev)
+{
+ struct iommu_domain *domain;
+
+ domain = iommu_get_domain_for_dev(dev);
+ if (device_needs_bounce(dev))
+ set_dma_ops(dev, &bounce_dma_ops);
+ else if (domain && domain->type == IOMMU_DOMAIN_DMA)
+ set_dma_ops(dev, &intel_dma_ops);
+ else
+ set_dma_ops(dev, NULL);
+}
+
+static void intel_iommu_get_resv_regions(struct device *device,
+ struct list_head *head)
+{
+ int prot = DMA_PTE_READ | DMA_PTE_WRITE;
+ struct iommu_resv_region *reg;
+ struct dmar_rmrr_unit *rmrr;
+ struct device *i_dev;
+ int i;
+
+ down_read(&dmar_global_lock);
+ for_each_rmrr_units(rmrr) {
+ for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
+ i, i_dev) {
+ struct iommu_resv_region *resv;
+ enum iommu_resv_type type;
+ size_t length;
+
+ if (i_dev != device &&
+ !is_downstream_to_pci_bridge(device, i_dev))
+ continue;
+
+ length = rmrr->end_address - rmrr->base_address + 1;
+
+ type = device_rmrr_is_relaxable(device) ?
+ IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
+
+ resv = iommu_alloc_resv_region(rmrr->base_address,
+ length, prot, type);
+ if (!resv)
+ break;
+
+ list_add_tail(&resv->list, head);
+ }
+ }
+ up_read(&dmar_global_lock);
+
+#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
+ if (dev_is_pci(device)) {
+ struct pci_dev *pdev = to_pci_dev(device);
+
+ if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
+ reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
+ IOMMU_RESV_DIRECT_RELAXABLE);
+ if (reg)
+				list_add_tail(&reg->list, head);
+ }
+ }
+#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
+
+ reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
+ IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
+ 0, IOMMU_RESV_MSI);
+ if (!reg)
+ return;
+	list_add_tail(&reg->list, head);
+}
+
+int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
+{
+ struct device_domain_info *info;
+ struct context_entry *context;
+ struct dmar_domain *domain;
+ unsigned long flags;
+ u64 ctx_lo;
+ int ret;
+
+ domain = find_domain(dev);
+ if (!domain)
+ return -EINVAL;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ spin_lock(&iommu->lock);
+
+ ret = -EINVAL;
+ info = get_domain_info(dev);
+ if (!info || !info->pasid_supported)
+ goto out;
+
+ context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
+ if (WARN_ON(!context))
+ goto out;
+
+ ctx_lo = context[0].lo;
+
+ if (!(ctx_lo & CONTEXT_PASIDE)) {
+ ctx_lo |= CONTEXT_PASIDE;
+ context[0].lo = ctx_lo;
+ wmb();
+ iommu->flush.flush_context(iommu,
+ domain->iommu_did[iommu->seq_id],
+ PCI_DEVID(info->bus, info->devfn),
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ }
+
+ /* Enable PASID support in the device, if it wasn't already */
+ if (!info->pasid_enabled)
+ iommu_enable_dev_iotlb(info);
+
+ ret = 0;
+
+ out:
+ spin_unlock(&iommu->lock);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return ret;
+}
+
+static void intel_iommu_apply_resv_region(struct device *dev,
+ struct iommu_domain *domain,
+ struct iommu_resv_region *region)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ unsigned long start, end;
+
+ start = IOVA_PFN(region->start);
+ end = IOVA_PFN(region->start + region->length - 1);
+
+ WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
+}
+
+static struct iommu_group *intel_iommu_device_group(struct device *dev)
+{
+ if (dev_is_pci(dev))
+ return pci_device_group(dev);
+ return generic_device_group(dev);
+}
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
+{
+ struct intel_iommu *iommu;
+ u8 bus, devfn;
+
+ if (iommu_dummy(dev)) {
+ dev_warn(dev,
+ "No IOMMU translation for device; cannot enable SVM\n");
+ return NULL;
+ }
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+	if (!iommu) {
+ dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
+ return NULL;
+ }
+
+ return iommu;
+}
+#endif /* CONFIG_INTEL_IOMMU_SVM */
+
+static int intel_iommu_enable_auxd(struct device *dev)
+{
+ struct device_domain_info *info;
+ struct intel_iommu *iommu;
+ unsigned long flags;
+ u8 bus, devfn;
+ int ret;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu || dmar_disabled)
+ return -EINVAL;
+
+ if (!sm_supported(iommu) || !pasid_supported(iommu))
+ return -EINVAL;
+
+ ret = intel_iommu_enable_pasid(iommu, dev);
+ if (ret)
+ return -ENODEV;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ info = get_domain_info(dev);
+ info->auxd_enabled = 1;
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return 0;
+}
+
+static int intel_iommu_disable_auxd(struct device *dev)
+{
+ struct device_domain_info *info;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ info = get_domain_info(dev);
+ if (!WARN_ON(!info))
+ info->auxd_enabled = 0;
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return 0;
+}
+
+/*
+ * A PCI Express Designated Vendor-Specific Extended Capability is defined
+ * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
+ * so that system software and tools can detect endpoint devices supporting
+ * Intel Scalable I/O Virtualization without any host driver dependency.
+ *
+ * Returns the address of the matching extended capability structure within
+ * the device's PCI configuration space or 0 if the device does not support
+ * it.
+ */
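+/*
+ * For illustration (derived from the code below, not restated from the
+ * spec): the loop walks every extended capability with ID 0x23 (DVSEC),
+ * reads the DVSEC vendor ID at offset 4 and the DVSEC ID at offset 8 of
+ * the capability, and accepts the first one reporting the Intel vendor ID
+ * with DVSEC ID 5.
+ */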
+static int siov_find_pci_dvsec(struct pci_dev *pdev)
+{
+ int pos;
+ u16 vendor, id;
+
+ pos = pci_find_next_ext_capability(pdev, 0, 0x23);
+ while (pos) {
+ pci_read_config_word(pdev, pos + 4, &vendor);
+ pci_read_config_word(pdev, pos + 8, &id);
+ if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
+ return pos;
+
+ pos = pci_find_next_ext_capability(pdev, pos, 0x23);
+ }
+
+ return 0;
+}
+
+static bool
+intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
+{
+ if (feat == IOMMU_DEV_FEAT_AUX) {
+ int ret;
+
+ if (!dev_is_pci(dev) || dmar_disabled ||
+ !scalable_mode_support() || !iommu_pasid_support())
+ return false;
+
+ ret = pci_pasid_features(to_pci_dev(dev));
+ if (ret < 0)
+ return false;
+
+ return !!siov_find_pci_dvsec(to_pci_dev(dev));
+ }
+
+ if (feat == IOMMU_DEV_FEAT_SVA) {
+ struct device_domain_info *info = get_domain_info(dev);
+
+ return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
+ info->pasid_supported && info->pri_supported &&
+ info->ats_supported;
+ }
+
+ return false;
+}
+
+static int
+intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
+{
+ if (feat == IOMMU_DEV_FEAT_AUX)
+ return intel_iommu_enable_auxd(dev);
+
+ if (feat == IOMMU_DEV_FEAT_SVA) {
+ struct device_domain_info *info = get_domain_info(dev);
+
+ if (!info)
+ return -EINVAL;
+
+ if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
+ return 0;
+ }
+
+ return -ENODEV;
+}
+
+static int
+intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
+{
+ if (feat == IOMMU_DEV_FEAT_AUX)
+ return intel_iommu_disable_auxd(dev);
+
+ return -ENODEV;
+}
+
+static bool
+intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
+{
+ struct device_domain_info *info = get_domain_info(dev);
+
+ if (feat == IOMMU_DEV_FEAT_AUX)
+ return scalable_mode_support() && info && info->auxd_enabled;
+
+ return false;
+}
+
+static int
+intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+
+ return dmar_domain->default_pasid > 0 ?
+ dmar_domain->default_pasid : -EINVAL;
+}
+
+static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
+ struct device *dev)
+{
+ return attach_deferred(dev);
+}
+
+static int
+intel_iommu_domain_set_attr(struct iommu_domain *domain,
+ enum iommu_attr attr, void *data)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ unsigned long flags;
+ int ret = 0;
+
+ if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+ return -EINVAL;
+
+ switch (attr) {
+ case DOMAIN_ATTR_NESTING:
+ spin_lock_irqsave(&device_domain_lock, flags);
+ if (nested_mode_support() &&
+ list_empty(&dmar_domain->devices)) {
+ dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
+ dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
+ } else {
+ ret = -ENODEV;
+ }
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+const struct iommu_ops intel_iommu_ops = {
+ .capable = intel_iommu_capable,
+ .domain_alloc = intel_iommu_domain_alloc,
+ .domain_free = intel_iommu_domain_free,
+ .domain_set_attr = intel_iommu_domain_set_attr,
+ .attach_dev = intel_iommu_attach_device,
+ .detach_dev = intel_iommu_detach_device,
+ .aux_attach_dev = intel_iommu_aux_attach_device,
+ .aux_detach_dev = intel_iommu_aux_detach_device,
+ .aux_get_pasid = intel_iommu_aux_get_pasid,
+ .map = intel_iommu_map,
+ .unmap = intel_iommu_unmap,
+ .iova_to_phys = intel_iommu_iova_to_phys,
+ .probe_device = intel_iommu_probe_device,
+ .probe_finalize = intel_iommu_probe_finalize,
+ .release_device = intel_iommu_release_device,
+ .get_resv_regions = intel_iommu_get_resv_regions,
+ .put_resv_regions = generic_iommu_put_resv_regions,
+ .apply_resv_region = intel_iommu_apply_resv_region,
+ .device_group = intel_iommu_device_group,
+ .dev_has_feat = intel_iommu_dev_has_feat,
+ .dev_feat_enabled = intel_iommu_dev_feat_enabled,
+ .dev_enable_feat = intel_iommu_dev_enable_feat,
+ .dev_disable_feat = intel_iommu_dev_disable_feat,
+ .is_attach_deferred = intel_iommu_is_attach_deferred,
+ .def_domain_type = device_def_domain_type,
+ .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ .cache_invalidate = intel_iommu_sva_invalidate,
+ .sva_bind_gpasid = intel_svm_bind_gpasid,
+ .sva_unbind_gpasid = intel_svm_unbind_gpasid,
+ .sva_bind = intel_svm_bind,
+ .sva_unbind = intel_svm_unbind,
+ .sva_get_pasid = intel_svm_get_pasid,
+#endif
+};
+
+static void quirk_iommu_igfx(struct pci_dev *dev)
+{
+ pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
+ dmar_map_gfx = 0;
+}
+
+/* G4x/GM45 integrated gfx dmar support is totally busted. */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
+
+/* Broadwell igfx malfunctions with dmar */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
+
+static void quirk_iommu_rwbf(struct pci_dev *dev)
+{
+ /*
+ * Mobile 4 Series Chipset neglects to set RWBF capability,
+ * but needs it. Same seems to hold for the desktop versions.
+ */
+ pci_info(dev, "Forcing write-buffer flush capability\n");
+ rwbf_quirk = 1;
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
+
+#define GGC 0x52
+#define GGC_MEMORY_SIZE_MASK (0xf << 8)
+#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
+#define GGC_MEMORY_SIZE_1M (0x1 << 8)
+#define GGC_MEMORY_SIZE_2M (0x3 << 8)
+#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
+#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
+#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
+#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
+
+static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
+{
+ unsigned short ggc;
+
+ if (pci_read_config_word(dev, GGC, &ggc))
+ return;
+
+ if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
+ pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
+ dmar_map_gfx = 0;
+ } else if (dmar_map_gfx) {
+ /* we have to ensure the gfx device is idle before we flush */
+ pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
+ intel_iommu_strict = 1;
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
+
+/*
+ * On Tylersburg chipsets, some BIOSes have been known to enable the
+ * ISOCH DMAR unit for the Azalia sound device, but not give it any
+ * TLB entries, which causes it to deadlock. Check for that. We do
+ * this in a function called from init_dmars(), instead of in a PCI
+ * quirk, because we don't want to print the obnoxious "BIOS broken"
+ * message if VT-d is actually disabled.
+ */
+static void __init check_tylersburg_isoch(void)
+{
+ struct pci_dev *pdev;
+ uint32_t vtisochctrl;
+
+ /* If there's no Azalia in the system anyway, forget it. */
+ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
+ if (!pdev)
+ return;
+ pci_dev_put(pdev);
+
+	/*
+	 * System Management Registers. Might be hidden, in which case
+	 * we can't do the sanity check. But that's OK, because the
+	 * known-broken BIOSes _don't_ actually hide it, so far.
+	 */
+ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
+ if (!pdev)
+ return;
+
+ if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
+ pci_dev_put(pdev);
+ return;
+ }
+
+ pci_dev_put(pdev);
+
+ /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
+ if (vtisochctrl & 1)
+ return;
+
+ /* Drop all bits other than the number of TLB entries */
+ vtisochctrl &= 0x1c;
+
+ /* If we have the recommended number of TLB entries (16), fine. */
+ if (vtisochctrl == 0x10)
+ return;
+
+ /* Zero TLB entries? You get to ride the short bus to school. */
+ if (!vtisochctrl) {
+ WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
+ "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+ dmi_get_system_info(DMI_BIOS_VENDOR),
+ dmi_get_system_info(DMI_BIOS_VERSION),
+ dmi_get_system_info(DMI_PRODUCT_VERSION));
+ iommu_identity_mapping |= IDENTMAP_AZALIA;
+ return;
+ }
+
+ pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
+ vtisochctrl);
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "DMAR-IR: " fmt
+
+#include <linux/interrupt.h>
+#include <linux/dmar.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/hpet.h>
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <linux/intel-iommu.h>
+#include <linux/acpi.h>
+#include <linux/irqdomain.h>
+#include <linux/crash_dump.h>
+#include <asm/io_apic.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/irq_remapping.h>
+#include <asm/pci-direct.h>
+#include <asm/msidef.h>
+
+#include "../irq_remapping.h"
+
+enum irq_mode {
+ IRQ_REMAPPING,
+ IRQ_POSTING,
+};
+
+struct ioapic_scope {
+ struct intel_iommu *iommu;
+ unsigned int id;
+ unsigned int bus; /* PCI bus number */
+ unsigned int devfn; /* PCI devfn number */
+};
+
+struct hpet_scope {
+ struct intel_iommu *iommu;
+ u8 id;
+ unsigned int bus;
+ unsigned int devfn;
+};
+
+struct irq_2_iommu {
+ struct intel_iommu *iommu;
+ u16 irte_index;
+ u16 sub_handle;
+ u8 irte_mask;
+ enum irq_mode mode;
+};
+
+struct intel_ir_data {
+ struct irq_2_iommu irq_2_iommu;
+ struct irte irte_entry;
+ union {
+ struct msi_msg msi_entry;
+ };
+};
+
+#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
+#define IRTE_DEST(dest) ((eim_mode) ? dest : dest << 8)
+
+static int __read_mostly eim_mode;
+static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
+static struct hpet_scope ir_hpet[MAX_HPET_TBS];
+
+/*
+ * Lock ordering:
+ * ->dmar_global_lock
+ *	->irq_2_ir_lock
+ *		->qi->q_lock
+ *			->iommu->register_lock
+ * Note:
+ * intel_irq_remap_ops.{supported,prepare,enable,disable,reenable} are called
+ * in a single-threaded environment with interrupts disabled, so there is no
+ * need to take the dmar_global_lock.
+ */
+DEFINE_RAW_SPINLOCK(irq_2_ir_lock);
+static const struct irq_domain_ops intel_ir_domain_ops;
+
+static void iommu_disable_irq_remapping(struct intel_iommu *iommu);
+static int __init parse_ioapics_under_ir(void);
+
+static bool ir_pre_enabled(struct intel_iommu *iommu)
+{
+ return (iommu->flags & VTD_FLAG_IRQ_REMAP_PRE_ENABLED);
+}
+
+static void clear_ir_pre_enabled(struct intel_iommu *iommu)
+{
+ iommu->flags &= ~VTD_FLAG_IRQ_REMAP_PRE_ENABLED;
+}
+
+static void init_ir_status(struct intel_iommu *iommu)
+{
+ u32 gsts;
+
+ gsts = readl(iommu->reg + DMAR_GSTS_REG);
+ if (gsts & DMA_GSTS_IRES)
+ iommu->flags |= VTD_FLAG_IRQ_REMAP_PRE_ENABLED;
+}
+
+static int alloc_irte(struct intel_iommu *iommu,
+ struct irq_2_iommu *irq_iommu, u16 count)
+{
+ struct ir_table *table = iommu->ir_table;
+ unsigned int mask = 0;
+ unsigned long flags;
+ int index;
+
+ if (!count || !irq_iommu)
+ return -1;
+
+ if (count > 1) {
+ count = __roundup_pow_of_two(count);
+ mask = ilog2(count);
+ }
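+	/*
+	 * Example (hypothetical request): asking for 5 IRTEs rounds count up
+	 * to 8 and sets mask = 3, i.e. a naturally aligned block of 8 entries
+	 * is reserved in the table below.
+	 */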
+
+ if (mask > ecap_max_handle_mask(iommu->ecap)) {
+		pr_err("Requested mask %x exceeds the max invalidation handle mask value %Lx\n",
+		       mask, ecap_max_handle_mask(iommu->ecap));
+ return -1;
+ }
+
+ raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+ index = bitmap_find_free_region(table->bitmap,
+ INTR_REMAP_TABLE_ENTRIES, mask);
+ if (index < 0) {
+ pr_warn("IR%d: can't allocate an IRTE\n", iommu->seq_id);
+ } else {
+ irq_iommu->iommu = iommu;
+ irq_iommu->irte_index = index;
+ irq_iommu->sub_handle = 0;
+ irq_iommu->irte_mask = mask;
+ irq_iommu->mode = IRQ_REMAPPING;
+ }
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+ return index;
+}
+
+static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
+{
+ struct qi_desc desc;
+
+ desc.qw0 = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask)
+ | QI_IEC_SELECTIVE;
+ desc.qw1 = 0;
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ return qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+static int modify_irte(struct irq_2_iommu *irq_iommu,
+ struct irte *irte_modified)
+{
+ struct intel_iommu *iommu;
+ unsigned long flags;
+ struct irte *irte;
+ int rc, index;
+
+ if (!irq_iommu)
+ return -1;
+
+ raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+
+ iommu = irq_iommu->iommu;
+
+ index = irq_iommu->irte_index + irq_iommu->sub_handle;
+ irte = &iommu->ir_table->base[index];
+
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE)
+ if ((irte->pst == 1) || (irte_modified->pst == 1)) {
+ bool ret;
+
+ ret = cmpxchg_double(&irte->low, &irte->high,
+ irte->low, irte->high,
+ irte_modified->low, irte_modified->high);
+ /*
+ * We use cmpxchg16 to atomically update the 128-bit IRTE,
+ * and it cannot be updated by the hardware or other processors
+ * behind us, so the return value of cmpxchg16 should be the
+ * same as the old value.
+ */
+ WARN_ON(!ret);
+ } else
+#endif
+ {
+ set_64bit(&irte->low, irte_modified->low);
+ set_64bit(&irte->high, irte_modified->high);
+ }
+ __iommu_flush_cache(iommu, irte, sizeof(*irte));
+
+ rc = qi_flush_iec(iommu, index, 0);
+
+ /* Update iommu mode according to the IRTE mode */
+ irq_iommu->mode = irte->pst ? IRQ_POSTING : IRQ_REMAPPING;
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+ return rc;
+}
+
+static struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
+{
+ int i;
+
+ for (i = 0; i < MAX_HPET_TBS; i++)
+ if (ir_hpet[i].id == hpet_id && ir_hpet[i].iommu)
+ return ir_hpet[i].iommu;
+ return NULL;
+}
+
+static struct intel_iommu *map_ioapic_to_ir(int apic)
+{
+ int i;
+
+ for (i = 0; i < MAX_IO_APICS; i++)
+ if (ir_ioapic[i].id == apic && ir_ioapic[i].iommu)
+ return ir_ioapic[i].iommu;
+ return NULL;
+}
+
+static struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
+{
+ struct dmar_drhd_unit *drhd;
+
+ drhd = dmar_find_matched_drhd_unit(dev);
+ if (!drhd)
+ return NULL;
+
+ return drhd->iommu;
+}
+
+static int clear_entries(struct irq_2_iommu *irq_iommu)
+{
+ struct irte *start, *entry, *end;
+ struct intel_iommu *iommu;
+ int index;
+
+ if (irq_iommu->sub_handle)
+ return 0;
+
+ iommu = irq_iommu->iommu;
+ index = irq_iommu->irte_index;
+
+ start = iommu->ir_table->base + index;
+ end = start + (1 << irq_iommu->irte_mask);
+
+ for (entry = start; entry < end; entry++) {
+ set_64bit(&entry->low, 0);
+ set_64bit(&entry->high, 0);
+ }
+ bitmap_release_region(iommu->ir_table->bitmap, index,
+ irq_iommu->irte_mask);
+
+ return qi_flush_iec(iommu, index, irq_iommu->irte_mask);
+}
+
+/*
+ * source validation type
+ */
+#define SVT_NO_VERIFY 0x0 /* no verification is required */
+#define SVT_VERIFY_SID_SQ 0x1 /* verify using SID and SQ fields */
+#define SVT_VERIFY_BUS 0x2 /* verify bus of request-id */
+
+/*
+ * source-id qualifier
+ */
+#define SQ_ALL_16 0x0 /* verify all 16 bits of request-id */
+#define SQ_13_IGNORE_1 0x1 /* verify most significant 13 bits, ignore
+ * the third least significant bit
+ */
+#define SQ_13_IGNORE_2 0x2 /* verify most significant 13 bits, ignore
+ * the second and third least significant bits
+ */
+#define SQ_13_IGNORE_3 0x3 /* verify most significant 13 bits, ignore
+ * the least three significant bits
+ */
+
+/*
+ * set SVT, SQ and SID fields of irte to verify
+ * source ids of interrupt requests
+ */
+static void set_irte_sid(struct irte *irte, unsigned int svt,
+ unsigned int sq, unsigned int sid)
+{
+ if (disable_sourceid_checking)
+ svt = SVT_NO_VERIFY;
+ irte->svt = svt;
+ irte->sq = sq;
+ irte->sid = sid;
+}
+
+/*
+ * Set an IRTE to match only the bus number. Interrupt requests that reference
+ * this IRTE must have a requester-id whose bus number lies in the range from
+ * start_bus to end_bus, inclusive.
+ */
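+/*
+ * Hypothetical example: set_irte_verify_bus(irte, 0x10, 0x1f) encodes the
+ * SID as 0x101f with SVT_VERIFY_BUS, telling the hardware to accept
+ * requester IDs from bus 0x10 through bus 0x1f while ignoring the devfn
+ * part.
+ */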
+static void set_irte_verify_bus(struct irte *irte, unsigned int start_bus,
+ unsigned int end_bus)
+{
+ set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16,
+ (start_bus << 8) | end_bus);
+}
+
+static int set_ioapic_sid(struct irte *irte, int apic)
+{
+ int i;
+ u16 sid = 0;
+
+ if (!irte)
+ return -1;
+
+ down_read(&dmar_global_lock);
+ for (i = 0; i < MAX_IO_APICS; i++) {
+ if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) {
+ sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
+ break;
+ }
+ }
+ up_read(&dmar_global_lock);
+
+ if (sid == 0) {
+ pr_warn("Failed to set source-id of IOAPIC (%d)\n", apic);
+ return -1;
+ }
+
+ set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, sid);
+
+ return 0;
+}
+
+static int set_hpet_sid(struct irte *irte, u8 id)
+{
+ int i;
+ u16 sid = 0;
+
+ if (!irte)
+ return -1;
+
+ down_read(&dmar_global_lock);
+ for (i = 0; i < MAX_HPET_TBS; i++) {
+ if (ir_hpet[i].iommu && ir_hpet[i].id == id) {
+ sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
+ break;
+ }
+ }
+ up_read(&dmar_global_lock);
+
+ if (sid == 0) {
+ pr_warn("Failed to set source-id of HPET block (%d)\n", id);
+ return -1;
+ }
+
+ /*
+ * Should really use SQ_ALL_16. Some platforms are broken.
+ * While we figure out the right quirks for these broken platforms, use
+ * SQ_13_IGNORE_3 for now.
+ */
+ set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid);
+
+ return 0;
+}
+
+struct set_msi_sid_data {
+ struct pci_dev *pdev;
+ u16 alias;
+ int count;
+ int busmatch_count;
+};
+
+static int set_msi_sid_cb(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+ struct set_msi_sid_data *data = opaque;
+
+ if (data->count == 0 || PCI_BUS_NUM(alias) == PCI_BUS_NUM(data->alias))
+ data->busmatch_count++;
+
+ data->pdev = pdev;
+ data->alias = alias;
+ data->count++;
+
+ return 0;
+}
+
+static int set_msi_sid(struct irte *irte, struct pci_dev *dev)
+{
+ struct set_msi_sid_data data;
+
+ if (!irte || !dev)
+ return -1;
+
+ data.count = 0;
+ data.busmatch_count = 0;
+ pci_for_each_dma_alias(dev, set_msi_sid_cb, &data);
+
+	/*
+	 * DMA alias provides us with a PCI device and alias. The only case
+	 * where it will return an alias on a different bus than the device
+	 * is the case of a PCIe-to-PCI bridge, where the alias is for the
+	 * subordinate bus. In this case we can only verify the bus.
+	 *
+	 * If there are multiple aliases, all with the same bus number,
+	 * then all we can do is verify the bus. This is typical in NTB
+	 * hardware which uses proxy IDs where the device will generate
+	 * traffic from multiple devfn numbers on the same bus.
+	 *
+	 * If the alias device is on a different bus than our source device
+	 * then we have a topology-based alias, use it.
+	 *
+	 * Otherwise, the alias is for a device DMA quirk and we cannot
+	 * assume that MSI uses the same requester ID. Therefore use the
+	 * original device.
+	 */
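+	/*
+	 * Sketch of the resulting checks (topologies are hypothetical): a
+	 * device whose alias reports the subordinate bus of a PCIe-to-PCI
+	 * bridge gets bus-range verification only, while a device with no
+	 * DMA aliases at all (count == 1) ends up in the final branch and
+	 * is verified against its exact requester ID via pci_dev_id().
+	 */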
+ if (PCI_BUS_NUM(data.alias) != data.pdev->bus->number)
+ set_irte_verify_bus(irte, PCI_BUS_NUM(data.alias),
+ dev->bus->number);
+ else if (data.count >= 2 && data.busmatch_count == data.count)
+ set_irte_verify_bus(irte, dev->bus->number, dev->bus->number);
+ else if (data.pdev->bus->number != dev->bus->number)
+ set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, data.alias);
+ else
+ set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
+ pci_dev_id(dev));
+
+ return 0;
+}
+
+static int iommu_load_old_irte(struct intel_iommu *iommu)
+{
+ struct irte *old_ir_table;
+ phys_addr_t irt_phys;
+ unsigned int i;
+ size_t size;
+ u64 irta;
+
+ /* Check whether the old ir-table has the same size as ours */
+ irta = dmar_readq(iommu->reg + DMAR_IRTA_REG);
+ if ((irta & INTR_REMAP_TABLE_REG_SIZE_MASK)
+ != INTR_REMAP_TABLE_REG_SIZE)
+ return -EINVAL;
+
+ irt_phys = irta & VTD_PAGE_MASK;
+ size = INTR_REMAP_TABLE_ENTRIES*sizeof(struct irte);
+
+ /* Map the old IR table */
+ old_ir_table = memremap(irt_phys, size, MEMREMAP_WB);
+ if (!old_ir_table)
+ return -ENOMEM;
+
+ /* Copy data over */
+ memcpy(iommu->ir_table->base, old_ir_table, size);
+
+ __iommu_flush_cache(iommu, iommu->ir_table->base, size);
+
+ /*
+ * Now check the table for used entries and mark those as
+ * allocated in the bitmap
+ */
+ for (i = 0; i < INTR_REMAP_TABLE_ENTRIES; i++) {
+ if (iommu->ir_table->base[i].present)
+ bitmap_set(iommu->ir_table->bitmap, i, 1);
+ }
+
+ memunmap(old_ir_table);
+
+ return 0;
+}
+
+
+static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode)
+{
+ unsigned long flags;
+ u64 addr;
+ u32 sts;
+
+ addr = virt_to_phys((void *)iommu->ir_table->base);
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ dmar_writeq(iommu->reg + DMAR_IRTA_REG,
+ (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE);
+
+ /* Set interrupt-remapping table pointer */
+ writel(iommu->gcmd | DMA_GCMD_SIRTP, iommu->reg + DMAR_GCMD_REG);
+
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_IRTPS), sts);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+ /*
+ * Global invalidation of interrupt entry cache to make sure the
+ * hardware uses the new irq remapping table.
+ */
+ qi_global_iec(iommu);
+}
+
+static void iommu_enable_irq_remapping(struct intel_iommu *iommu)
+{
+ unsigned long flags;
+ u32 sts;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ /* Enable interrupt-remapping */
+ iommu->gcmd |= DMA_GCMD_IRE;
+ iommu->gcmd &= ~DMA_GCMD_CFI; /* Block compatibility-format MSIs */
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_IRES), sts);
+
+ /*
+ * With CFI clear in the Global Command register, we should be
+ * protected from dangerous (i.e. compatibility) interrupts
+ * regardless of x2apic status. Check just to be sure.
+ */
+ if (sts & DMA_GSTS_CFIS)
+ WARN(1, KERN_WARNING
+ "Compatibility-format IRQs enabled despite intr remapping;\n"
+ "you are vulnerable to IRQ injection.\n");
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static int intel_setup_irq_remapping(struct intel_iommu *iommu)
+{
+ struct ir_table *ir_table;
+ struct fwnode_handle *fn;
+ unsigned long *bitmap;
+ struct page *pages;
+
+ if (iommu->ir_table)
+ return 0;
+
+ ir_table = kzalloc(sizeof(struct ir_table), GFP_KERNEL);
+ if (!ir_table)
+ return -ENOMEM;
+
+ pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO,
+ INTR_REMAP_PAGE_ORDER);
+ if (!pages) {
+ pr_err("IR%d: failed to allocate pages of order %d\n",
+ iommu->seq_id, INTR_REMAP_PAGE_ORDER);
+ goto out_free_table;
+ }
+
+ bitmap = bitmap_zalloc(INTR_REMAP_TABLE_ENTRIES, GFP_ATOMIC);
+ if (bitmap == NULL) {
+ pr_err("IR%d: failed to allocate bitmap\n", iommu->seq_id);
+ goto out_free_pages;
+ }
+
+ fn = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id);
+ if (!fn)
+ goto out_free_bitmap;
+
+ iommu->ir_domain =
+ irq_domain_create_hierarchy(arch_get_ir_parent_domain(),
+ 0, INTR_REMAP_TABLE_ENTRIES,
+ fn, &intel_ir_domain_ops,
+ iommu);
+ irq_domain_free_fwnode(fn);
+ if (!iommu->ir_domain) {
+ pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id);
+ goto out_free_bitmap;
+ }
+ iommu->ir_msi_domain =
+ arch_create_remap_msi_irq_domain(iommu->ir_domain,
+ "INTEL-IR-MSI",
+ iommu->seq_id);
+
+ ir_table->base = page_address(pages);
+ ir_table->bitmap = bitmap;
+ iommu->ir_table = ir_table;
+
+	/*
+	 * If queued invalidation is already initialized, we shouldn't
+	 * disable it.
+	 */
+ if (!iommu->qi) {
+ /*
+ * Clear previous faults.
+ */
+ dmar_fault(-1, iommu);
+ dmar_disable_qi(iommu);
+
+ if (dmar_enable_qi(iommu)) {
+ pr_err("Failed to enable queued invalidation\n");
+ goto out_free_bitmap;
+ }
+ }
+
+ init_ir_status(iommu);
+
+ if (ir_pre_enabled(iommu)) {
+ if (!is_kdump_kernel()) {
+ pr_warn("IRQ remapping was enabled on %s but we are not in kdump mode\n",
+ iommu->name);
+ clear_ir_pre_enabled(iommu);
+ iommu_disable_irq_remapping(iommu);
+ } else if (iommu_load_old_irte(iommu))
+ pr_err("Failed to copy IR table for %s from previous kernel\n",
+ iommu->name);
+ else
+ pr_info("Copied IR table for %s from previous kernel\n",
+ iommu->name);
+ }
+
+ iommu_set_irq_remapping(iommu, eim_mode);
+
+ return 0;
+
+out_free_bitmap:
+ bitmap_free(bitmap);
+out_free_pages:
+ __free_pages(pages, INTR_REMAP_PAGE_ORDER);
+out_free_table:
+ kfree(ir_table);
+
+ iommu->ir_table = NULL;
+
+ return -ENOMEM;
+}
+
+static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
+{
+ if (iommu && iommu->ir_table) {
+ if (iommu->ir_msi_domain) {
+ irq_domain_remove(iommu->ir_msi_domain);
+ iommu->ir_msi_domain = NULL;
+ }
+ if (iommu->ir_domain) {
+ irq_domain_remove(iommu->ir_domain);
+ iommu->ir_domain = NULL;
+ }
+ free_pages((unsigned long)iommu->ir_table->base,
+ INTR_REMAP_PAGE_ORDER);
+ bitmap_free(iommu->ir_table->bitmap);
+ kfree(iommu->ir_table);
+ iommu->ir_table = NULL;
+ }
+}
+
+/*
+ * Disable Interrupt Remapping.
+ */
+static void iommu_disable_irq_remapping(struct intel_iommu *iommu)
+{
+ unsigned long flags;
+ u32 sts;
+
+ if (!ecap_ir_support(iommu->ecap))
+ return;
+
+ /*
+ * global invalidation of interrupt entry cache before disabling
+ * interrupt-remapping.
+ */
+ qi_global_iec(iommu);
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ sts = readl(iommu->reg + DMAR_GSTS_REG);
+ if (!(sts & DMA_GSTS_IRES))
+ goto end;
+
+ iommu->gcmd &= ~DMA_GCMD_IRE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, !(sts & DMA_GSTS_IRES), sts);
+
+end:
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static int __init dmar_x2apic_optout(void)
+{
+ struct acpi_table_dmar *dmar;
+ dmar = (struct acpi_table_dmar *)dmar_tbl;
+ if (!dmar || no_x2apic_optout)
+ return 0;
+ return dmar->flags & DMAR_X2APIC_OPT_OUT;
+}
+
+static void __init intel_cleanup_irq_remapping(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+
+ for_each_iommu(iommu, drhd) {
+ if (ecap_ir_support(iommu->ecap)) {
+ iommu_disable_irq_remapping(iommu);
+ intel_teardown_irq_remapping(iommu);
+ }
+ }
+
+ if (x2apic_supported())
+ pr_warn("Failed to enable irq remapping. You are vulnerable to irq-injection attacks.\n");
+}
+
+static int __init intel_prepare_irq_remapping(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ int eim = 0;
+
+ if (irq_remap_broken) {
+ pr_warn("This system BIOS has enabled interrupt remapping\n"
+ "on a chipset that contains an erratum making that\n"
+ "feature unstable. To maintain system stability\n"
+ "interrupt remapping is being disabled. Please\n"
+ "contact your BIOS vendor for an update\n");
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ return -ENODEV;
+ }
+
+ if (dmar_table_init() < 0)
+ return -ENODEV;
+
+ if (!dmar_ir_support())
+ return -ENODEV;
+
+ if (parse_ioapics_under_ir()) {
+ pr_info("Not enabling interrupt remapping\n");
+ goto error;
+ }
+
+ /* First make sure all IOMMUs support IRQ remapping */
+ for_each_iommu(iommu, drhd)
+ if (!ecap_ir_support(iommu->ecap))
+ goto error;
+
+ /* Detect remapping mode: lapic or x2apic */
+ if (x2apic_supported()) {
+ eim = !dmar_x2apic_optout();
+ if (!eim) {
+ pr_info("x2apic is disabled because BIOS sets x2apic opt out bit.");
+ pr_info("Use 'intremap=no_x2apic_optout' to override the BIOS setting.\n");
+ }
+ }
+
+ for_each_iommu(iommu, drhd) {
+ if (eim && !ecap_eim_support(iommu->ecap)) {
+ pr_info("%s does not support EIM\n", iommu->name);
+ eim = 0;
+ }
+ }
+
+ eim_mode = eim;
+ if (eim)
+ pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n");
+
+ /* Do the initializations early */
+ for_each_iommu(iommu, drhd) {
+ if (intel_setup_irq_remapping(iommu)) {
+ pr_err("Failed to setup irq remapping for %s\n",
+ iommu->name);
+ goto error;
+ }
+ }
+
+ return 0;
+
+error:
+ intel_cleanup_irq_remapping();
+ return -ENODEV;
+}
+
+/*
+ * Set Posted-Interrupts capability.
+ */
+static inline void set_irq_posting_cap(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+
+ if (!disable_irq_post) {
+		/*
+		 * If the IRTE is in posted format, the 'pda' field goes across
+		 * the 64-bit boundary, so we need cmpxchg16b to update it
+		 * atomically. We only expose posted interrupts when
+		 * X86_FEATURE_CX16 is supported. In practice, hardware
+		 * platforms supporting PI should have X86_FEATURE_CX16
+		 * support; this has been confirmed with Intel hardware guys.
+		 */
+ if (boot_cpu_has(X86_FEATURE_CX16))
+ intel_irq_remap_ops.capability |= 1 << IRQ_POSTING_CAP;
+
+ for_each_iommu(iommu, drhd)
+ if (!cap_pi_support(iommu->cap)) {
+ intel_irq_remap_ops.capability &=
+ ~(1 << IRQ_POSTING_CAP);
+ break;
+ }
+ }
+}
+
+static int __init intel_enable_irq_remapping(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool setup = false;
+
+ /*
+ * Setup Interrupt-remapping for all the DRHD's now.
+ */
+ for_each_iommu(iommu, drhd) {
+ if (!ir_pre_enabled(iommu))
+ iommu_enable_irq_remapping(iommu);
+ setup = true;
+ }
+
+ if (!setup)
+ goto error;
+
+ irq_remapping_enabled = 1;
+
+ set_irq_posting_cap();
+
+ pr_info("Enabled IRQ remapping in %s mode\n", eim_mode ? "x2apic" : "xapic");
+
+ return eim_mode ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;
+
+error:
+ intel_cleanup_irq_remapping();
+ return -1;
+}
+
+static int ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope,
+ struct intel_iommu *iommu,
+ struct acpi_dmar_hardware_unit *drhd)
+{
+ struct acpi_dmar_pci_path *path;
+ u8 bus;
+ int count, free = -1;
+
+ bus = scope->bus;
+ path = (struct acpi_dmar_pci_path *)(scope + 1);
+ count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+ / sizeof(struct acpi_dmar_pci_path);
+
+ while (--count > 0) {
+		/*
+		 * Access PCI config space directly because the PCI
+		 * subsystem isn't initialized yet.
+		 */
+ bus = read_pci_config_byte(bus, path->device, path->function,
+ PCI_SECONDARY_BUS);
+ path++;
+ }
+
+ for (count = 0; count < MAX_HPET_TBS; count++) {
+ if (ir_hpet[count].iommu == iommu &&
+ ir_hpet[count].id == scope->enumeration_id)
+ return 0;
+ else if (ir_hpet[count].iommu == NULL && free == -1)
+ free = count;
+ }
+ if (free == -1) {
+ pr_warn("Exceeded Max HPET blocks\n");
+ return -ENOSPC;
+ }
+
+ ir_hpet[free].iommu = iommu;
+ ir_hpet[free].id = scope->enumeration_id;
+ ir_hpet[free].bus = bus;
+ ir_hpet[free].devfn = PCI_DEVFN(path->device, path->function);
+ pr_info("HPET id %d under DRHD base 0x%Lx\n",
+ scope->enumeration_id, drhd->address);
+
+ return 0;
+}
+
+static int ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
+ struct intel_iommu *iommu,
+ struct acpi_dmar_hardware_unit *drhd)
+{
+ struct acpi_dmar_pci_path *path;
+ u8 bus;
+ int count, free = -1;
+
+ bus = scope->bus;
+ path = (struct acpi_dmar_pci_path *)(scope + 1);
+ count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+ / sizeof(struct acpi_dmar_pci_path);
+
+ while (--count > 0) {
+		/*
+		 * Access PCI config space directly because the PCI
+		 * subsystem isn't initialized yet.
+		 */
+ bus = read_pci_config_byte(bus, path->device, path->function,
+ PCI_SECONDARY_BUS);
+ path++;
+ }
+
+ for (count = 0; count < MAX_IO_APICS; count++) {
+ if (ir_ioapic[count].iommu == iommu &&
+ ir_ioapic[count].id == scope->enumeration_id)
+ return 0;
+ else if (ir_ioapic[count].iommu == NULL && free == -1)
+ free = count;
+ }
+ if (free == -1) {
+ pr_warn("Exceeded Max IO APICS\n");
+ return -ENOSPC;
+ }
+
+ ir_ioapic[free].bus = bus;
+ ir_ioapic[free].devfn = PCI_DEVFN(path->device, path->function);
+ ir_ioapic[free].iommu = iommu;
+ ir_ioapic[free].id = scope->enumeration_id;
+ pr_info("IOAPIC id %d under DRHD base 0x%Lx IOMMU %d\n",
+ scope->enumeration_id, drhd->address, iommu->seq_id);
+
+ return 0;
+}
+
+static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
+ struct intel_iommu *iommu)
+{
+ int ret = 0;
+ struct acpi_dmar_hardware_unit *drhd;
+ struct acpi_dmar_device_scope *scope;
+ void *start, *end;
+
+ drhd = (struct acpi_dmar_hardware_unit *)header;
+ start = (void *)(drhd + 1);
+ end = ((void *)drhd) + header->length;
+
+ while (start < end && ret == 0) {
+ scope = start;
+ if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC)
+ ret = ir_parse_one_ioapic_scope(scope, iommu, drhd);
+ else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET)
+ ret = ir_parse_one_hpet_scope(scope, iommu, drhd);
+ start += scope->length;
+ }
+
+ return ret;
+}
+
+static void ir_remove_ioapic_hpet_scope(struct intel_iommu *iommu)
+{
+ int i;
+
+ for (i = 0; i < MAX_HPET_TBS; i++)
+ if (ir_hpet[i].iommu == iommu)
+ ir_hpet[i].iommu = NULL;
+
+ for (i = 0; i < MAX_IO_APICS; i++)
+ if (ir_ioapic[i].iommu == iommu)
+ ir_ioapic[i].iommu = NULL;
+}
+
+/*
+ * Find the association between IOAPICs and their interrupt-remapping
+ * hardware units.
+ */
+static int __init parse_ioapics_under_ir(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool ir_supported = false;
+ int ioapic_idx;
+
+ for_each_iommu(iommu, drhd) {
+ int ret;
+
+ if (!ecap_ir_support(iommu->ecap))
+ continue;
+
+ ret = ir_parse_ioapic_hpet_scope(drhd->hdr, iommu);
+ if (ret)
+ return ret;
+
+ ir_supported = true;
+ }
+
+ if (!ir_supported)
+ return -ENODEV;
+
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+ int ioapic_id = mpc_ioapic_id(ioapic_idx);
+ if (!map_ioapic_to_ir(ioapic_id)) {
+			pr_err(FW_BUG "ioapic %d has no mapping iommu, interrupt remapping will be disabled\n",
+			       ioapic_id);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int __init ir_dev_scope_init(void)
+{
+ int ret;
+
+ if (!irq_remapping_enabled)
+ return 0;
+
+ down_write(&dmar_global_lock);
+ ret = dmar_dev_scope_init();
+ up_write(&dmar_global_lock);
+
+ return ret;
+}
+rootfs_initcall(ir_dev_scope_init);
+
+static void disable_irq_remapping(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu = NULL;
+
+ /*
+ * Disable Interrupt-remapping for all the DRHD's now.
+ */
+ for_each_iommu(iommu, drhd) {
+ if (!ecap_ir_support(iommu->ecap))
+ continue;
+
+ iommu_disable_irq_remapping(iommu);
+ }
+
+ /*
+ * Clear Posted-Interrupts capability.
+ */
+ if (!disable_irq_post)
+ intel_irq_remap_ops.capability &= ~(1 << IRQ_POSTING_CAP);
+}
+
+static int reenable_irq_remapping(int eim)
+{
+ struct dmar_drhd_unit *drhd;
+ bool setup = false;
+ struct intel_iommu *iommu = NULL;
+
+ for_each_iommu(iommu, drhd)
+ if (iommu->qi)
+ dmar_reenable_qi(iommu);
+
+ /*
+ * Setup Interrupt-remapping for all the DRHD's now.
+ */
+ for_each_iommu(iommu, drhd) {
+ if (!ecap_ir_support(iommu->ecap))
+ continue;
+
+ /* Set up interrupt remapping for iommu.*/
+ iommu_set_irq_remapping(iommu, eim);
+ iommu_enable_irq_remapping(iommu);
+ setup = true;
+ }
+
+ if (!setup)
+ goto error;
+
+ set_irq_posting_cap();
+
+ return 0;
+
+error:
+ /*
+ * handle error condition gracefully here!
+ */
+ return -1;
+}
+
+static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
+{
+ memset(irte, 0, sizeof(*irte));
+
+ irte->present = 1;
+ irte->dst_mode = apic->irq_dest_mode;
+	/*
+	 * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
+	 * actual level or edge trigger will be set up in the IO-APIC
+	 * RTE. This helps simplify level triggered irq migration.
+	 * For more details, see the comments (in io_apic.c) explaining IO-APIC
+	 * irq migration in the presence of interrupt-remapping.
+	 */
+ irte->trigger_mode = 0;
+ irte->dlvry_mode = apic->irq_delivery_mode;
+ irte->vector = vector;
+ irte->dest_id = IRTE_DEST(dest);
+ irte->redir_hint = 1;
+}
+
+static struct irq_domain *intel_get_ir_irq_domain(struct irq_alloc_info *info)
+{
+ struct intel_iommu *iommu = NULL;
+
+ if (!info)
+ return NULL;
+
+ switch (info->type) {
+ case X86_IRQ_ALLOC_TYPE_IOAPIC:
+ iommu = map_ioapic_to_ir(info->ioapic_id);
+ break;
+ case X86_IRQ_ALLOC_TYPE_HPET:
+ iommu = map_hpet_to_ir(info->hpet_id);
+ break;
+ case X86_IRQ_ALLOC_TYPE_MSI:
+ case X86_IRQ_ALLOC_TYPE_MSIX:
+ iommu = map_dev_to_ir(info->msi_dev);
+ break;
+ default:
+ BUG_ON(1);
+ break;
+ }
+
+ return iommu ? iommu->ir_domain : NULL;
+}
+
+static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info)
+{
+ struct intel_iommu *iommu;
+
+ if (!info)
+ return NULL;
+
+ switch (info->type) {
+ case X86_IRQ_ALLOC_TYPE_MSI:
+ case X86_IRQ_ALLOC_TYPE_MSIX:
+ iommu = map_dev_to_ir(info->msi_dev);
+ if (iommu)
+ return iommu->ir_msi_domain;
+ break;
+ default:
+ break;
+ }
+
+ return NULL;
+}
+
+struct irq_remap_ops intel_irq_remap_ops = {
+ .prepare = intel_prepare_irq_remapping,
+ .enable = intel_enable_irq_remapping,
+ .disable = disable_irq_remapping,
+ .reenable = reenable_irq_remapping,
+ .enable_faulting = enable_drhd_fault_handling,
+ .get_ir_irq_domain = intel_get_ir_irq_domain,
+ .get_irq_domain = intel_get_irq_domain,
+};
+
+static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
+{
+ struct intel_ir_data *ir_data = irqd->chip_data;
+ struct irte *irte = &ir_data->irte_entry;
+ struct irq_cfg *cfg = irqd_cfg(irqd);
+
+	/*
+	 * Atomically update the IRTE with the new destination and vector,
+	 * then flush the interrupt entry cache.
+	 */
+ irte->vector = cfg->vector;
+ irte->dest_id = IRTE_DEST(cfg->dest_apicid);
+
+ /* Update the hardware only if the interrupt is in remapped mode. */
+ if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
+ modify_irte(&ir_data->irq_2_iommu, irte);
+}
+
+/*
+ * Migrate the IO-APIC irq in the presence of intr-remapping.
+ *
+ * For both level and edge triggered irqs, migration is a simple atomic
+ * update (of vector and CPU destination) of the IRTE followed by a flush of
+ * the hardware cache.
+ *
+ * For level triggered irqs, we eliminate the io-apic RTE modification (with
+ * the updated vector information) by using a virtual vector (the io-apic pin
+ * number). The real vector that is used to interrupt the CPU comes from the
+ * interrupt-remapping table entry.
+ *
+ * As the migration is a simple atomic update of the IRTE, the same mechanism
+ * is used to migrate MSI irqs in the presence of interrupt-remapping.
+ */
+static int
+intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ struct irq_data *parent = data->parent_data;
+ struct irq_cfg *cfg = irqd_cfg(data);
+ int ret;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ intel_ir_reconfigure_irte(data, false);
+ /*
+ * After this point, all the interrupts will start arriving
+ * at the new destination. So, time to cleanup the previous
+ * vector allocation.
+ */
+ send_cleanup_vector(cfg);
+
+ return IRQ_SET_MASK_OK_DONE;
+}
+
+static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
+ struct msi_msg *msg)
+{
+ struct intel_ir_data *ir_data = irq_data->chip_data;
+
+ *msg = ir_data->msi_entry;
+}
+
+static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
+{
+ struct intel_ir_data *ir_data = data->chip_data;
+ struct vcpu_data *vcpu_pi_info = info;
+
+ /* stop posting interrupts, back to remapping mode */
+ if (!vcpu_pi_info) {
+ modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
+ } else {
+ struct irte irte_pi;
+
+ /*
+ * We are not caching the posted interrupt entry. We
+ * copy the data from the remapped entry and modify
+ * the fields which are relevant for posted mode. The
+ * cached remapped entry is used for switching back to
+ * remapped mode.
+ */
+ memset(&irte_pi, 0, sizeof(irte_pi));
+ dmar_copy_shared_irte(&irte_pi, &ir_data->irte_entry);
+
+ /* Update the posted mode fields */
+ irte_pi.p_pst = 1;
+ irte_pi.p_urgent = 0;
+ irte_pi.p_vector = vcpu_pi_info->vector;
+ irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >>
+ (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
+ irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
+ ~(-1UL << PDA_HIGH_BIT);
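+		/*
+		 * Note on the split above: the posted-interrupt descriptor is
+		 * 64-byte aligned, so the low 6 address bits are implied;
+		 * assuming the usual PDA_LOW_BIT/PDA_HIGH_BIT definitions,
+		 * pda_l carries address bits 31:6 and pda_h bits 63:32.
+		 */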
+
+ modify_irte(&ir_data->irq_2_iommu, &irte_pi);
+ }
+
+ return 0;
+}
+
+static struct irq_chip intel_ir_chip = {
+ .name = "INTEL-IR",
+ .irq_ack = apic_ack_irq,
+ .irq_set_affinity = intel_ir_set_affinity,
+ .irq_compose_msi_msg = intel_ir_compose_msi_msg,
+ .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
+};
+
+static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
+ struct irq_cfg *irq_cfg,
+ struct irq_alloc_info *info,
+ int index, int sub_handle)
+{
+ struct IR_IO_APIC_route_entry *entry;
+ struct irte *irte = &data->irte_entry;
+ struct msi_msg *msg = &data->msi_entry;
+
+ prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid);
+ switch (info->type) {
+ case X86_IRQ_ALLOC_TYPE_IOAPIC:
+ /* Set source-id of interrupt request */
+ set_ioapic_sid(irte, info->ioapic_id);
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n",
+ info->ioapic_id, irte->present, irte->fpd,
+ irte->dst_mode, irte->redir_hint,
+ irte->trigger_mode, irte->dlvry_mode,
+ irte->avail, irte->vector, irte->dest_id,
+ irte->sid, irte->sq, irte->svt);
+
+ entry = (struct IR_IO_APIC_route_entry *)info->ioapic_entry;
+ info->ioapic_entry = NULL;
+ memset(entry, 0, sizeof(*entry));
+ entry->index2 = (index >> 15) & 0x1;
+ entry->zero = 0;
+ entry->format = 1;
+ entry->index = (index & 0x7fff);
+ /*
+ * IO-APIC RTE will be configured with virtual vector.
+ * irq handler will do the explicit EOI to the io-apic.
+ */
+ entry->vector = info->ioapic_pin;
+ entry->mask = 0; /* enable IRQ */
+ entry->trigger = info->ioapic_trigger;
+ entry->polarity = info->ioapic_polarity;
+ if (info->ioapic_trigger)
+ entry->mask = 1; /* Mask level triggered irqs. */
+ break;
+
+ case X86_IRQ_ALLOC_TYPE_HPET:
+ case X86_IRQ_ALLOC_TYPE_MSI:
+ case X86_IRQ_ALLOC_TYPE_MSIX:
+ if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
+ set_hpet_sid(irte, info->hpet_id);
+ else
+ set_msi_sid(irte, info->msi_dev);
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+ msg->data = sub_handle;
+ msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+ MSI_ADDR_IR_SHV |
+ MSI_ADDR_IR_INDEX1(index) |
+ MSI_ADDR_IR_INDEX2(index);
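+		/*
+		 * Like the IO-APIC path above, the 16-bit handle is expected
+		 * to be split so that MSI_ADDR_IR_INDEX1() carries index bits
+		 * 14:0 and MSI_ADDR_IR_INDEX2() bit 15, with SHV set so the
+		 * sub-handle in msg->data selects the IRTE within the block.
+		 */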
+ break;
+
+ default:
+ BUG_ON(1);
+ break;
+ }
+}
+
+static void intel_free_irq_resources(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ struct intel_ir_data *data;
+ struct irq_2_iommu *irq_iommu;
+ unsigned long flags;
+ int i;
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ if (irq_data && irq_data->chip_data) {
+ data = irq_data->chip_data;
+ irq_iommu = &data->irq_2_iommu;
+ raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+ clear_entries(irq_iommu);
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+ irq_domain_reset_irq_data(irq_data);
+ kfree(data);
+ }
+ }
+}
+
+static int intel_irq_remapping_alloc(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs,
+ void *arg)
+{
+ struct intel_iommu *iommu = domain->host_data;
+ struct irq_alloc_info *info = arg;
+ struct intel_ir_data *data, *ird;
+ struct irq_data *irq_data;
+ struct irq_cfg *irq_cfg;
+ int i, ret, index;
+
+ if (!info || !iommu)
+ return -EINVAL;
+ if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI &&
+ info->type != X86_IRQ_ALLOC_TYPE_MSIX)
+ return -EINVAL;
+
+	/*
+	 * With IRQ remapping enabled, we don't need contiguous CPU vectors
+	 * to support multiple MSI interrupts.
+	 */
+ if (info->type == X86_IRQ_ALLOC_TYPE_MSI)
+ info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out_free_parent;
+
+ down_read(&dmar_global_lock);
+ index = alloc_irte(iommu, &data->irq_2_iommu, nr_irqs);
+ up_read(&dmar_global_lock);
+ if (index < 0) {
+ pr_warn("Failed to allocate IRTE\n");
+ kfree(data);
+ goto out_free_parent;
+ }
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ irq_cfg = irqd_cfg(irq_data);
+ if (!irq_data || !irq_cfg) {
+ ret = -EINVAL;
+ goto out_free_data;
+ }
+
+ if (i > 0) {
+ ird = kzalloc(sizeof(*ird), GFP_KERNEL);
+ if (!ird)
+ goto out_free_data;
+ /* Initialize the common data */
+ ird->irq_2_iommu = data->irq_2_iommu;
+ ird->irq_2_iommu.sub_handle = i;
+ } else {
+ ird = data;
+ }
+
+ irq_data->hwirq = (index << 16) + i;
+ irq_data->chip_data = ird;
+ irq_data->chip = &intel_ir_chip;
+ intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i);
+ irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
+ }
+ return 0;
+
+out_free_data:
+ intel_free_irq_resources(domain, virq, i);
+out_free_parent:
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+ return ret;
+}
+
+static void intel_irq_remapping_free(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ intel_free_irq_resources(domain, virq, nr_irqs);
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static int intel_irq_remapping_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool reserve)
+{
+ intel_ir_reconfigure_irte(irq_data, true);
+ return 0;
+}
+
+static void intel_irq_remapping_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct intel_ir_data *data = irq_data->chip_data;
+ struct irte entry;
+
+ memset(&entry, 0, sizeof(entry));
+ modify_irte(&data->irq_2_iommu, &entry);
+}
+
+static const struct irq_domain_ops intel_ir_domain_ops = {
+ .alloc = intel_irq_remapping_alloc,
+ .free = intel_irq_remapping_free,
+ .activate = intel_irq_remapping_activate,
+ .deactivate = intel_irq_remapping_deactivate,
+};
+
+/*
+ * Support of Interrupt Remapping Unit Hotplug
+ */
+static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu)
+{
+ int ret;
+ int eim = x2apic_enabled();
+
+ if (eim && !ecap_eim_support(iommu->ecap)) {
+ pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n",
+ iommu->reg_phys, iommu->ecap);
+ return -ENODEV;
+ }
+
+ if (ir_parse_ioapic_hpet_scope(dmaru->hdr, iommu)) {
+ pr_warn("DRHD %Lx: failed to parse managed IOAPIC/HPET\n",
+ iommu->reg_phys);
+ return -ENODEV;
+ }
+
+ /* TODO: check all IOAPICs are covered by IOMMU */
+
+ /* Setup Interrupt-remapping now. */
+ ret = intel_setup_irq_remapping(iommu);
+ if (ret) {
+ pr_err("Failed to setup irq remapping for %s\n",
+ iommu->name);
+ intel_teardown_irq_remapping(iommu);
+ ir_remove_ioapic_hpet_scope(iommu);
+ } else {
+ iommu_enable_irq_remapping(iommu);
+ }
+
+ return ret;
+}
+
+int dmar_ir_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
+{
+ int ret = 0;
+ struct intel_iommu *iommu = dmaru->iommu;
+
+ if (!irq_remapping_enabled)
+ return 0;
+ if (iommu == NULL)
+ return -EINVAL;
+ if (!ecap_ir_support(iommu->ecap))
+ return 0;
+ if (irq_remapping_cap(IRQ_POSTING_CAP) &&
+ !cap_pi_support(iommu->cap))
+ return -EBUSY;
+
+ if (insert) {
+ if (!iommu->ir_table)
+ ret = dmar_ir_add(dmaru, iommu);
+ } else {
+ if (iommu->ir_table) {
+ if (!bitmap_empty(iommu->ir_table->bitmap,
+ INTR_REMAP_TABLE_ENTRIES)) {
+ ret = -EBUSY;
+ } else {
+ iommu_disable_irq_remapping(iommu);
+ intel_teardown_irq_remapping(iommu);
+ ir_remove_ioapic_hpet_scope(iommu);
+ }
+ }
+ }
+
+ return ret;
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * intel-pasid.c - PASID idr, table and entry manipulation
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#define pr_fmt(fmt) "DMAR: " fmt
+
+#include <linux/bitops.h>
+#include <linux/cpufeature.h>
+#include <linux/dmar.h>
+#include <linux/intel-iommu.h>
+#include <linux/iommu.h>
+#include <linux/memory.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+#include <linux/spinlock.h>
+
+#include "intel-pasid.h"
+
+/*
+ * Intel IOMMU system wide PASID name space:
+ */
+static DEFINE_SPINLOCK(pasid_lock);
+u32 intel_pasid_max_id = PASID_MAX;
+
+int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid)
+{
+ unsigned long flags;
+ u8 status_code;
+ int ret = 0;
+ u64 res;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+ dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC);
+ IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq,
+ !(res & VCMD_VRSP_IP), res);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+ status_code = VCMD_VRSP_SC(res);
+ switch (status_code) {
+ case VCMD_VRSP_SC_SUCCESS:
+ *pasid = VCMD_VRSP_RESULT_PASID(res);
+ break;
+ case VCMD_VRSP_SC_NO_PASID_AVAIL:
+ pr_info("IOMMU: %s: No PASID available\n", iommu->name);
+ ret = -ENOSPC;
+ break;
+ default:
+ ret = -ENODEV;
+ pr_warn("IOMMU: %s: Unexpected error code %d\n",
+ iommu->name, status_code);
+ }
+
+ return ret;
+}
+
+void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid)
+{
+ unsigned long flags;
+ u8 status_code;
+ u64 res;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+ dmar_writeq(iommu->reg + DMAR_VCMD_REG,
+ VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE);
+ IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq,
+ !(res & VCMD_VRSP_IP), res);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+ status_code = VCMD_VRSP_SC(res);
+ switch (status_code) {
+ case VCMD_VRSP_SC_SUCCESS:
+ break;
+ case VCMD_VRSP_SC_INVALID_PASID:
+ pr_info("IOMMU: %s: Invalid PASID\n", iommu->name);
+ break;
+ default:
+ pr_warn("IOMMU: %s: Unexpected error code %d\n",
+ iommu->name, status_code);
+ }
+}
+
+/*
+ * Per device pasid table management:
+ */
+static inline void
+device_attach_pasid_table(struct device_domain_info *info,
+ struct pasid_table *pasid_table)
+{
+ info->pasid_table = pasid_table;
+ list_add(&info->table, &pasid_table->dev);
+}
+
+static inline void
+device_detach_pasid_table(struct device_domain_info *info,
+ struct pasid_table *pasid_table)
+{
+ info->pasid_table = NULL;
+ list_del(&info->table);
+}
+
+struct pasid_table_opaque {
+ struct pasid_table **pasid_table;
+ int segment;
+ int bus;
+ int devfn;
+};
+
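+/*
+ * Callback for for_each_device_domain(): returns 1 to stop the walk as soon
+ * as a device with a matching segment/bus/devfn already has a PASID table.
+ */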
+static int search_pasid_table(struct device_domain_info *info, void *opaque)
+{
+ struct pasid_table_opaque *data = opaque;
+
+ if (info->iommu->segment == data->segment &&
+ info->bus == data->bus &&
+ info->devfn == data->devfn &&
+ info->pasid_table) {
+ *data->pasid_table = info->pasid_table;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int get_alias_pasid_table(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+ struct pasid_table_opaque *data = opaque;
+
+ data->segment = pci_domain_nr(pdev->bus);
+ data->bus = PCI_BUS_NUM(alias);
+ data->devfn = alias & 0xff;
+
+ return for_each_device_domain(&search_pasid_table, data);
+}
+
+/*
+ * Allocate a pasid table for @dev. It should be called in a
+ * single-threaded context.
+ */
+int intel_pasid_alloc_table(struct device *dev)
+{
+ struct device_domain_info *info;
+ struct pasid_table *pasid_table;
+ struct pasid_table_opaque data;
+ struct page *pages;
+ int max_pasid = 0;
+ int ret, order;
+ int size;
+
+ might_sleep();
+ info = get_domain_info(dev);
+ if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table))
+ return -EINVAL;
+
+ /* DMA alias device already has a pasid table, use it: */
+ data.pasid_table = &pasid_table;
+ ret = pci_for_each_dma_alias(to_pci_dev(dev),
+ &get_alias_pasid_table, &data);
+ if (ret)
+ goto attach_out;
+
+ pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL);
+ if (!pasid_table)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&pasid_table->dev);
+
+ if (info->pasid_supported)
+ max_pasid = min_t(int, pci_max_pasids(to_pci_dev(dev)),
+ intel_pasid_max_id);
+
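+ /*
+ * Each 8-byte PASID directory entry covers 2^PASID_PDE_SHIFT PASIDs,
+ * so the directory needs max_pasid >> (PASID_PDE_SHIFT - 3) bytes.
+ */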
+ size = max_pasid >> (PASID_PDE_SHIFT - 3);
+ order = size ? get_order(size) : 0;
+ pages = alloc_pages_node(info->iommu->node,
+ GFP_KERNEL | __GFP_ZERO, order);
+ if (!pages) {
+ kfree(pasid_table);
+ return -ENOMEM;
+ }
+
+ pasid_table->table = page_address(pages);
+ pasid_table->order = order;
+ pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3);
+
+attach_out:
+ device_attach_pasid_table(info, pasid_table);
+
+ return 0;
+}
+
+void intel_pasid_free_table(struct device *dev)
+{
+ struct device_domain_info *info;
+ struct pasid_table *pasid_table;
+ struct pasid_dir_entry *dir;
+ struct pasid_entry *table;
+ int i, max_pde;
+
+ info = get_domain_info(dev);
+ if (!info || !dev_is_pci(dev) || !info->pasid_table)
+ return;
+
+ pasid_table = info->pasid_table;
+ device_detach_pasid_table(info, pasid_table);
+
+ if (!list_empty(&pasid_table->dev))
+ return;
+
+ /* Free scalable mode PASID directory tables: */
+ dir = pasid_table->table;
+ max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT;
+ for (i = 0; i < max_pde; i++) {
+ table = get_pasid_table_from_pde(&dir[i]);
+ free_pgtable_page(table);
+ }
+
+ free_pages((unsigned long)pasid_table->table, pasid_table->order);
+ kfree(pasid_table);
+}
+
+struct pasid_table *intel_pasid_get_table(struct device *dev)
+{
+ struct device_domain_info *info;
+
+ info = get_domain_info(dev);
+ if (!info)
+ return NULL;
+
+ return info->pasid_table;
+}
+
+int intel_pasid_get_dev_max_id(struct device *dev)
+{
+ struct device_domain_info *info;
+
+ info = get_domain_info(dev);
+ if (!info || !info->pasid_table)
+ return 0;
+
+ return info->pasid_table->max_pasid;
+}
+
+struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid)
+{
+ struct device_domain_info *info;
+ struct pasid_table *pasid_table;
+ struct pasid_dir_entry *dir;
+ struct pasid_entry *entries;
+ int dir_index, index;
+
+ pasid_table = intel_pasid_get_table(dev);
+ if (WARN_ON(!pasid_table || pasid < 0 ||
+ pasid >= intel_pasid_get_dev_max_id(dev)))
+ return NULL;
+
+ dir = pasid_table->table;
+ info = get_domain_info(dev);
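+ /*
+ * Two-level lookup: the upper PASID bits select a directory entry and
+ * the lower bits index into the leaf table that the entry points to.
+ */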
+ dir_index = pasid >> PASID_PDE_SHIFT;
+ index = pasid & PASID_PTE_MASK;
+
+ spin_lock(&pasid_lock);
+ entries = get_pasid_table_from_pde(&dir[dir_index]);
+ if (!entries) {
+ entries = alloc_pgtable_page(info->iommu->node);
+ if (!entries) {
+ spin_unlock(&pasid_lock);
+ return NULL;
+ }
+
+ WRITE_ONCE(dir[dir_index].val,
+ (u64)virt_to_phys(entries) | PASID_PTE_PRESENT);
+ }
+ spin_unlock(&pasid_lock);
+
+ return &entries[index];
+}
+
+/*
+ * Interfaces for PASID table entry manipulation:
+ */
+static inline void pasid_clear_entry(struct pasid_entry *pe)
+{
+ WRITE_ONCE(pe->val[0], 0);
+ WRITE_ONCE(pe->val[1], 0);
+ WRITE_ONCE(pe->val[2], 0);
+ WRITE_ONCE(pe->val[3], 0);
+ WRITE_ONCE(pe->val[4], 0);
+ WRITE_ONCE(pe->val[5], 0);
+ WRITE_ONCE(pe->val[6], 0);
+ WRITE_ONCE(pe->val[7], 0);
+}
+
+static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe)
+{
+ WRITE_ONCE(pe->val[0], PASID_PTE_FPD);
+ WRITE_ONCE(pe->val[1], 0);
+ WRITE_ONCE(pe->val[2], 0);
+ WRITE_ONCE(pe->val[3], 0);
+ WRITE_ONCE(pe->val[4], 0);
+ WRITE_ONCE(pe->val[5], 0);
+ WRITE_ONCE(pe->val[6], 0);
+ WRITE_ONCE(pe->val[7], 0);
+}
+
+static void
+intel_pasid_clear_entry(struct device *dev, int pasid, bool fault_ignore)
+{
+ struct pasid_entry *pe;
+
+ pe = intel_pasid_get_entry(dev, pasid);
+ if (WARN_ON(!pe))
+ return;
+
+ if (fault_ignore && pasid_pte_is_present(pe))
+ pasid_clear_entry_with_fpd(pe);
+ else
+ pasid_clear_entry(pe);
+}
+
+static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
+{
+ u64 old;
+
+ old = READ_ONCE(*ptr);
+ WRITE_ONCE(*ptr, (old & ~mask) | bits);
+}
+
+/*
+ * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode
+ * PASID entry.
+ */
+static inline void
+pasid_set_domain_id(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value);
+}
+
+/*
+ * Get domain ID value of a scalable mode PASID entry.
+ */
+static inline u16
+pasid_get_domain_id(struct pasid_entry *pe)
+{
+ return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0));
+}
+
+/*
+ * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63)
+ * of a scalable mode PASID entry.
+ */
+static inline void
+pasid_set_slptr(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value);
+}
+
+/*
+ * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID
+ * entry.
+ */
+static inline void
+pasid_set_address_width(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2);
+}
+
+/*
+ * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8)
+ * of a scalable mode PASID entry.
+ */
+static inline void
+pasid_set_translation_type(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6);
+}
+
+/*
+ * Enable fault processing by clearing the FPD(Fault Processing
+ * Disable) field (Bit 1) of a scalable mode PASID entry.
+ */
+static inline void pasid_set_fault_enable(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[0], 1 << 1, 0);
+}
+
+/*
+ * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a
+ * scalable mode PASID entry.
+ */
+static inline void pasid_set_sre(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[2], 1 << 0, 1);
+}
+
+/*
+ * Setup the P(Present) field (Bit 0) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_set_present(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[0], 1 << 0, 1);
+}
+
+/*
+ * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value)
+{
+ pasid_set_bits(&pe->val[1], 1 << 23, value << 23);
+}
+
+/*
+ * Setup the First Level Page table Pointer field (Bit 140~191)
+ * of a scalable mode PASID entry.
+ */
+static inline void
+pasid_set_flptr(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value);
+}
+
+/*
+ * Setup the First Level Paging Mode field (Bit 130~131) of a
+ * scalable mode PASID entry.
+ */
+static inline void
+pasid_set_flpm(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
+}
+
+/*
+ * Setup the Extended Access Flag Enable (EAFE) field (Bit 135)
+ * of a scalable mode PASID entry.
+ */
+static inline void
+pasid_set_eafe(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
+}
+
+static void
+pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
+ u16 did, int pasid)
+{
+ struct qi_desc desc;
+
+ desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) |
+ QI_PC_PASID(pasid) | QI_PC_TYPE;
+ desc.qw1 = 0;
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+static void
+iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid)
+{
+ struct qi_desc desc;
+
+ desc.qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE;
+ desc.qw1 = 0;
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+static void
+devtlb_invalidation_with_pasid(struct intel_iommu *iommu,
+ struct device *dev, int pasid)
+{
+ struct device_domain_info *info;
+ u16 sid, qdep, pfsid;
+
+ info = get_domain_info(dev);
+ if (!info || !info->ats_enabled)
+ return;
+
+ sid = info->bus << 8 | info->devfn;
+ qdep = info->ats_qdep;
+ pfsid = info->pfsid;
+
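+ /*
+ * Address 0 with the maximum mask (64 - VTD_PAGE_SHIFT) invalidates
+ * the device's entire IOTLB rather than a single page.
+ */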
+ qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT);
+}
+
+void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
+ int pasid, bool fault_ignore)
+{
+ struct pasid_entry *pte;
+ u16 did;
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (WARN_ON(!pte))
+ return;
+
+ did = pasid_get_domain_id(pte);
+ intel_pasid_clear_entry(dev, pasid, fault_ignore);
+
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(pte, sizeof(*pte));
+
+ pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+ iotlb_invalidation_with_pasid(iommu, did, pasid);
+
+ /* Device IOTLB doesn't need to be flushed in caching mode. */
+ if (!cap_caching_mode(iommu->cap))
+ devtlb_invalidation_with_pasid(iommu, dev, pasid);
+}
+
+static void pasid_flush_caches(struct intel_iommu *iommu,
+ struct pasid_entry *pte,
+ int pasid, u16 did)
+{
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(pte, sizeof(*pte));
+
+ if (cap_caching_mode(iommu->cap)) {
+ pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+ iotlb_invalidation_with_pasid(iommu, did, pasid);
+ } else {
+ iommu_flush_write_buffer(iommu);
+ }
+}
+
+/*
+ * Set up the scalable mode pasid table entry for first only
+ * translation type.
+ */
+int intel_pasid_setup_first_level(struct intel_iommu *iommu,
+ struct device *dev, pgd_t *pgd,
+ int pasid, u16 did, int flags)
+{
+ struct pasid_entry *pte;
+
+ if (!ecap_flts(iommu->ecap)) {
+ pr_err("No first level translation support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (WARN_ON(!pte))
+ return -EINVAL;
+
+ pasid_clear_entry(pte);
+
+ /* Setup the first level page table pointer: */
+ pasid_set_flptr(pte, (u64)__pa(pgd));
+ if (flags & PASID_FLAG_SUPERVISOR_MODE) {
+ if (!ecap_srs(iommu->ecap)) {
+ pr_err("No supervisor request support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+ pasid_set_sre(pte);
+ }
+
+ if (flags & PASID_FLAG_FL5LP) {
+ if (cap_5lp_support(iommu->cap)) {
+ pasid_set_flpm(pte, 1);
+ } else {
+ pr_err("No 5-level paging support for first-level\n");
+ pasid_clear_entry(pte);
+ return -EINVAL;
+ }
+ }
+
+ pasid_set_domain_id(pte, did);
+ pasid_set_address_width(pte, iommu->agaw);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+
+ /* Setup Present and PASID Granular Transfer Type: */
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
+ pasid_set_present(pte);
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return 0;
+}
+
+/*
+ * Skip top levels of page tables for iommu which has less agaw
+ * than default. Unnecessary for PT mode.
+ */
+static inline int iommu_skip_agaw(struct dmar_domain *domain,
+ struct intel_iommu *iommu,
+ struct dma_pte **pgd)
+{
+ int agaw;
+
+ for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
+ *pgd = phys_to_virt(dma_pte_addr(*pgd));
+ if (!dma_pte_present(*pgd))
+ return -EINVAL;
+ }
+
+ return agaw;
+}
+
+/*
+ * Set up the scalable mode pasid entry for second only translation type.
+ */
+int intel_pasid_setup_second_level(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, int pasid)
+{
+ struct pasid_entry *pte;
+ struct dma_pte *pgd;
+ u64 pgd_val;
+ int agaw;
+ u16 did;
+
+ /*
+ * If hardware advertises no support for second level
+ * translation, return directly.
+ */
+ if (!ecap_slts(iommu->ecap)) {
+ pr_err("No second level translation support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ pgd = domain->pgd;
+ agaw = iommu_skip_agaw(domain, iommu, &pgd);
+ if (agaw < 0) {
+ dev_err(dev, "Invalid domain page table\n");
+ return -EINVAL;
+ }
+
+ pgd_val = virt_to_phys(pgd);
+ did = domain->iommu_did[iommu->seq_id];
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ dev_err(dev, "Failed to get pasid entry of PASID %d\n", pasid);
+ return -ENODEV;
+ }
+
+ pasid_clear_entry(pte);
+ pasid_set_domain_id(pte, did);
+ pasid_set_slptr(pte, pgd_val);
+ pasid_set_address_width(pte, agaw);
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
+ pasid_set_fault_enable(pte);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+
+ /*
+ * Since it is a second level only translation setup, we should
+ * set SRE bit as well (addresses are expected to be GPAs).
+ */
+ pasid_set_sre(pte);
+ pasid_set_present(pte);
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return 0;
+}
+
+/*
+ * Set up the scalable mode pasid entry for passthrough translation type.
+ */
+int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, int pasid)
+{
+ u16 did = FLPT_DEFAULT_DID;
+ struct pasid_entry *pte;
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ dev_err(dev, "Failed to get pasid entry of PASID %d\n", pasid);
+ return -ENODEV;
+ }
+
+ pasid_clear_entry(pte);
+ pasid_set_domain_id(pte, did);
+ pasid_set_address_width(pte, iommu->agaw);
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT);
+ pasid_set_fault_enable(pte);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+
+ /*
+ * We should set SRE bit as well since the addresses are expected
+ * to be GPAs.
+ */
+ pasid_set_sre(pte);
+ pasid_set_present(pte);
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return 0;
+}
+
+static int
+intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte,
+ struct iommu_gpasid_bind_data_vtd *pasid_data)
+{
+ /*
+ * Not all guest PASID table entry fields are passed down during bind;
+ * here we only set up the ones that depend on guest settings.
+ * Execution related bits such as NXE, SMEP are not supported.
+ * Other fields, such as snoop related, are set based on host needs
+ * regardless of guest settings.
+ */
+ if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_SRE) {
+ if (!ecap_srs(iommu->ecap)) {
+ pr_err_ratelimited("No supervisor request support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+ pasid_set_sre(pte);
+ }
+
+ if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) {
+ if (!ecap_eafs(iommu->ecap)) {
+ pr_err_ratelimited("No extended access flag support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+ pasid_set_eafe(pte);
+ }
+
+ /*
+ * Memory type is only applicable to devices inside processor coherent
+ * domain. Will add MTS support once coherent devices are available.
+ */
+ if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_MTS_MASK) {
+ pr_warn_ratelimited("No memory type support %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
+ * This could be used for guest shared virtual address. In this case, the
+ * first level page tables are used for GVA-GPA translation in the guest,
+ * second level page tables are used for GPA-HPA translation.
+ *
+ * @iommu: IOMMU which the device belongs to
+ * @dev: Device to be set up for translation
+ * @gpgd: FLPTPTR: First Level Page translation pointer in GPA
+ * @pasid: PASID to be programmed in the device PASID table
+ * @pasid_data: Additional PASID info from the guest bind request
+ * @domain: Domain info for setting up second level page tables
+ * @addr_width: Address width of the first level (guest)
+ */
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+ pgd_t *gpgd, int pasid,
+ struct iommu_gpasid_bind_data_vtd *pasid_data,
+ struct dmar_domain *domain, int addr_width)
+{
+ struct pasid_entry *pte;
+ struct dma_pte *pgd;
+ int ret = 0;
+ u64 pgd_val;
+ int agaw;
+ u16 did;
+
+ if (!ecap_nest(iommu->ecap)) {
+ pr_err_ratelimited("IOMMU: %s: No nested translation support\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ if (!(domain->flags & DOMAIN_FLAG_NESTING_MODE)) {
+ pr_err_ratelimited("Domain is not in nesting mode, %x\n",
+ domain->flags);
+ return -EINVAL;
+ }
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (WARN_ON(!pte))
+ return -EINVAL;
+
+ /*
+ * Caller must ensure PASID entry is not in use, i.e. not bind the
+ * same PASID to the same device twice.
+ */
+ if (pasid_pte_is_present(pte))
+ return -EBUSY;
+
+ pasid_clear_entry(pte);
+
+ /*
+ * Sanity checking performed by the caller to make sure the address
+ * widths match in two dimensions:
+ * 1. CPU vs. IOMMU
+ * 2. Guest vs. Host.
+ */
+ switch (addr_width) {
+#ifdef CONFIG_X86
+ case ADDR_WIDTH_5LEVEL:
+ if (!cpu_feature_enabled(X86_FEATURE_LA57) ||
+ !cap_5lp_support(iommu->cap)) {
+ dev_err_ratelimited(dev,
+ "5-level paging not supported\n");
+ return -EINVAL;
+ }
+
+ pasid_set_flpm(pte, 1);
+ break;
+#endif
+ case ADDR_WIDTH_4LEVEL:
+ pasid_set_flpm(pte, 0);
+ break;
+ default:
+ dev_err_ratelimited(dev, "Invalid guest address width %d\n",
+ addr_width);
+ return -EINVAL;
+ }
+
+ /* First level PGD is in GPA, must be supported by the second level */
+ if ((uintptr_t)gpgd > domain->max_addr) {
+ dev_err_ratelimited(dev,
+ "Guest PGD %lx not supported, max %llx\n",
+ (uintptr_t)gpgd, domain->max_addr);
+ return -EINVAL;
+ }
+ pasid_set_flptr(pte, (uintptr_t)gpgd);
+
+ ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data);
+ if (ret)
+ return ret;
+
+ /* Setup the second level based on the given domain */
+ pgd = domain->pgd;
+
+ agaw = iommu_skip_agaw(domain, iommu, &pgd);
+ if (agaw < 0) {
+ dev_err_ratelimited(dev, "Invalid domain page table\n");
+ return -EINVAL;
+ }
+ pgd_val = virt_to_phys(pgd);
+ pasid_set_slptr(pte, pgd_val);
+ pasid_set_fault_enable(pte);
+
+ did = domain->iommu_did[iommu->seq_id];
+ pasid_set_domain_id(pte, did);
+
+ pasid_set_address_width(pte, agaw);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
+ pasid_set_present(pte);
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return ret;
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2015 Intel Corporation.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/intel-iommu.h>
+#include <linux/mmu_notifier.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/intel-svm.h>
+#include <linux/rculist.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+#include <linux/dmar.h>
+#include <linux/interrupt.h>
+#include <linux/mm_types.h>
+#include <linux/ioasid.h>
+#include <asm/page.h>
+
+#include "intel-pasid.h"
+
+static irqreturn_t prq_event_thread(int irq, void *d);
+static void intel_svm_drain_prq(struct device *dev, int pasid);
+
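+/*
+ * With PRQ_ORDER 0 the queue is a single 4KiB page, i.e. 128 32-byte
+ * page request descriptors.
+ */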
+#define PRQ_ORDER 0
+
+int intel_svm_enable_prq(struct intel_iommu *iommu)
+{
+ struct page *pages;
+ int irq, ret;
+
+ pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
+ if (!pages) {
+ pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
+ iommu->name);
+ return -ENOMEM;
+ }
+ iommu->prq = page_address(pages);
+
+ irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
+ if (irq <= 0) {
+ pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
+ iommu->name);
+ ret = -EINVAL;
+ err:
+ free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+ iommu->prq = NULL;
+ return ret;
+ }
+ iommu->pr_irq = irq;
+
+ snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
+
+ ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
+ iommu->prq_name, iommu);
+ if (ret) {
+ pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
+ iommu->name);
+ dmar_free_hwirq(irq);
+ iommu->pr_irq = 0;
+ goto err;
+ }
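+ /*
+ * Reset the head and tail pointers and program the queue base
+ * address; the low-order bits of PQA encode the queue size as a
+ * page order.
+ */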
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
+
+ init_completion(&iommu->prq_complete);
+
+ return 0;
+}
+
+int intel_svm_finish_prq(struct intel_iommu *iommu)
+{
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
+
+ if (iommu->pr_irq) {
+ free_irq(iommu->pr_irq, iommu);
+ dmar_free_hwirq(iommu->pr_irq);
+ iommu->pr_irq = 0;
+ }
+
+ free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+ iommu->prq = NULL;
+
+ return 0;
+}
+
+static inline bool intel_svm_capable(struct intel_iommu *iommu)
+{
+ return iommu->flags & VTD_FLAG_SVM_CAPABLE;
+}
+
+void intel_svm_check(struct intel_iommu *iommu)
+{
+ if (!pasid_supported(iommu))
+ return;
+
+ if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
+ !cap_fl1gp_support(iommu->cap)) {
+ pr_err("%s SVM disabled, incompatible 1GB page capability\n",
+ iommu->name);
+ return;
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+ !cap_5lp_support(iommu->cap)) {
+ pr_err("%s SVM disabled, incompatible paging mode\n",
+ iommu->name);
+ return;
+ }
+
+ iommu->flags |= VTD_FLAG_SVM_CAPABLE;
+}
+
+static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev,
+ unsigned long address, unsigned long pages, int ih)
+{
+ struct qi_desc desc;
+
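+ /* pages == -1 means invalidate the whole address space of this PASID. */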
+ if (pages == -1) {
+ desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
+ QI_EIOTLB_DID(sdev->did) |
+ QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
+ QI_EIOTLB_TYPE;
+ desc.qw1 = 0;
+ } else {
+ int mask = ilog2(__roundup_pow_of_two(pages));
+
+ desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
+ QI_EIOTLB_DID(sdev->did) |
+ QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
+ QI_EIOTLB_TYPE;
+ desc.qw1 = QI_EIOTLB_ADDR(address) |
+ QI_EIOTLB_IH(ih) |
+ QI_EIOTLB_AM(mask);
+ }
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+ qi_submit_sync(svm->iommu, &desc, 1, 0);
+
+ if (sdev->dev_iotlb) {
+ desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
+ QI_DEV_EIOTLB_SID(sdev->sid) |
+ QI_DEV_EIOTLB_QDEP(sdev->qdep) |
+ QI_DEIOTLB_TYPE;
+ if (pages == -1) {
+ desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
+ QI_DEV_EIOTLB_SIZE;
+ } else if (pages > 1) {
+ /* The least significant zero bit indicates the size. So,
+ * for example, an "address" value of 0x12345f000 will
+ * flush from 0x123440000 to 0x12347ffff (256KiB). */
+ unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
+ unsigned long mask = __rounddown_pow_of_two(address ^ last);
+
+ desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
+ (mask - 1)) | QI_DEV_EIOTLB_SIZE;
+ } else {
+ desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
+ }
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+ qi_submit_sync(svm->iommu, &desc, 1, 0);
+ }
+}
+
+static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
+ unsigned long pages, int ih)
+{
+ struct intel_svm_dev *sdev;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdev, &svm->devs, list)
+ intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
+ rcu_read_unlock();
+}
+
+/* Pages have been freed at this point */
+static void intel_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+
+ intel_flush_svm_range(svm, start,
+ (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
+}
+
+static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+ struct intel_svm_dev *sdev;
+
+ /* This might end up being called from exit_mmap(), *before* the page
+ * tables are cleared. And __mmu_notifier_release() will delete us from
+ * the list of notifiers so that our invalidate_range() callback doesn't
+ * get called when the page tables are cleared. So we need to protect
+ * against hardware accessing those page tables.
+ *
+ * We do it by clearing the entry in the PASID table and then flushing
+ * the IOTLB and the PASID table caches. This might upset hardware;
+ * perhaps we'll want to point the PASID to a dummy PGD (like the zero
+ * page) so that we end up taking a fault that the hardware really
+ * *has* to handle gracefully without affecting other processes.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdev, &svm->devs, list)
+ intel_pasid_tear_down_entry(svm->iommu, sdev->dev,
+ svm->pasid, true);
+ rcu_read_unlock();
+
+}
+
+static const struct mmu_notifier_ops intel_mmuops = {
+ .release = intel_mm_release,
+ .invalidate_range = intel_invalidate_range,
+};
+
+static DEFINE_MUTEX(pasid_mutex);
+static LIST_HEAD(global_svm_list);
+
+#define for_each_svm_dev(sdev, svm, d) \
+ list_for_each_entry((sdev), &(svm)->devs, list) \
+ if ((d) != (sdev)->dev) {} else
+
+int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
+ struct iommu_gpasid_bind_data *data)
+{
+ struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+ struct dmar_domain *dmar_domain;
+ struct intel_svm_dev *sdev;
+ struct intel_svm *svm;
+ int ret = 0;
+
+ if (WARN_ON(!iommu) || !data)
+ return -EINVAL;
+
+ if (data->version != IOMMU_GPASID_BIND_VERSION_1 ||
+ data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
+ return -EINVAL;
+
+ if (!dev_is_pci(dev))
+ return -ENOTSUPP;
+
+ /* VT-d supports devices with full 20 bit PASIDs only */
+ if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
+ return -EINVAL;
+
+ /*
+ * We only check the host PASID range; we have no way to check the
+ * guest PASID range.
+ */
+ if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
+ return -EINVAL;
+
+ dmar_domain = to_dmar_domain(domain);
+
+ mutex_lock(&pasid_mutex);
+ svm = ioasid_find(NULL, data->hpasid, NULL);
+ if (IS_ERR(svm)) {
+ ret = PTR_ERR(svm);
+ goto out;
+ }
+
+ if (svm) {
+ /*
+ * If we found an svm for the PASID, there must be at least one
+ * device bound to it; otherwise the svm should have been freed.
+ */
+ if (WARN_ON(list_empty(&svm->devs))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for_each_svm_dev(sdev, svm, dev) {
+ /*
+ * For devices with aux domains, we should allow
+ * multiple bind calls with the same PASID and pdev.
+ */
+ if (iommu_dev_feature_enabled(dev,
+ IOMMU_DEV_FEAT_AUX)) {
+ sdev->users++;
+ } else {
+ dev_warn_ratelimited(dev,
+ "Already bound with PASID %u\n",
+ svm->pasid);
+ ret = -EBUSY;
+ }
+ goto out;
+ }
+ } else {
+ /* We come here when the PASID has never been bound to a device. */
+ svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+ if (!svm) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* REVISIT: the upper layer/VFIO can track the host process that binds
+ * the PASID. ioasid_set = mm might be sufficient for VFIO to check
+ * PASID VMM ownership. We can drop the following line once the VFIO
+ * and IOASID set checks are in place.
+ */
+ svm->mm = get_task_mm(current);
+ svm->pasid = data->hpasid;
+ if (data->flags & IOMMU_SVA_GPASID_VAL) {
+ svm->gpasid = data->gpasid;
+ svm->flags |= SVM_FLAG_GUEST_PASID;
+ }
+ ioasid_set_data(data->hpasid, svm);
+ INIT_LIST_HEAD_RCU(&svm->devs);
+ mmput(svm->mm);
+ }
+ sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+ if (!sdev) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sdev->dev = dev;
+
+ /* Only count users if device has aux domains */
+ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
+ sdev->users = 1;
+
+ /* Set up device context entry for PASID if not enabled already */
+ ret = intel_iommu_enable_pasid(iommu, sdev->dev);
+ if (ret) {
+ dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
+ kfree(sdev);
+ goto out;
+ }
+
+ /*
+ * PASID table is per device for better security. Therefore, for
+ * each bind of a new device even with an existing PASID, we need to
+ * call the nested mode setup function here.
+ */
+ spin_lock(&iommu->lock);
+ ret = intel_pasid_setup_nested(iommu, dev,
+ (pgd_t *)(uintptr_t)data->gpgd,
+ data->hpasid, &data->vtd, dmar_domain,
+ data->addr_width);
+ spin_unlock(&iommu->lock);
+ if (ret) {
+ dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
+ data->hpasid, ret);
+ /*
+ * PASID entry should be in cleared state if nested mode
+ * set up failed. So we only need to clear IOASID tracking
+ * data such that free call will succeed.
+ */
+ kfree(sdev);
+ goto out;
+ }
+
+ svm->flags |= SVM_FLAG_GUEST_MODE;
+
+ init_rcu_head(&sdev->rcu);
+ list_add_rcu(&sdev->list, &svm->devs);
+ out:
+ if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
+ ioasid_set_data(data->hpasid, NULL);
+ kfree(svm);
+ }
+
+ mutex_unlock(&pasid_mutex);
+ return ret;
+}
+
+int intel_svm_unbind_gpasid(struct device *dev, int pasid)
+{
+ struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+ struct intel_svm_dev *sdev;
+ struct intel_svm *svm;
+ int ret = -EINVAL;
+
+ if (WARN_ON(!iommu))
+ return -EINVAL;
+
+ mutex_lock(&pasid_mutex);
+ svm = ioasid_find(NULL, pasid, NULL);
+ if (!svm) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (IS_ERR(svm)) {
+ ret = PTR_ERR(svm);
+ goto out;
+ }
+
+ for_each_svm_dev(sdev, svm, dev) {
+ ret = 0;
+ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
+ sdev->users--;
+ if (!sdev->users) {
+ list_del_rcu(&sdev->list);
+ intel_pasid_tear_down_entry(iommu, dev,
+ svm->pasid, false);
+ intel_svm_drain_prq(dev, svm->pasid);
+ kfree_rcu(sdev, rcu);
+
+ if (list_empty(&svm->devs)) {
+ /*
+ * We do not free the IOASID here because the
+ * IOMMU driver did not allocate it. Unlike
+ * native SVM, the IOASID for guest use was
+ * allocated prior to the bind call. In any
+ * case, if the free call comes before the
+ * unbind, the IOMMU driver will get notified
+ * and perform the cleanup.
+ */
+ ioasid_set_data(pasid, NULL);
+ kfree(svm);
+ }
+ }
+ break;
+ }
+out:
+ mutex_unlock(&pasid_mutex);
+ return ret;
+}
+
+/* Caller must hold pasid_mutex, mm reference */
+static int
+intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops,
+ struct mm_struct *mm, struct intel_svm_dev **sd)
+{
+ struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+ struct device_domain_info *info;
+ struct intel_svm_dev *sdev;
+ struct intel_svm *svm = NULL;
+ int pasid_max;
+ int ret;
+
+ if (!iommu || dmar_disabled)
+ return -EINVAL;
+
+ if (!intel_svm_capable(iommu))
+ return -ENOTSUPP;
+
+ if (dev_is_pci(dev)) {
+ pasid_max = pci_max_pasids(to_pci_dev(dev));
+ if (pasid_max < 0)
+ return -EINVAL;
+ } else
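+ /* PASID is a 20-bit field, so this is the architectural maximum. */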
+ pasid_max = 1 << 20;
+
+ /* Binding a supervisor PASID should have mm == NULL */
+ if (flags & SVM_FLAG_SUPERVISOR_MODE) {
+ if (!ecap_srs(iommu->ecap) || mm) {
+ pr_err("Supervisor PASID requires SRS support and no user-provided mm\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
+ struct intel_svm *t;
+
+ list_for_each_entry(t, &global_svm_list, list) {
+ if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
+ continue;
+
+ svm = t;
+ if (svm->pasid >= pasid_max) {
+ dev_warn(dev,
+ "Limited PASID width. Cannot use existing PASID %d\n",
+ svm->pasid);
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ /* Find the matching device in svm list */
+ for_each_svm_dev(sdev, svm, dev) {
+ if (sdev->ops != ops) {
+ ret = -EBUSY;
+ goto out;
+ }
+ sdev->users++;
+ goto success;
+ }
+
+ break;
+ }
+ }
+
+ sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+ if (!sdev) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sdev->dev = dev;
+
+ ret = intel_iommu_enable_pasid(iommu, dev);
+ if (ret) {
+ kfree(sdev);
+ goto out;
+ }
+
+ info = get_domain_info(dev);
+ sdev->did = FLPT_DEFAULT_DID;
+ sdev->sid = PCI_DEVID(info->bus, info->devfn);
+ if (info->ats_enabled) {
+ sdev->dev_iotlb = 1;
+ sdev->qdep = info->ats_qdep;
+ if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
+ sdev->qdep = 0;
+ }
+
+ /* Finish the setup now we know we're keeping it */
+ sdev->users = 1;
+ sdev->ops = ops;
+ init_rcu_head(&sdev->rcu);
+
+ if (!svm) {
+ svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+ if (!svm) {
+ ret = -ENOMEM;
+ kfree(sdev);
+ goto out;
+ }
+ svm->iommu = iommu;
+
+ if (pasid_max > intel_pasid_max_id)
+ pasid_max = intel_pasid_max_id;
+
+ /* Do not use PASID 0, reserved for RID to PASID */
+ svm->pasid = ioasid_alloc(NULL, PASID_MIN,
+ pasid_max - 1, svm);
+ if (svm->pasid == INVALID_IOASID) {
+ kfree(svm);
+ kfree(sdev);
+ ret = -ENOSPC;
+ goto out;
+ }
+ svm->notifier.ops = &intel_mmuops;
+ svm->mm = mm;
+ svm->flags = flags;
+ INIT_LIST_HEAD_RCU(&svm->devs);
+ INIT_LIST_HEAD(&svm->list);
+ ret = -ENOMEM;
+ if (mm) {
+ ret = mmu_notifier_register(&svm->notifier, mm);
+ if (ret) {
+ ioasid_free(svm->pasid);
+ kfree(svm);
+ kfree(sdev);
+ goto out;
+ }
+ }
+
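+ /*
+ * A NULL mm means a supervisor (kernel) bind: use init_mm's page
+ * table and set the supervisor flag. The first-level paging mode
+ * must match the CPU, i.e. 5-level when LA57 is enabled.
+ */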
+ spin_lock(&iommu->lock);
+ ret = intel_pasid_setup_first_level(iommu, dev,
+ mm ? mm->pgd : init_mm.pgd,
+ svm->pasid, FLPT_DEFAULT_DID,
+ (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
+ (cpu_feature_enabled(X86_FEATURE_LA57) ?
+ PASID_FLAG_FL5LP : 0));
+ spin_unlock(&iommu->lock);
+ if (ret) {
+ if (mm)
+ mmu_notifier_unregister(&svm->notifier, mm);
+ ioasid_free(svm->pasid);
+ kfree(svm);
+ kfree(sdev);
+ goto out;
+ }
+
+ list_add_tail(&svm->list, &global_svm_list);
+ } else {
+ /*
+ * Binding a new device with existing PASID, need to setup
+ * the PASID entry.
+ */
+ spin_lock(&iommu->lock);
+ ret = intel_pasid_setup_first_level(iommu, dev,
+ mm ? mm->pgd : init_mm.pgd,
+ svm->pasid, FLPT_DEFAULT_DID,
+ (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
+ (cpu_feature_enabled(X86_FEATURE_LA57) ?
+ PASID_FLAG_FL5LP : 0));
+ spin_unlock(&iommu->lock);
+ if (ret) {
+ kfree(sdev);
+ goto out;
+ }
+ }
+ list_add_rcu(&sdev->list, &svm->devs);
+success:
+ sdev->pasid = svm->pasid;
+ sdev->sva.dev = dev;
+ if (sd)
+ *sd = sdev;
+ ret = 0;
+ out:
+ return ret;
+}
+
+/* Caller must hold pasid_mutex */
+static int intel_svm_unbind_mm(struct device *dev, int pasid)
+{
+ struct intel_svm_dev *sdev;
+ struct intel_iommu *iommu;
+ struct intel_svm *svm;
+ int ret = -EINVAL;
+
+ iommu = intel_svm_device_to_iommu(dev);
+ if (!iommu)
+ goto out;
+
+ svm = ioasid_find(NULL, pasid, NULL);
+ if (!svm)
+ goto out;
+
+ if (IS_ERR(svm)) {
+ ret = PTR_ERR(svm);
+ goto out;
+ }
+
+ for_each_svm_dev(sdev, svm, dev) {
+ ret = 0;
+ sdev->users--;
+ if (!sdev->users) {
+ list_del_rcu(&sdev->list);
+ /* Flush the PASID cache and IOTLB for this device.
+ * Note that we do depend on the hardware *not* using
+ * the PASID any more. Just as we depend on other
+ * devices never using PASIDs that they have no right
+ * to use. We have a *shared* PASID table, because it's
+ * large and has to be physically contiguous. So it's
+ * hard to be as defensive as we might like. */
+ intel_pasid_tear_down_entry(iommu, dev,
+ svm->pasid, false);
+ intel_svm_drain_prq(dev, svm->pasid);
+ kfree_rcu(sdev, rcu);
+
+ if (list_empty(&svm->devs)) {
+ ioasid_free(svm->pasid);
+ if (svm->mm)
+ mmu_notifier_unregister(&svm->notifier, svm->mm);
+ list_del(&svm->list);
+ /* We mandate that no page faults may be outstanding
+ * for the PASID when intel_svm_unbind_mm() is called.
+ * If that is not obeyed, subtle errors will happen.
+ * Let's make them less subtle... */
+ memset(svm, 0x6b, sizeof(*svm));
+ kfree(svm);
+ }
+ }
+ break;
+ }
+ out:
+
+ return ret;
+}
+
+/* Page request queue descriptor */
+struct page_req_dsc {
+ union {
+ struct {
+ u64 type:8;
+ u64 pasid_present:1;
+ u64 priv_data_present:1;
+ u64 rsvd:6;
+ u64 rid:16;
+ u64 pasid:20;
+ u64 exe_req:1;
+ u64 pm_req:1;
+ u64 rsvd2:10;
+ };
+ u64 qw_0;
+ };
+ union {
+ struct {
+ u64 rd_req:1;
+ u64 wr_req:1;
+ u64 lpig:1;
+ u64 prg_index:9;
+ u64 addr:52;
+ };
+ u64 qw_1;
+ };
+ u64 priv_data[2];
+};
+
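+/*
+ * The queue is (0x1000 << PRQ_ORDER) bytes of 32-byte descriptors; masking a
+ * head/tail offset with this keeps it within the ring on a descriptor boundary.
+ */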
+#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
+
+static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
+{
+ unsigned long requested = 0;
+
+ if (req->exe_req)
+ requested |= VM_EXEC;
+
+ if (req->rd_req)
+ requested |= VM_READ;
+
+ if (req->wr_req)
+ requested |= VM_WRITE;
+
+ return (requested & ~vma->vm_flags) != 0;
+}
+
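+/*
+ * An address is canonical when its upper bits are sign-extension copies of
+ * bit (__VIRTUAL_MASK_SHIFT); shifting the value up and arithmetic-shifting
+ * it back reproduces the original only in that case.
+ */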
+static bool is_canonical_address(u64 addr)
+{
+ int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+ long saddr = (long) addr;
+
+ return (((saddr << shift) >> shift) == saddr);
+}
+
+/**
+ * intel_svm_drain_prq - Drain page requests and responses for a pasid
+ * @dev: target device
+ * @pasid: pasid for draining
+ *
+ * Drain all pending page requests and responses related to @pasid in both
+ * software and hardware. This is supposed to be called after the device
+ * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
+ * and DevTLB have been invalidated.
+ *
+ * It waits until all pending page requests for @pasid in the page fault
+ * queue are completed by the prq handling thread. Then follow the steps
+ * described in VT-d spec CH7.10 to drain all page requests and page
+ * responses pending in the hardware.
+ */
+static void intel_svm_drain_prq(struct device *dev, int pasid)
+{
+ struct device_domain_info *info;
+ struct dmar_domain *domain;
+ struct intel_iommu *iommu;
+ struct qi_desc desc[3];
+ struct pci_dev *pdev;
+ int head, tail;
+ u16 sid, did;
+ int qdep;
+
+ info = get_domain_info(dev);
+ if (WARN_ON(!info || !dev_is_pci(dev)))
+ return;
+
+ if (!info->pri_enabled)
+ return;
+
+ iommu = info->iommu;
+ domain = info->domain;
+ pdev = to_pci_dev(dev);
+ sid = PCI_DEVID(info->bus, info->devfn);
+ did = domain->iommu_did[iommu->seq_id];
+ qdep = pci_ats_queue_depth(pdev);
+
+ /*
+ * Check and wait until all pending page requests in the queue are
+ * handled by the prq handling thread.
+ */
+prq_retry:
+ reinit_completion(&iommu->prq_complete);
+ tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+ head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+ while (head != tail) {
+ struct page_req_dsc *req;
+
+ req = &iommu->prq[head / sizeof(*req)];
+ if (!req->pasid_present || req->pasid != pasid) {
+ head = (head + sizeof(*req)) & PRQ_RING_MASK;
+ continue;
+ }
+
+ wait_for_completion(&iommu->prq_complete);
+ goto prq_retry;
+ }
+
+ /*
+ * Perform steps described in VT-d spec CH7.10 to drain page
+ * requests and responses in hardware.
+ */
+ memset(desc, 0, sizeof(desc));
+ desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
+ QI_IWD_FENCE |
+ QI_IWD_TYPE;
+ desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
+ QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
+ QI_EIOTLB_TYPE;
+ desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
+ QI_DEV_EIOTLB_SID(sid) |
+ QI_DEV_EIOTLB_QDEP(qdep) |
+ QI_DEIOTLB_TYPE |
+ QI_DEV_IOTLB_PFSID(info->pfsid);
+qi_retry:
+ reinit_completion(&iommu->prq_complete);
+ qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
+ if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
+ wait_for_completion(&iommu->prq_complete);
+ goto qi_retry;
+ }
+}
+
+static irqreturn_t prq_event_thread(int irq, void *d)
+{
+ struct intel_iommu *iommu = d;
+ struct intel_svm *svm = NULL;
+ int head, tail, handled = 0;
+
+ /* Clear PPR bit before reading head/tail registers, to
+ * ensure that we get a new interrupt if needed. */
+ writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
+
+ tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+ head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+ while (head != tail) {
+ struct intel_svm_dev *sdev;
+ struct vm_area_struct *vma;
+ struct page_req_dsc *req;
+ struct qi_desc resp;
+ int result;
+ vm_fault_t ret;
+ u64 address;
+
+ handled = 1;
+
+ req = &iommu->prq[head / sizeof(*req)];
+
+ result = QI_RESP_FAILURE;
+ address = (u64)req->addr << VTD_PAGE_SHIFT;
+ if (!req->pasid_present) {
+ pr_err("%s: Page request without PASID: %08llx %08llx\n",
+ iommu->name, ((unsigned long long *)req)[0],
+ ((unsigned long long *)req)[1]);
+ goto no_pasid;
+ }
+
+ if (!svm || svm->pasid != req->pasid) {
+ rcu_read_lock();
+ svm = ioasid_find(NULL, req->pasid, NULL);
+ /* It *can't* go away, because the driver is not permitted
+ * to unbind the mm while any page faults are outstanding.
+ * So we only need RCU to protect the internal idr code. */
+ rcu_read_unlock();
+ if (IS_ERR_OR_NULL(svm)) {
+ pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
+ iommu->name, req->pasid, ((unsigned long long *)req)[0],
+ ((unsigned long long *)req)[1]);
+ goto no_pasid;
+ }
+ }
+
+ result = QI_RESP_INVALID;
+ /* Since we're using init_mm.pgd directly, we should never take
+ * any faults on kernel addresses. */
+ if (!svm->mm)
+ goto bad_req;
+
+ /* If address is not canonical, return invalid response */
+ if (!is_canonical_address(address))
+ goto bad_req;
+
+ /* If the mm is already defunct, don't handle faults. */
+ if (!mmget_not_zero(svm->mm))
+ goto bad_req;
+
+ down_read(&svm->mm->mmap_sem);
+ vma = find_extend_vma(svm->mm, address);
+ if (!vma || address < vma->vm_start)
+ goto invalid;
+
+ if (access_error(vma, req))
+ goto invalid;
+
+ ret = handle_mm_fault(vma, address,
+ req->wr_req ? FAULT_FLAG_WRITE : 0);
+ if (ret & VM_FAULT_ERROR)
+ goto invalid;
+
+ result = QI_RESP_SUCCESS;
+ invalid:
+ up_read(&svm->mm->mmap_sem);
+ mmput(svm->mm);
+ bad_req:
+ /* Accounting for major/minor faults? */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdev, &svm->devs, list) {
+ if (sdev->sid == req->rid)
+ break;
+ }
+ /* Other devices can go away, but the drivers are not permitted
+ * to unbind while any page faults might be in flight. So it's
+ * OK to drop the 'lock' here now we have it. */
+ rcu_read_unlock();
+
+ if (WARN_ON(&sdev->list == &svm->devs))
+ sdev = NULL;
+
+ if (sdev && sdev->ops && sdev->ops->fault_cb) {
+ int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
+ (req->exe_req << 1) | (req->pm_req);
+ sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
+ req->priv_data, rwxp, result);
+ }
+ /* We get here in the error case where the PASID lookup failed,
+ and these can be NULL. Do not use them below this point! */
+ sdev = NULL;
+ svm = NULL;
+ no_pasid:
+ if (req->lpig || req->priv_data_present) {
+ /*
+ * Per VT-d spec. v3.0 ch7.7, system software must
+ * respond with page group response if private data
+ * is present (PDP) or last page in group (LPIG) bit
+ * is set. This is an additional VT-d feature beyond
+ * PCI ATS spec.
+ */
+ resp.qw0 = QI_PGRP_PASID(req->pasid) |
+ QI_PGRP_DID(req->rid) |
+ QI_PGRP_PASID_P(req->pasid_present) |
+ QI_PGRP_PDP(req->pasid_present) |
+ QI_PGRP_RESP_CODE(result) |
+ QI_PGRP_RESP_TYPE;
+ resp.qw1 = QI_PGRP_IDX(req->prg_index) |
+ QI_PGRP_LPIG(req->lpig);
+
+ /* Private data, when present, must be returned unchanged. */
+ if (req->priv_data_present) {
+ memcpy(&resp.qw2, req->priv_data,
+ sizeof(req->priv_data));
+ } else {
+ resp.qw2 = 0;
+ resp.qw3 = 0;
+ }
+ qi_submit_sync(iommu, &resp, 1, 0);
+ }
+ head = (head + sizeof(*req)) & PRQ_RING_MASK;
+ }
+
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
+
+ /*
+ * Clear the page request overflow bit and wake up all threads that
+ * are waiting for the completion of this handling.
+ */
+ if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
+ writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
+
+ if (!completion_done(&iommu->prq_complete))
+ complete(&iommu->prq_complete);
+
+ return IRQ_RETVAL(handled);
+}
+
+#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
+struct iommu_sva *
+intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+{
+ struct iommu_sva *sva = ERR_PTR(-EINVAL);
+ struct intel_svm_dev *sdev = NULL;
+ int flags = 0;
+ int ret;
+
+ /*
+ * TODO: Consolidate with generic iommu-sva bind after it is merged.
+ * It will require shared SVM data structures, i.e. combine io_mm
+ * and intel_svm etc.
+ */
+ if (drvdata)
+ flags = *(int *)drvdata;
+ mutex_lock(&pasid_mutex);
+ ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
+ if (ret)
+ sva = ERR_PTR(ret);
+ else if (sdev)
+ sva = &sdev->sva;
+ else
+ WARN(!sdev, "SVM bind succeeded with no sdev!\n");
+
+ mutex_unlock(&pasid_mutex);
+
+ return sva;
+}
+
+void intel_svm_unbind(struct iommu_sva *sva)
+{
+ struct intel_svm_dev *sdev;
+
+ mutex_lock(&pasid_mutex);
+ sdev = to_intel_svm_dev(sva);
+ intel_svm_unbind_mm(sdev->dev, sdev->pasid);
+ mutex_unlock(&pasid_mutex);
+}
+
+int intel_svm_get_pasid(struct iommu_sva *sva)
+{
+ struct intel_svm_dev *sdev;
+ int pasid;
+
+ mutex_lock(&pasid_mutex);
+ sdev = to_intel_svm_dev(sva);
+ pasid = sdev->pasid;
+ mutex_unlock(&pasid_mutex);
+
+ return pasid;
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel IOMMU trace support
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/intel_iommu.h>
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-
-#define pr_fmt(fmt) "DMAR-IR: " fmt
-
-#include <linux/interrupt.h>
-#include <linux/dmar.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/jiffies.h>
-#include <linux/hpet.h>
-#include <linux/pci.h>
-#include <linux/irq.h>
-#include <linux/intel-iommu.h>
-#include <linux/acpi.h>
-#include <linux/irqdomain.h>
-#include <linux/crash_dump.h>
-#include <asm/io_apic.h>
-#include <asm/smp.h>
-#include <asm/cpu.h>
-#include <asm/irq_remapping.h>
-#include <asm/pci-direct.h>
-#include <asm/msidef.h>
-
-#include "irq_remapping.h"
-
-enum irq_mode {
- IRQ_REMAPPING,
- IRQ_POSTING,
-};
-
-struct ioapic_scope {
- struct intel_iommu *iommu;
- unsigned int id;
- unsigned int bus; /* PCI bus number */
- unsigned int devfn; /* PCI devfn number */
-};
-
-struct hpet_scope {
- struct intel_iommu *iommu;
- u8 id;
- unsigned int bus;
- unsigned int devfn;
-};
-
-struct irq_2_iommu {
- struct intel_iommu *iommu;
- u16 irte_index;
- u16 sub_handle;
- u8 irte_mask;
- enum irq_mode mode;
-};
-
-struct intel_ir_data {
- struct irq_2_iommu irq_2_iommu;
- struct irte irte_entry;
- union {
- struct msi_msg msi_entry;
- };
-};
-
-#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
-#define IRTE_DEST(dest) ((eim_mode) ? dest : dest << 8)
-
-static int __read_mostly eim_mode;
-static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
-static struct hpet_scope ir_hpet[MAX_HPET_TBS];
-
-/*
- * Lock ordering:
- * ->dmar_global_lock
- * ->irq_2_ir_lock
- * ->qi->q_lock
- * ->iommu->register_lock
- * Note:
- * intel_irq_remap_ops.{supported,prepare,enable,disable,reenable} are called
- * in single-threaded environment with interrupt disabled, so no need to tabke
- * the dmar_global_lock.
- */
-DEFINE_RAW_SPINLOCK(irq_2_ir_lock);
-static const struct irq_domain_ops intel_ir_domain_ops;
-
-static void iommu_disable_irq_remapping(struct intel_iommu *iommu);
-static int __init parse_ioapics_under_ir(void);
-
-static bool ir_pre_enabled(struct intel_iommu *iommu)
-{
- return (iommu->flags & VTD_FLAG_IRQ_REMAP_PRE_ENABLED);
-}
-
-static void clear_ir_pre_enabled(struct intel_iommu *iommu)
-{
- iommu->flags &= ~VTD_FLAG_IRQ_REMAP_PRE_ENABLED;
-}
-
-static void init_ir_status(struct intel_iommu *iommu)
-{
- u32 gsts;
-
- gsts = readl(iommu->reg + DMAR_GSTS_REG);
- if (gsts & DMA_GSTS_IRES)
- iommu->flags |= VTD_FLAG_IRQ_REMAP_PRE_ENABLED;
-}
-
-static int alloc_irte(struct intel_iommu *iommu,
- struct irq_2_iommu *irq_iommu, u16 count)
-{
- struct ir_table *table = iommu->ir_table;
- unsigned int mask = 0;
- unsigned long flags;
- int index;
-
- if (!count || !irq_iommu)
- return -1;
-
- if (count > 1) {
- count = __roundup_pow_of_two(count);
- mask = ilog2(count);
- }
-
- if (mask > ecap_max_handle_mask(iommu->ecap)) {
- pr_err("Requested mask %x exceeds the max invalidation handle"
- " mask value %Lx\n", mask,
- ecap_max_handle_mask(iommu->ecap));
- return -1;
- }
-
- raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
- index = bitmap_find_free_region(table->bitmap,
- INTR_REMAP_TABLE_ENTRIES, mask);
- if (index < 0) {
- pr_warn("IR%d: can't allocate an IRTE\n", iommu->seq_id);
- } else {
- irq_iommu->iommu = iommu;
- irq_iommu->irte_index = index;
- irq_iommu->sub_handle = 0;
- irq_iommu->irte_mask = mask;
- irq_iommu->mode = IRQ_REMAPPING;
- }
- raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
- return index;
-}
-
-static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
-{
- struct qi_desc desc;
-
- desc.qw0 = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask)
- | QI_IEC_SELECTIVE;
- desc.qw1 = 0;
- desc.qw2 = 0;
- desc.qw3 = 0;
-
- return qi_submit_sync(iommu, &desc, 1, 0);
-}
-
-static int modify_irte(struct irq_2_iommu *irq_iommu,
- struct irte *irte_modified)
-{
- struct intel_iommu *iommu;
- unsigned long flags;
- struct irte *irte;
- int rc, index;
-
- if (!irq_iommu)
- return -1;
-
- raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
-
- iommu = irq_iommu->iommu;
-
- index = irq_iommu->irte_index + irq_iommu->sub_handle;
- irte = &iommu->ir_table->base[index];
-
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE)
- if ((irte->pst == 1) || (irte_modified->pst == 1)) {
- bool ret;
-
- ret = cmpxchg_double(&irte->low, &irte->high,
- irte->low, irte->high,
- irte_modified->low, irte_modified->high);
- /*
- * We use cmpxchg16 to atomically update the 128-bit IRTE,
- * and it cannot be updated by the hardware or other processors
- * behind us, so the return value of cmpxchg16 should be the
- * same as the old value.
- */
- WARN_ON(!ret);
- } else
-#endif
- {
- set_64bit(&irte->low, irte_modified->low);
- set_64bit(&irte->high, irte_modified->high);
- }
- __iommu_flush_cache(iommu, irte, sizeof(*irte));
-
- rc = qi_flush_iec(iommu, index, 0);
-
- /* Update iommu mode according to the IRTE mode */
- irq_iommu->mode = irte->pst ? IRQ_POSTING : IRQ_REMAPPING;
- raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
- return rc;
-}
-
-static struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
-{
- int i;
-
- for (i = 0; i < MAX_HPET_TBS; i++)
- if (ir_hpet[i].id == hpet_id && ir_hpet[i].iommu)
- return ir_hpet[i].iommu;
- return NULL;
-}
-
-static struct intel_iommu *map_ioapic_to_ir(int apic)
-{
- int i;
-
- for (i = 0; i < MAX_IO_APICS; i++)
- if (ir_ioapic[i].id == apic && ir_ioapic[i].iommu)
- return ir_ioapic[i].iommu;
- return NULL;
-}
-
-static struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
-{
- struct dmar_drhd_unit *drhd;
-
- drhd = dmar_find_matched_drhd_unit(dev);
- if (!drhd)
- return NULL;
-
- return drhd->iommu;
-}
-
-static int clear_entries(struct irq_2_iommu *irq_iommu)
-{
- struct irte *start, *entry, *end;
- struct intel_iommu *iommu;
- int index;
-
- if (irq_iommu->sub_handle)
- return 0;
-
- iommu = irq_iommu->iommu;
- index = irq_iommu->irte_index;
-
- start = iommu->ir_table->base + index;
- end = start + (1 << irq_iommu->irte_mask);
-
- for (entry = start; entry < end; entry++) {
- set_64bit(&entry->low, 0);
- set_64bit(&entry->high, 0);
- }
- bitmap_release_region(iommu->ir_table->bitmap, index,
- irq_iommu->irte_mask);
-
- return qi_flush_iec(iommu, index, irq_iommu->irte_mask);
-}
-
-/*
- * source validation type
- */
-#define SVT_NO_VERIFY 0x0 /* no verification is required */
-#define SVT_VERIFY_SID_SQ 0x1 /* verify using SID and SQ fields */
-#define SVT_VERIFY_BUS 0x2 /* verify bus of request-id */
-
-/*
- * source-id qualifier
- */
-#define SQ_ALL_16 0x0 /* verify all 16 bits of request-id */
-#define SQ_13_IGNORE_1 0x1 /* verify most significant 13 bits, ignore
- * the third least significant bit
- */
-#define SQ_13_IGNORE_2 0x2 /* verify most significant 13 bits, ignore
- * the second and third least significant bits
- */
-#define SQ_13_IGNORE_3 0x3 /* verify most significant 13 bits, ignore
- * the least three significant bits
- */
-
-/*
- * set SVT, SQ and SID fields of irte to verify
- * source ids of interrupt requests
- */
-static void set_irte_sid(struct irte *irte, unsigned int svt,
- unsigned int sq, unsigned int sid)
-{
- if (disable_sourceid_checking)
- svt = SVT_NO_VERIFY;
- irte->svt = svt;
- irte->sq = sq;
- irte->sid = sid;
-}
-
-/*
- * Set an IRTE to match only the bus number. Interrupt requests that reference
- * this IRTE must have a requester-id whose bus number is between or equal
- * to the start_bus and end_bus arguments.
- */
-static void set_irte_verify_bus(struct irte *irte, unsigned int start_bus,
- unsigned int end_bus)
-{
- set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16,
- (start_bus << 8) | end_bus);
-}
-
-static int set_ioapic_sid(struct irte *irte, int apic)
-{
- int i;
- u16 sid = 0;
-
- if (!irte)
- return -1;
-
- down_read(&dmar_global_lock);
- for (i = 0; i < MAX_IO_APICS; i++) {
- if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) {
- sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
- break;
- }
- }
- up_read(&dmar_global_lock);
-
- if (sid == 0) {
- pr_warn("Failed to set source-id of IOAPIC (%d)\n", apic);
- return -1;
- }
-
- set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, sid);
-
- return 0;
-}
-
-static int set_hpet_sid(struct irte *irte, u8 id)
-{
- int i;
- u16 sid = 0;
-
- if (!irte)
- return -1;
-
- down_read(&dmar_global_lock);
- for (i = 0; i < MAX_HPET_TBS; i++) {
- if (ir_hpet[i].iommu && ir_hpet[i].id == id) {
- sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
- break;
- }
- }
- up_read(&dmar_global_lock);
-
- if (sid == 0) {
- pr_warn("Failed to set source-id of HPET block (%d)\n", id);
- return -1;
- }
-
- /*
- * Should really use SQ_ALL_16. Some platforms are broken.
- * While we figure out the right quirks for these broken platforms, use
- * SQ_13_IGNORE_3 for now.
- */
- set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid);
-
- return 0;
-}
-
-struct set_msi_sid_data {
- struct pci_dev *pdev;
- u16 alias;
- int count;
- int busmatch_count;
-};
-
-static int set_msi_sid_cb(struct pci_dev *pdev, u16 alias, void *opaque)
-{
- struct set_msi_sid_data *data = opaque;
-
- if (data->count == 0 || PCI_BUS_NUM(alias) == PCI_BUS_NUM(data->alias))
- data->busmatch_count++;
-
- data->pdev = pdev;
- data->alias = alias;
- data->count++;
-
- return 0;
-}
-
-static int set_msi_sid(struct irte *irte, struct pci_dev *dev)
-{
- struct set_msi_sid_data data;
-
- if (!irte || !dev)
- return -1;
-
- data.count = 0;
- data.busmatch_count = 0;
- pci_for_each_dma_alias(dev, set_msi_sid_cb, &data);
-
- /*
- * DMA alias provides us with a PCI device and alias. The only case
- * where the it will return an alias on a different bus than the
- * device is the case of a PCIe-to-PCI bridge, where the alias is for
- * the subordinate bus. In this case we can only verify the bus.
- *
-	 * If there are multiple aliases, all with the same bus number,
-	 * then all we can do is verify the bus. This is typical in NTB
-	 * hardware, which uses proxy IDs where the device will generate
-	 * traffic from multiple devfn numbers on the same bus.
- *
- * If the alias device is on a different bus than our source device
- * then we have a topology based alias, use it.
- *
- * Otherwise, the alias is for a device DMA quirk and we cannot
- * assume that MSI uses the same requester ID. Therefore use the
- * original device.
- */
- if (PCI_BUS_NUM(data.alias) != data.pdev->bus->number)
- set_irte_verify_bus(irte, PCI_BUS_NUM(data.alias),
- dev->bus->number);
- else if (data.count >= 2 && data.busmatch_count == data.count)
- set_irte_verify_bus(irte, dev->bus->number, dev->bus->number);
- else if (data.pdev->bus->number != dev->bus->number)
- set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, data.alias);
- else
- set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
- pci_dev_id(dev));
-
- return 0;
-}
-
-static int iommu_load_old_irte(struct intel_iommu *iommu)
-{
- struct irte *old_ir_table;
- phys_addr_t irt_phys;
- unsigned int i;
- size_t size;
- u64 irta;
-
- /* Check whether the old ir-table has the same size as ours */
- irta = dmar_readq(iommu->reg + DMAR_IRTA_REG);
- if ((irta & INTR_REMAP_TABLE_REG_SIZE_MASK)
- != INTR_REMAP_TABLE_REG_SIZE)
- return -EINVAL;
-
- irt_phys = irta & VTD_PAGE_MASK;
- size = INTR_REMAP_TABLE_ENTRIES*sizeof(struct irte);
-
- /* Map the old IR table */
- old_ir_table = memremap(irt_phys, size, MEMREMAP_WB);
- if (!old_ir_table)
- return -ENOMEM;
-
- /* Copy data over */
- memcpy(iommu->ir_table->base, old_ir_table, size);
-
- __iommu_flush_cache(iommu, iommu->ir_table->base, size);
-
- /*
- * Now check the table for used entries and mark those as
- * allocated in the bitmap
- */
- for (i = 0; i < INTR_REMAP_TABLE_ENTRIES; i++) {
- if (iommu->ir_table->base[i].present)
- bitmap_set(iommu->ir_table->bitmap, i, 1);
- }
-
- memunmap(old_ir_table);
-
- return 0;
-}
-
-
-static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode)
-{
- unsigned long flags;
- u64 addr;
- u32 sts;
-
- addr = virt_to_phys((void *)iommu->ir_table->base);
-
- raw_spin_lock_irqsave(&iommu->register_lock, flags);
-
- dmar_writeq(iommu->reg + DMAR_IRTA_REG,
- (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE);
-
- /* Set interrupt-remapping table pointer */
- writel(iommu->gcmd | DMA_GCMD_SIRTP, iommu->reg + DMAR_GCMD_REG);
-
- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
- readl, (sts & DMA_GSTS_IRTPS), sts);
- raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-
- /*
- * Global invalidation of interrupt entry cache to make sure the
- * hardware uses the new irq remapping table.
- */
- qi_global_iec(iommu);
-}
-
-static void iommu_enable_irq_remapping(struct intel_iommu *iommu)
-{
- unsigned long flags;
- u32 sts;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flags);
-
- /* Enable interrupt-remapping */
- iommu->gcmd |= DMA_GCMD_IRE;
- iommu->gcmd &= ~DMA_GCMD_CFI; /* Block compatibility-format MSIs */
- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
- readl, (sts & DMA_GSTS_IRES), sts);
-
- /*
- * With CFI clear in the Global Command register, we should be
- * protected from dangerous (i.e. compatibility) interrupts
- * regardless of x2apic status. Check just to be sure.
- */
- if (sts & DMA_GSTS_CFIS)
- WARN(1, KERN_WARNING
- "Compatibility-format IRQs enabled despite intr remapping;\n"
- "you are vulnerable to IRQ injection.\n");
-
- raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-static int intel_setup_irq_remapping(struct intel_iommu *iommu)
-{
- struct ir_table *ir_table;
- struct fwnode_handle *fn;
- unsigned long *bitmap;
- struct page *pages;
-
- if (iommu->ir_table)
- return 0;
-
- ir_table = kzalloc(sizeof(struct ir_table), GFP_KERNEL);
- if (!ir_table)
- return -ENOMEM;
-
- pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO,
- INTR_REMAP_PAGE_ORDER);
- if (!pages) {
- pr_err("IR%d: failed to allocate pages of order %d\n",
- iommu->seq_id, INTR_REMAP_PAGE_ORDER);
- goto out_free_table;
- }
-
- bitmap = bitmap_zalloc(INTR_REMAP_TABLE_ENTRIES, GFP_ATOMIC);
- if (bitmap == NULL) {
- pr_err("IR%d: failed to allocate bitmap\n", iommu->seq_id);
- goto out_free_pages;
- }
-
- fn = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id);
- if (!fn)
- goto out_free_bitmap;
-
- iommu->ir_domain =
- irq_domain_create_hierarchy(arch_get_ir_parent_domain(),
- 0, INTR_REMAP_TABLE_ENTRIES,
- fn, &intel_ir_domain_ops,
- iommu);
- irq_domain_free_fwnode(fn);
- if (!iommu->ir_domain) {
- pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id);
- goto out_free_bitmap;
- }
- iommu->ir_msi_domain =
- arch_create_remap_msi_irq_domain(iommu->ir_domain,
- "INTEL-IR-MSI",
- iommu->seq_id);
-
- ir_table->base = page_address(pages);
- ir_table->bitmap = bitmap;
- iommu->ir_table = ir_table;
-
-	/*
-	 * If queued invalidation is already initialized, we should not
-	 * disable it.
-	 */
- if (!iommu->qi) {
- /*
- * Clear previous faults.
- */
- dmar_fault(-1, iommu);
- dmar_disable_qi(iommu);
-
- if (dmar_enable_qi(iommu)) {
- pr_err("Failed to enable queued invalidation\n");
- goto out_free_bitmap;
- }
- }
-
- init_ir_status(iommu);
-
- if (ir_pre_enabled(iommu)) {
- if (!is_kdump_kernel()) {
- pr_warn("IRQ remapping was enabled on %s but we are not in kdump mode\n",
- iommu->name);
- clear_ir_pre_enabled(iommu);
- iommu_disable_irq_remapping(iommu);
- } else if (iommu_load_old_irte(iommu))
- pr_err("Failed to copy IR table for %s from previous kernel\n",
- iommu->name);
- else
- pr_info("Copied IR table for %s from previous kernel\n",
- iommu->name);
- }
-
- iommu_set_irq_remapping(iommu, eim_mode);
-
- return 0;
-
-out_free_bitmap:
- bitmap_free(bitmap);
-out_free_pages:
- __free_pages(pages, INTR_REMAP_PAGE_ORDER);
-out_free_table:
- kfree(ir_table);
-
- iommu->ir_table = NULL;
-
- return -ENOMEM;
-}
-
-static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
-{
- if (iommu && iommu->ir_table) {
- if (iommu->ir_msi_domain) {
- irq_domain_remove(iommu->ir_msi_domain);
- iommu->ir_msi_domain = NULL;
- }
- if (iommu->ir_domain) {
- irq_domain_remove(iommu->ir_domain);
- iommu->ir_domain = NULL;
- }
- free_pages((unsigned long)iommu->ir_table->base,
- INTR_REMAP_PAGE_ORDER);
- bitmap_free(iommu->ir_table->bitmap);
- kfree(iommu->ir_table);
- iommu->ir_table = NULL;
- }
-}
-
-/*
- * Disable Interrupt Remapping.
- */
-static void iommu_disable_irq_remapping(struct intel_iommu *iommu)
-{
- unsigned long flags;
- u32 sts;
-
- if (!ecap_ir_support(iommu->ecap))
- return;
-
- /*
- * global invalidation of interrupt entry cache before disabling
- * interrupt-remapping.
- */
- qi_global_iec(iommu);
-
- raw_spin_lock_irqsave(&iommu->register_lock, flags);
-
- sts = readl(iommu->reg + DMAR_GSTS_REG);
- if (!(sts & DMA_GSTS_IRES))
- goto end;
-
- iommu->gcmd &= ~DMA_GCMD_IRE;
- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
- readl, !(sts & DMA_GSTS_IRES), sts);
-
-end:
- raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-static int __init dmar_x2apic_optout(void)
-{
-	struct acpi_table_dmar *dmar;
-
-	dmar = (struct acpi_table_dmar *)dmar_tbl;
- if (!dmar || no_x2apic_optout)
- return 0;
- return dmar->flags & DMAR_X2APIC_OPT_OUT;
-}
-
-static void __init intel_cleanup_irq_remapping(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
-
- for_each_iommu(iommu, drhd) {
- if (ecap_ir_support(iommu->ecap)) {
- iommu_disable_irq_remapping(iommu);
- intel_teardown_irq_remapping(iommu);
- }
- }
-
- if (x2apic_supported())
- pr_warn("Failed to enable irq remapping. You are vulnerable to irq-injection attacks.\n");
-}
-
-static int __init intel_prepare_irq_remapping(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- int eim = 0;
-
- if (irq_remap_broken) {
- pr_warn("This system BIOS has enabled interrupt remapping\n"
- "on a chipset that contains an erratum making that\n"
- "feature unstable. To maintain system stability\n"
- "interrupt remapping is being disabled. Please\n"
- "contact your BIOS vendor for an update\n");
- add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
- return -ENODEV;
- }
-
- if (dmar_table_init() < 0)
- return -ENODEV;
-
- if (!dmar_ir_support())
- return -ENODEV;
-
- if (parse_ioapics_under_ir()) {
- pr_info("Not enabling interrupt remapping\n");
- goto error;
- }
-
- /* First make sure all IOMMUs support IRQ remapping */
- for_each_iommu(iommu, drhd)
- if (!ecap_ir_support(iommu->ecap))
- goto error;
-
- /* Detect remapping mode: lapic or x2apic */
- if (x2apic_supported()) {
- eim = !dmar_x2apic_optout();
- if (!eim) {
-			pr_info("x2apic is disabled because BIOS sets x2apic opt out bit.\n");
- pr_info("Use 'intremap=no_x2apic_optout' to override the BIOS setting.\n");
- }
- }
-
- for_each_iommu(iommu, drhd) {
- if (eim && !ecap_eim_support(iommu->ecap)) {
- pr_info("%s does not support EIM\n", iommu->name);
- eim = 0;
- }
- }
-
- eim_mode = eim;
- if (eim)
- pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n");
-
- /* Do the initializations early */
- for_each_iommu(iommu, drhd) {
- if (intel_setup_irq_remapping(iommu)) {
- pr_err("Failed to setup irq remapping for %s\n",
- iommu->name);
- goto error;
- }
- }
-
- return 0;
-
-error:
- intel_cleanup_irq_remapping();
- return -ENODEV;
-}
-
-/*
- * Set Posted-Interrupts capability.
- */
-static inline void set_irq_posting_cap(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
-
- if (!disable_irq_post) {
-		/*
-		 * If the IRTE is in posted format, the 'pda' field goes across
-		 * the 64-bit boundary, so we need to use cmpxchg16b to update
-		 * it atomically. We only expose posted-interrupt support when
-		 * X86_FEATURE_CX16 is available. In practice, hardware
-		 * platforms supporting PI also support CX16; this has been
-		 * confirmed with Intel hardware engineers.
-		 */
- if (boot_cpu_has(X86_FEATURE_CX16))
- intel_irq_remap_ops.capability |= 1 << IRQ_POSTING_CAP;
-
- for_each_iommu(iommu, drhd)
- if (!cap_pi_support(iommu->cap)) {
- intel_irq_remap_ops.capability &=
- ~(1 << IRQ_POSTING_CAP);
- break;
- }
- }
-}
-
-static int __init intel_enable_irq_remapping(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- bool setup = false;
-
-	/*
-	 * Set up interrupt-remapping for all the DRHDs now.
-	 */
- for_each_iommu(iommu, drhd) {
- if (!ir_pre_enabled(iommu))
- iommu_enable_irq_remapping(iommu);
- setup = true;
- }
-
- if (!setup)
- goto error;
-
- irq_remapping_enabled = 1;
-
- set_irq_posting_cap();
-
- pr_info("Enabled IRQ remapping in %s mode\n", eim_mode ? "x2apic" : "xapic");
-
- return eim_mode ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;
-
-error:
- intel_cleanup_irq_remapping();
- return -1;
-}
-
-static int ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope,
- struct intel_iommu *iommu,
- struct acpi_dmar_hardware_unit *drhd)
-{
- struct acpi_dmar_pci_path *path;
- u8 bus;
- int count, free = -1;
-
- bus = scope->bus;
- path = (struct acpi_dmar_pci_path *)(scope + 1);
- count = (scope->length - sizeof(struct acpi_dmar_device_scope))
- / sizeof(struct acpi_dmar_pci_path);
-
- while (--count > 0) {
-		/*
-		 * Access PCI config space directly because the PCI
-		 * subsystem isn't initialized yet.
-		 */
- bus = read_pci_config_byte(bus, path->device, path->function,
- PCI_SECONDARY_BUS);
- path++;
- }
-
- for (count = 0; count < MAX_HPET_TBS; count++) {
- if (ir_hpet[count].iommu == iommu &&
- ir_hpet[count].id == scope->enumeration_id)
- return 0;
- else if (ir_hpet[count].iommu == NULL && free == -1)
- free = count;
- }
- if (free == -1) {
- pr_warn("Exceeded Max HPET blocks\n");
- return -ENOSPC;
- }
-
- ir_hpet[free].iommu = iommu;
- ir_hpet[free].id = scope->enumeration_id;
- ir_hpet[free].bus = bus;
- ir_hpet[free].devfn = PCI_DEVFN(path->device, path->function);
- pr_info("HPET id %d under DRHD base 0x%Lx\n",
- scope->enumeration_id, drhd->address);
-
- return 0;
-}
-
-static int ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
- struct intel_iommu *iommu,
- struct acpi_dmar_hardware_unit *drhd)
-{
- struct acpi_dmar_pci_path *path;
- u8 bus;
- int count, free = -1;
-
- bus = scope->bus;
- path = (struct acpi_dmar_pci_path *)(scope + 1);
- count = (scope->length - sizeof(struct acpi_dmar_device_scope))
- / sizeof(struct acpi_dmar_pci_path);
-
- while (--count > 0) {
-		/*
-		 * Access PCI config space directly because the PCI
-		 * subsystem isn't initialized yet.
-		 */
- bus = read_pci_config_byte(bus, path->device, path->function,
- PCI_SECONDARY_BUS);
- path++;
- }
-
- for (count = 0; count < MAX_IO_APICS; count++) {
- if (ir_ioapic[count].iommu == iommu &&
- ir_ioapic[count].id == scope->enumeration_id)
- return 0;
- else if (ir_ioapic[count].iommu == NULL && free == -1)
- free = count;
- }
- if (free == -1) {
- pr_warn("Exceeded Max IO APICS\n");
- return -ENOSPC;
- }
-
- ir_ioapic[free].bus = bus;
- ir_ioapic[free].devfn = PCI_DEVFN(path->device, path->function);
- ir_ioapic[free].iommu = iommu;
- ir_ioapic[free].id = scope->enumeration_id;
- pr_info("IOAPIC id %d under DRHD base 0x%Lx IOMMU %d\n",
- scope->enumeration_id, drhd->address, iommu->seq_id);
-
- return 0;
-}
-
-static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
- struct intel_iommu *iommu)
-{
- int ret = 0;
- struct acpi_dmar_hardware_unit *drhd;
- struct acpi_dmar_device_scope *scope;
- void *start, *end;
-
- drhd = (struct acpi_dmar_hardware_unit *)header;
- start = (void *)(drhd + 1);
- end = ((void *)drhd) + header->length;
-
- while (start < end && ret == 0) {
- scope = start;
- if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC)
- ret = ir_parse_one_ioapic_scope(scope, iommu, drhd);
- else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET)
- ret = ir_parse_one_hpet_scope(scope, iommu, drhd);
- start += scope->length;
- }
-
- return ret;
-}
-
-static void ir_remove_ioapic_hpet_scope(struct intel_iommu *iommu)
-{
- int i;
-
- for (i = 0; i < MAX_HPET_TBS; i++)
- if (ir_hpet[i].iommu == iommu)
- ir_hpet[i].iommu = NULL;
-
- for (i = 0; i < MAX_IO_APICS; i++)
- if (ir_ioapic[i].iommu == iommu)
- ir_ioapic[i].iommu = NULL;
-}
-
-/*
- * Find the association between IOAPICs and their interrupt-remapping
- * hardware units.
- */
-static int __init parse_ioapics_under_ir(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- bool ir_supported = false;
- int ioapic_idx;
-
- for_each_iommu(iommu, drhd) {
- int ret;
-
- if (!ecap_ir_support(iommu->ecap))
- continue;
-
- ret = ir_parse_ioapic_hpet_scope(drhd->hdr, iommu);
- if (ret)
- return ret;
-
- ir_supported = true;
- }
-
- if (!ir_supported)
- return -ENODEV;
-
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
- int ioapic_id = mpc_ioapic_id(ioapic_idx);
- if (!map_ioapic_to_ir(ioapic_id)) {
- pr_err(FW_BUG "ioapic %d has no mapping iommu, "
- "interrupt remapping will be disabled\n",
- ioapic_id);
- return -1;
- }
- }
-
- return 0;
-}
-
-static int __init ir_dev_scope_init(void)
-{
- int ret;
-
- if (!irq_remapping_enabled)
- return 0;
-
- down_write(&dmar_global_lock);
- ret = dmar_dev_scope_init();
- up_write(&dmar_global_lock);
-
- return ret;
-}
-rootfs_initcall(ir_dev_scope_init);
-
-static void disable_irq_remapping(void)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu = NULL;
-
-	/*
-	 * Disable interrupt-remapping for all the DRHDs now.
-	 */
- for_each_iommu(iommu, drhd) {
- if (!ecap_ir_support(iommu->ecap))
- continue;
-
- iommu_disable_irq_remapping(iommu);
- }
-
- /*
- * Clear Posted-Interrupts capability.
- */
- if (!disable_irq_post)
- intel_irq_remap_ops.capability &= ~(1 << IRQ_POSTING_CAP);
-}
-
-static int reenable_irq_remapping(int eim)
-{
- struct dmar_drhd_unit *drhd;
- bool setup = false;
- struct intel_iommu *iommu = NULL;
-
- for_each_iommu(iommu, drhd)
- if (iommu->qi)
- dmar_reenable_qi(iommu);
-
-	/*
-	 * Set up interrupt-remapping for all the DRHDs now.
-	 */
- for_each_iommu(iommu, drhd) {
- if (!ecap_ir_support(iommu->ecap))
- continue;
-
-		/* Set up interrupt remapping for this iommu. */
- iommu_set_irq_remapping(iommu, eim);
- iommu_enable_irq_remapping(iommu);
- setup = true;
- }
-
- if (!setup)
- goto error;
-
- set_irq_posting_cap();
-
- return 0;
-
-error:
- /*
- * handle error condition gracefully here!
- */
- return -1;
-}
-
-static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
-{
- memset(irte, 0, sizeof(*irte));
-
- irte->present = 1;
- irte->dst_mode = apic->irq_dest_mode;
-	/*
-	 * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
-	 * actual level or edge trigger will be set up in the IO-APIC
-	 * RTE. This will help simplify level triggered irq migration.
-	 * For more details, see the comments (in io_apic.c) explaining IO-APIC
-	 * irq migration in the presence of interrupt-remapping.
-	 */
- irte->trigger_mode = 0;
- irte->dlvry_mode = apic->irq_delivery_mode;
- irte->vector = vector;
- irte->dest_id = IRTE_DEST(dest);
- irte->redir_hint = 1;
-}
-
-static struct irq_domain *intel_get_ir_irq_domain(struct irq_alloc_info *info)
-{
- struct intel_iommu *iommu = NULL;
-
- if (!info)
- return NULL;
-
- switch (info->type) {
- case X86_IRQ_ALLOC_TYPE_IOAPIC:
- iommu = map_ioapic_to_ir(info->ioapic_id);
- break;
- case X86_IRQ_ALLOC_TYPE_HPET:
- iommu = map_hpet_to_ir(info->hpet_id);
- break;
- case X86_IRQ_ALLOC_TYPE_MSI:
- case X86_IRQ_ALLOC_TYPE_MSIX:
- iommu = map_dev_to_ir(info->msi_dev);
- break;
- default:
- BUG_ON(1);
- break;
- }
-
- return iommu ? iommu->ir_domain : NULL;
-}
-
-static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info)
-{
- struct intel_iommu *iommu;
-
- if (!info)
- return NULL;
-
- switch (info->type) {
- case X86_IRQ_ALLOC_TYPE_MSI:
- case X86_IRQ_ALLOC_TYPE_MSIX:
- iommu = map_dev_to_ir(info->msi_dev);
- if (iommu)
- return iommu->ir_msi_domain;
- break;
- default:
- break;
- }
-
- return NULL;
-}
-
-struct irq_remap_ops intel_irq_remap_ops = {
- .prepare = intel_prepare_irq_remapping,
- .enable = intel_enable_irq_remapping,
- .disable = disable_irq_remapping,
- .reenable = reenable_irq_remapping,
- .enable_faulting = enable_drhd_fault_handling,
- .get_ir_irq_domain = intel_get_ir_irq_domain,
- .get_irq_domain = intel_get_irq_domain,
-};
-
-static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
-{
- struct intel_ir_data *ir_data = irqd->chip_data;
- struct irte *irte = &ir_data->irte_entry;
- struct irq_cfg *cfg = irqd_cfg(irqd);
-
-	/*
-	 * Atomically update the IRTE with the new destination and vector,
-	 * then flush the interrupt entry cache.
-	 */
- irte->vector = cfg->vector;
- irte->dest_id = IRTE_DEST(cfg->dest_apicid);
-
- /* Update the hardware only if the interrupt is in remapped mode. */
- if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
- modify_irte(&ir_data->irq_2_iommu, irte);
-}
-
-/*
- * Migrate the IO-APIC irq in the presence of intr-remapping.
- *
- * For both level and edge triggered interrupts, irq migration is a simple
- * atomic update (of vector and cpu destination) of the IRTE, followed by a
- * flush of the hardware cache.
- *
- * For level triggered interrupts, we eliminate the io-apic RTE modification
- * (with the updated vector information) by using a virtual vector (the
- * io-apic pin number). The real vector used to interrupt the cpu comes from
- * the interrupt-remapping table entry.
- *
- * As the migration is a simple atomic update of the IRTE, the same mechanism
- * is used to migrate MSI irqs in the presence of interrupt-remapping.
- */
-static int
-intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
-{
- struct irq_data *parent = data->parent_data;
- struct irq_cfg *cfg = irqd_cfg(data);
- int ret;
-
- ret = parent->chip->irq_set_affinity(parent, mask, force);
- if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
- return ret;
-
- intel_ir_reconfigure_irte(data, false);
-	/*
-	 * After this point, all the interrupts will start arriving
-	 * at the new destination, so it is time to clean up the previous
-	 * vector allocation.
-	 */
- send_cleanup_vector(cfg);
-
- return IRQ_SET_MASK_OK_DONE;
-}
-
-static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
- struct msi_msg *msg)
-{
- struct intel_ir_data *ir_data = irq_data->chip_data;
-
- *msg = ir_data->msi_entry;
-}
-
-static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
-{
- struct intel_ir_data *ir_data = data->chip_data;
- struct vcpu_data *vcpu_pi_info = info;
-
- /* stop posting interrupts, back to remapping mode */
- if (!vcpu_pi_info) {
- modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
- } else {
- struct irte irte_pi;
-
- /*
- * We are not caching the posted interrupt entry. We
- * copy the data from the remapped entry and modify
- * the fields which are relevant for posted mode. The
- * cached remapped entry is used for switching back to
- * remapped mode.
- */
- memset(&irte_pi, 0, sizeof(irte_pi));
- dmar_copy_shared_irte(&irte_pi, &ir_data->irte_entry);
-
- /* Update the posted mode fields */
- irte_pi.p_pst = 1;
- irte_pi.p_urgent = 0;
- irte_pi.p_vector = vcpu_pi_info->vector;
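-		/*
-		 * The 64-bit posted-interrupt descriptor address is split
-		 * across the pda_l and pda_h fields of the IRTE.
-		 */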
- irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >>
- (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
- irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
- ~(-1UL << PDA_HIGH_BIT);
-
- modify_irte(&ir_data->irq_2_iommu, &irte_pi);
- }
-
- return 0;
-}
-
-static struct irq_chip intel_ir_chip = {
- .name = "INTEL-IR",
- .irq_ack = apic_ack_irq,
- .irq_set_affinity = intel_ir_set_affinity,
- .irq_compose_msi_msg = intel_ir_compose_msi_msg,
- .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
-};
-
-static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
- struct irq_cfg *irq_cfg,
- struct irq_alloc_info *info,
- int index, int sub_handle)
-{
- struct IR_IO_APIC_route_entry *entry;
- struct irte *irte = &data->irte_entry;
- struct msi_msg *msg = &data->msi_entry;
-
- prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid);
- switch (info->type) {
- case X86_IRQ_ALLOC_TYPE_IOAPIC:
- /* Set source-id of interrupt request */
- set_ioapic_sid(irte, info->ioapic_id);
- apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n",
- info->ioapic_id, irte->present, irte->fpd,
- irte->dst_mode, irte->redir_hint,
- irte->trigger_mode, irte->dlvry_mode,
- irte->avail, irte->vector, irte->dest_id,
- irte->sid, irte->sq, irte->svt);
-
- entry = (struct IR_IO_APIC_route_entry *)info->ioapic_entry;
- info->ioapic_entry = NULL;
- memset(entry, 0, sizeof(*entry));
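-		/*
-		 * Program the RTE in remappable format: the IRTE index is
-		 * split between the index (low 15 bits) and index2 (bit 15)
-		 * fields, and format = 1 selects remappable delivery.
-		 */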
- entry->index2 = (index >> 15) & 0x1;
- entry->zero = 0;
- entry->format = 1;
- entry->index = (index & 0x7fff);
-		/*
-		 * The IO-APIC RTE will be configured with the virtual vector.
-		 * The irq handler will do the explicit EOI to the io-apic.
-		 */
- entry->vector = info->ioapic_pin;
- entry->mask = 0; /* enable IRQ */
- entry->trigger = info->ioapic_trigger;
- entry->polarity = info->ioapic_polarity;
- if (info->ioapic_trigger)
- entry->mask = 1; /* Mask level triggered irqs. */
- break;
-
- case X86_IRQ_ALLOC_TYPE_HPET:
- case X86_IRQ_ALLOC_TYPE_MSI:
- case X86_IRQ_ALLOC_TYPE_MSIX:
- if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
- set_hpet_sid(irte, info->hpet_id);
- else
- set_msi_sid(irte, info->msi_dev);
-
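-		/*
-		 * Compose the MSI message in remappable format: the IRTE
-		 * index is carried in the address (split across the INDEX1
-		 * and INDEX2 fields, with SHV set) and the sub-handle is
-		 * carried in the data word.
-		 */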
- msg->address_hi = MSI_ADDR_BASE_HI;
- msg->data = sub_handle;
- msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
- MSI_ADDR_IR_SHV |
- MSI_ADDR_IR_INDEX1(index) |
- MSI_ADDR_IR_INDEX2(index);
- break;
-
- default:
- BUG_ON(1);
- break;
- }
-}
-
-static void intel_free_irq_resources(struct irq_domain *domain,
- unsigned int virq, unsigned int nr_irqs)
-{
- struct irq_data *irq_data;
- struct intel_ir_data *data;
- struct irq_2_iommu *irq_iommu;
- unsigned long flags;
-	int i;
-
- for (i = 0; i < nr_irqs; i++) {
- irq_data = irq_domain_get_irq_data(domain, virq + i);
- if (irq_data && irq_data->chip_data) {
- data = irq_data->chip_data;
- irq_iommu = &data->irq_2_iommu;
- raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
- clear_entries(irq_iommu);
- raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
- irq_domain_reset_irq_data(irq_data);
- kfree(data);
- }
- }
-}
-
-static int intel_irq_remapping_alloc(struct irq_domain *domain,
- unsigned int virq, unsigned int nr_irqs,
- void *arg)
-{
- struct intel_iommu *iommu = domain->host_data;
- struct irq_alloc_info *info = arg;
- struct intel_ir_data *data, *ird;
- struct irq_data *irq_data;
- struct irq_cfg *irq_cfg;
- int i, ret, index;
-
- if (!info || !iommu)
- return -EINVAL;
- if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI &&
- info->type != X86_IRQ_ALLOC_TYPE_MSIX)
- return -EINVAL;
-
-	/*
-	 * With IRQ remapping enabled, we don't need contiguous CPU vectors
-	 * to support multiple MSI interrupts.
-	 */
- if (info->type == X86_IRQ_ALLOC_TYPE_MSI)
- info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
-
- ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
- if (ret < 0)
- return ret;
-
- ret = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- goto out_free_parent;
-
- down_read(&dmar_global_lock);
- index = alloc_irte(iommu, &data->irq_2_iommu, nr_irqs);
- up_read(&dmar_global_lock);
- if (index < 0) {
- pr_warn("Failed to allocate IRTE\n");
- kfree(data);
- goto out_free_parent;
- }
-
- for (i = 0; i < nr_irqs; i++) {
- irq_data = irq_domain_get_irq_data(domain, virq + i);
- irq_cfg = irqd_cfg(irq_data);
- if (!irq_data || !irq_cfg) {
- ret = -EINVAL;
- goto out_free_data;
- }
-
- if (i > 0) {
- ird = kzalloc(sizeof(*ird), GFP_KERNEL);
- if (!ird)
- goto out_free_data;
- /* Initialize the common data */
- ird->irq_2_iommu = data->irq_2_iommu;
- ird->irq_2_iommu.sub_handle = i;
- } else {
- ird = data;
- }
-
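-		/*
-		 * hwirq encodes the IRTE index in the upper bits and the
-		 * sub-handle in the low 16 bits.
-		 */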
- irq_data->hwirq = (index << 16) + i;
- irq_data->chip_data = ird;
- irq_data->chip = &intel_ir_chip;
- intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i);
- irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
- }
- return 0;
-
-out_free_data:
- intel_free_irq_resources(domain, virq, i);
-out_free_parent:
- irq_domain_free_irqs_common(domain, virq, nr_irqs);
- return ret;
-}
-
-static void intel_irq_remapping_free(struct irq_domain *domain,
- unsigned int virq, unsigned int nr_irqs)
-{
- intel_free_irq_resources(domain, virq, nr_irqs);
- irq_domain_free_irqs_common(domain, virq, nr_irqs);
-}
-
-static int intel_irq_remapping_activate(struct irq_domain *domain,
- struct irq_data *irq_data, bool reserve)
-{
- intel_ir_reconfigure_irte(irq_data, true);
- return 0;
-}
-
-static void intel_irq_remapping_deactivate(struct irq_domain *domain,
- struct irq_data *irq_data)
-{
- struct intel_ir_data *data = irq_data->chip_data;
- struct irte entry;
-
- memset(&entry, 0, sizeof(entry));
- modify_irte(&data->irq_2_iommu, &entry);
-}
-
-static const struct irq_domain_ops intel_ir_domain_ops = {
- .alloc = intel_irq_remapping_alloc,
- .free = intel_irq_remapping_free,
- .activate = intel_irq_remapping_activate,
- .deactivate = intel_irq_remapping_deactivate,
-};
-
-/*
- * Support of Interrupt Remapping Unit Hotplug
- */
-static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu)
-{
- int ret;
- int eim = x2apic_enabled();
-
- if (eim && !ecap_eim_support(iommu->ecap)) {
- pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n",
- iommu->reg_phys, iommu->ecap);
- return -ENODEV;
- }
-
- if (ir_parse_ioapic_hpet_scope(dmaru->hdr, iommu)) {
- pr_warn("DRHD %Lx: failed to parse managed IOAPIC/HPET\n",
- iommu->reg_phys);
- return -ENODEV;
- }
-
- /* TODO: check all IOAPICs are covered by IOMMU */
-
- /* Setup Interrupt-remapping now. */
- ret = intel_setup_irq_remapping(iommu);
- if (ret) {
- pr_err("Failed to setup irq remapping for %s\n",
- iommu->name);
- intel_teardown_irq_remapping(iommu);
- ir_remove_ioapic_hpet_scope(iommu);
- } else {
- iommu_enable_irq_remapping(iommu);
- }
-
- return ret;
-}
-
-int dmar_ir_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
-{
- int ret = 0;
- struct intel_iommu *iommu = dmaru->iommu;
-
- if (!irq_remapping_enabled)
- return 0;
- if (iommu == NULL)
- return -EINVAL;
- if (!ecap_ir_support(iommu->ecap))
- return 0;
- if (irq_remapping_cap(IRQ_POSTING_CAP) &&
- !cap_pi_support(iommu->cap))
- return -EBUSY;
-
- if (insert) {
- if (!iommu->ir_table)
- ret = dmar_ir_add(dmaru, iommu);
- } else {
- if (iommu->ir_table) {
- if (!bitmap_empty(iommu->ir_table->bitmap,
- INTR_REMAP_TABLE_ENTRIES)) {
- ret = -EBUSY;
- } else {
- iommu_disable_irq_remapping(iommu);
- intel_teardown_irq_remapping(iommu);
- ir_remove_ioapic_hpet_scope(iommu);
- }
- }
- }
-
- return ret;
-}