KVM: x86/MMU: Allow NX huge pages to be disabled on a per-vm basis

author Ben Gardon <bgardon@google.com>

Mon, 13 Jun 2022 21:25:21 +0000 (21:25 +0000)

committer Paolo Bonzini <pbonzini@redhat.com>

Fri, 24 Jun 2022 08:51:49 +0000 (04:51 -0400)
author Ben Gardon <bgardon@google.com>
Mon, 13 Jun 2022 21:25:21 +0000 (21:25 +0000)
committer Paolo Bonzini <pbonzini@redhat.com>
Fri, 24 Jun 2022 08:51:49 +0000 (04:51 -0400)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst

index 320cb04f7bd9fa27eefc7cec8c5d0cf3e2d78c27..bafaeedd455c38da79b116c3199d2eb65c97a1bd 100644 (file)
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8206,6 +8206,22 @@ PV guests. The `KVM_PV_DUMP` command is available for the
  dump related UV data. Also the vcpu ioctl `KVM_S390_PV_CPU_COMMAND` is
  available and supports the `KVM_PV_DUMP_CPU` subcommand.
  
+8.38 KVM_CAP_VM_DISABLE_NX_HUGE_PAGES
+-------------------------------------
+
+:Capability: KVM_CAP_VM_DISABLE_NX_HUGE_PAGES
+:Architectures: x86
+:Type: vm
+:Parameters: args[0] must be 0.
+:Returns: 0 on success, -EPERM if the userspace process does not
+          have CAP_SYS_BOOT, -EINVAL if args[0] is not 0 or any vCPUs have been
+          created.
+
+This capability disables the NX huge pages mitigation for iTLB MULTIHIT.
+
+The capability has no effect if the nx_huge_pages module parameter is not set.
+
+This capability may only be set before any vCPUs are created.
  
  9. Known KVM API problems
  =========================
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index e37727a74d0a4e0e3733e0552b98201d65c50125..7e4c31b57a75b8ee058d288c4d34cd0961f3711f 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1336,6 +1336,8 @@ struct kvm_arch {
          * the global KVM_MAX_VCPU_IDS may lead to significant memory waste.
          */
         u32 max_vcpu_ids;
+
+       bool disable_nx_huge_pages;
  };
  
  struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h

index 5e1e3c8f8aaa9478bc0f50a033037615e802304a..bb9d12ac0db3f5899c48e3d039afda1766d74e4d 100644 (file)
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -155,9 +155,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
  unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
  
  extern int nx_huge_pages;
-static inline bool is_nx_huge_page_enabled(void)
+static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
  {
-       return READ_ONCE(nx_huge_pages);
+       return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
  }
  
  struct kvm_page_fault {
@@ -256,7 +256,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                 .user = err & PFERR_USER_MASK,
                 .prefetch = prefetch,
                 .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
-               .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
+               .nx_huge_page_workaround_enabled =
+                       is_nx_huge_page_enabled(vcpu->kvm),
  
                 .max_level = KVM_MAX_HUGEPAGE_LEVEL,
                 .req_level = PG_LEVEL_4K,
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c

index 242e4828d7dfad61aea7a280985775e29a177d6a..db294c1beea2fa2651f543430992b7f9d072f927 100644 (file)
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -147,7 +147,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                 spte |= spte_shadow_accessed_mask(spte);
  
         if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
-           is_nx_huge_page_enabled()) {
+           is_nx_huge_page_enabled(vcpu->kvm)) {
                 pte_access &= ~ACC_EXEC_MASK;
         }
  
@@ -246,7 +246,8 @@ static u64 make_spte_executable(u64 spte)
   * This is used during huge page splitting to build the SPTEs that make up the
   * new page table.
   */
-u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
+u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level,
+                             int index)
  {
         u64 child_spte;
         int child_level;
@@ -274,7 +275,7 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
                  * When splitting to a 4K page, mark the page executable as the
                  * NX hugepage mitigation no longer applies.
                  */
-               if (is_nx_huge_page_enabled())
+               if (is_nx_huge_page_enabled(kvm))
                         child_spte = make_spte_executable(child_spte);
         }
  
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h

index 121c5eaaec77d8fd54f1f3deb5a82fc4d20b27a0..256f90587e8dfe9f9083c01fcacabf582cd60e53 100644 (file)
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -421,7 +421,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
                u64 old_spte, bool prefetch, bool can_unsync,
                bool host_writable, u64 *new_spte);
-u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index);
+u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level,
+                             int index);
  u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
  u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
  u64 mark_spte_for_access_track(u64 spte);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c

index 1ea40809ef1f3797a29c297436a6519bf3b4406f..522e2532343b9f835aba708931c937f0502eff6b 100644 (file)
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1478,7 +1478,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
          * not been linked in yet and thus is not reachable from any other CPU.
          */
         for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
-               sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
+               sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, level, i);
  
         /*
          * Replace the huge spte with a pointer to the populated lower level
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index c1b3b2ea8ee0be2040f5998395e31cb19b14d23c..7ce0c6fe166dd68467d89c7f4ed6e65fbbb4814e 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4324,6 +4324,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
         case KVM_CAP_SYS_ATTRIBUTES:
         case KVM_CAP_VAPIC:
         case KVM_CAP_ENABLE_CAP:
+       case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
                 r = 1;
                 break;
         case KVM_CAP_EXIT_HYPERCALL:
@@ -6184,6 +6185,35 @@ split_irqchip_unlock:
                 }
                 mutex_unlock(&kvm->lock);
                 break;
+       case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
+               r = -EINVAL;
+
+               /*
+                * Since the risk of disabling NX hugepages is a guest crashing
+                * the system, ensure the userspace process has permission to
+                * reboot the system.
+                *
+                * Note that unlike the reboot() syscall, the process must have
+                * this capability in the root namespace because exposing
+                * /dev/kvm into a container does not limit the scope of the
+                * iTLB multihit bug to that container. In other words,
+                * this must use capable(), not ns_capable().
+                */
+               if (!capable(CAP_SYS_BOOT)) {
+                       r = -EPERM;
+                       break;
+               }
+
+               if (cap->args[0])
+                       break;
+
+               mutex_lock(&kvm->lock);
+               if (!kvm->created_vcpus) {
+                       kvm->arch.disable_nx_huge_pages = true;
+                       r = 0;
+               }
+               mutex_unlock(&kvm->lock);
+               break;
         default:
                 r = -EINVAL;
                 break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h

index 7569b4ec199c08b1dd93ae666dd1ccd3b05f94a4..a36e78710382bc53e47e2c5937bb00503089d7c7 100644 (file)
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1166,6 +1166,7 @@ struct kvm_ppc_resize_hpt {
  #define KVM_CAP_S390_PROTECTED_DUMP 217
  #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218
  #define KVM_CAP_X86_NOTIFY_VMEXIT 219
+#define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220
  
  #ifdef KVM_CAP_IRQ_ROUTING
author	Ben Gardon <bgardon@google.com>
	Mon, 13 Jun 2022 21:25:21 +0000 (21:25 +0000)
committer	Paolo Bonzini <pbonzini@redhat.com>
	Fri, 24 Jun 2022 08:51:49 +0000 (04:51 -0400)
Documentation/virt/kvm/api.rst		patch \| blob \| history
arch/x86/include/asm/kvm_host.h		patch \| blob \| history
arch/x86/kvm/mmu/mmu_internal.h		patch \| blob \| history
arch/x86/kvm/mmu/spte.c		patch \| blob \| history
arch/x86/kvm/mmu/spte.h		patch \| blob \| history
arch/x86/kvm/mmu/tdp_mmu.c		patch \| blob \| history
arch/x86/kvm/x86.c		patch \| blob \| history
include/uapi/linux/kvm.h		patch \| blob \| history