KVM: x86/mmu: Extend Eager Page Splitting to nested MMUs

author David Matlack <dmatlack@google.com>

Wed, 22 Jun 2022 19:27:09 +0000 (15:27 -0400)

committer Paolo Bonzini <pbonzini@redhat.com>

Fri, 24 Jun 2022 08:52:00 +0000 (04:52 -0400)
author David Matlack <dmatlack@google.com>
Wed, 22 Jun 2022 19:27:09 +0000 (15:27 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>
Fri, 24 Jun 2022 08:52:00 +0000 (04:52 -0400)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt

index 97c16aa2f53fda926fcdc6f8ecbf3a45fe036594..329f0f274e2bb8387e91d879326f8c32a9892b0d 100644 (file)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2418,8 +2418,7 @@
                         the KVM_CLEAR_DIRTY ioctl, and only for the pages being
                         cleared.
  
-                       Eager page splitting currently only supports splitting
-                       huge pages mapped by the TDP MMU.
+                       Eager page splitting is only supported when kvm.tdp_mmu=Y.
  
                         Default is Y (on).
  
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index 64efe8c90c31fe1ce83dd4354062caa0e44fe614..665667d61cafa29ed3539d31ff0e9d97806458dd 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1338,6 +1338,28 @@ struct kvm_arch {
         u32 max_vcpu_ids;
  
         bool disable_nx_huge_pages;
+
+       /*
+        * Memory caches used to allocate shadow pages when performing eager
+        * page splitting. No need for a shadowed_info_cache since eager page
+        * splitting only allocates direct shadow pages.
+        *
+        * Protected by kvm->slots_lock.
+        */
+       struct kvm_mmu_memory_cache split_shadow_page_cache;
+       struct kvm_mmu_memory_cache split_page_header_cache;
+
+       /*
+        * Memory cache used to allocate pte_list_desc structs while splitting
+        * huge pages. In the worst case, to split one huge page, 512
+        * pte_list_desc structs are needed to add each lower level leaf sptep
+        * to the rmap plus 1 to extend the parent_ptes rmap of the lower level
+        * page table.
+        *
+        * Protected by kvm->slots_lock.
+        */
+#define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
+       struct kvm_mmu_memory_cache split_desc_cache;
  };
  
  struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c

index 192cb7dc4471470ee183d2e033588a7ae91aefae..9bfe339bf67f695ad132a226ef4689fba0e75989 100644 (file)
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5942,9 +5942,25 @@ int kvm_mmu_init_vm(struct kvm *kvm)
         node->track_write = kvm_mmu_pte_write;
         node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
         kvm_page_track_register_notifier(kvm, node);
+
+       kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
+       kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
+
+       kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
+
+       kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
+       kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+
         return 0;
  }
  
+static void mmu_free_vm_memory_caches(struct kvm *kvm)
+{
+       kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
+       kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
+       kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
+}
+
  void kvm_mmu_uninit_vm(struct kvm *kvm)
  {
         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
@@ -5952,6 +5968,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
         kvm_page_track_unregister_notifier(kvm, node);
  
         kvm_mmu_uninit_tdp_mmu(kvm);
+
+       mmu_free_vm_memory_caches(kvm);
  }
  
  static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
@@ -6073,15 +6091,235 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
  }
  
+static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
+{
+       return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static bool need_topup_split_caches_or_resched(struct kvm *kvm)
+{
+       if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+               return true;
+
+       /*
+        * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
+        * to split a single huge page. Calculating how many are actually needed
+        * is possible but not worth the complexity.
+        */
+       return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
+              need_topup(&kvm->arch.split_page_header_cache, 1) ||
+              need_topup(&kvm->arch.split_shadow_page_cache, 1);
+}
+
+static int topup_split_caches(struct kvm *kvm)
+{
+       int r;
+
+       lockdep_assert_held(&kvm->slots_lock);
+
+       /*
+        * Setting capacity == min would cause KVM to drop mmu_lock even if
+        * just one object was consumed from the cache, so make capacity
+        * larger than min.
+        */
+       r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache,
+                                        2 * SPLIT_DESC_CACHE_MIN_NR_OBJECTS,
+                                        SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
+       if (r)
+               return r;
+
+       r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
+       if (r)
+               return r;
+
+       return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
+}
+
+static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
+{
+       struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
+       struct shadow_page_caches caches = {};
+       union kvm_mmu_page_role role;
+       unsigned int access;
+       gfn_t gfn;
+
+       gfn = kvm_mmu_page_get_gfn(huge_sp, huge_sptep - huge_sp->spt);
+       access = kvm_mmu_page_get_access(huge_sp, huge_sptep - huge_sp->spt);
+
+       /*
+        * Note, huge page splitting always uses direct shadow pages, regardless
+        * of whether the huge page itself is mapped by a direct or indirect
+        * shadow page, since the huge page region itself is being directly
+        * mapped with smaller pages.
+        */
+       role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
+
+       /* Direct SPs do not require a shadowed_info_cache. */
+       caches.page_header_cache = &kvm->arch.split_page_header_cache;
+       caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
+
+       /* Safe to pass NULL for vCPU since requesting a direct SP. */
+       return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
+}
+
+static void shadow_mmu_split_huge_page(struct kvm *kvm,
+                                      const struct kvm_memory_slot *slot,
+                                      u64 *huge_sptep)
+
+{
+       struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
+       u64 huge_spte = READ_ONCE(*huge_sptep);
+       struct kvm_mmu_page *sp;
+       u64 *sptep, spte;
+       gfn_t gfn;
+       int index;
+
+       sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
+
+       for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
+               sptep = &sp->spt[index];
+               gfn = kvm_mmu_page_get_gfn(sp, index);
+
+               /*
+                * The SP may already have populated SPTEs, e.g. if this huge
+                * page is aliased by multiple sptes with the same access
+                * permissions. These entries are guaranteed to map the same
+                * gfn-to-pfn translation since the SP is direct, so no need to
+                * modify them.
+                *
+                * If a given SPTE points to a lower level page table, installing
+                * such SPTEs would effectively unmap a portion of the huge page.
+                * This is not an issue because __link_shadow_page() flushes the TLB
+                * when the passed sp replaces a large SPTE.
+                */
+               if (is_shadow_present_pte(*sptep))
+                       continue;
+
+               spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
+               mmu_spte_set(sptep, spte);
+               __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
+       }
+
+       __link_shadow_page(kvm, cache, huge_sptep, sp);
+}
+
+static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
+                                         const struct kvm_memory_slot *slot,
+                                         u64 *huge_sptep)
+{
+       struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
+       int level, r = 0;
+       gfn_t gfn;
+       u64 spte;
+
+       /* Grab information for the tracepoint before dropping the MMU lock. */
+       gfn = kvm_mmu_page_get_gfn(huge_sp, huge_sptep - huge_sp->spt);
+       level = huge_sp->role.level;
+       spte = *huge_sptep;
+
+       if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
+               r = -ENOSPC;
+               goto out;
+       }
+
+       if (need_topup_split_caches_or_resched(kvm)) {
+               write_unlock(&kvm->mmu_lock);
+               cond_resched();
+               /*
+                * If the topup succeeds, return -EAGAIN to indicate that the
+                * rmap iterator should be restarted because the MMU lock was
+                * dropped.
+                */
+               r = topup_split_caches(kvm) ?: -EAGAIN;
+               write_lock(&kvm->mmu_lock);
+               goto out;
+       }
+
+       shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
+
+out:
+       trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
+       return r;
+}
+
+static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
+                                           struct kvm_rmap_head *rmap_head,
+                                           const struct kvm_memory_slot *slot)
+{
+       struct rmap_iterator iter;
+       struct kvm_mmu_page *sp;
+       u64 *huge_sptep;
+       int r;
+
+restart:
+       for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
+               sp = sptep_to_sp(huge_sptep);
+
+               /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
+               if (WARN_ON_ONCE(!sp->role.guest_mode))
+                       continue;
+
+               /* The rmaps should never contain non-leaf SPTEs. */
+               if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
+                       continue;
+
+               /* SPs with level >PG_LEVEL_4K should never be unsync. */
+               if (WARN_ON_ONCE(sp->unsync))
+                       continue;
+
+               /* Don't bother splitting huge pages on invalid SPs. */
+               if (sp->role.invalid)
+                       continue;
+
+               r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
+
+               /*
+                * The split succeeded or needs to be retried because the MMU
+                * lock was dropped. Either way, restart the iterator to get it
+                * back into a consistent state.
+                */
+               if (!r || r == -EAGAIN)
+                       goto restart;
+
+               /* The split failed and shouldn't be retried (e.g. -ENOMEM). */
+               break;
+       }
+
+       return false;
+}
+
+static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
+                                               const struct kvm_memory_slot *slot,
+                                               gfn_t start, gfn_t end,
+                                               int target_level)
+{
+       int level;
+
+       /*
+        * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
+        * down to the target level. This ensures pages are recursively split
+        * all the way to the target level. There's no need to split pages
+        * already at the target level.
+        */
+       for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) {
+               slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages,
+                                       level, level, start, end - 1, true, false);
+       }
+}
+
  /* Must be called with the mmu_lock held in write-mode. */
  void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
                                    const struct kvm_memory_slot *memslot,
                                    u64 start, u64 end,
                                    int target_level)
  {
-       if (is_tdp_mmu_enabled(kvm))
-               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
-                                                target_level, false);
+       if (!is_tdp_mmu_enabled(kvm))
+               return;
+
+       if (kvm_memslots_have_rmaps(kvm))
+               kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+
+       kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
  
         /*
          * A TLB flush is unnecessary at this point for the same resons as in
@@ -6096,12 +6334,19 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
         u64 start = memslot->base_gfn;
         u64 end = start + memslot->npages;
  
-       if (is_tdp_mmu_enabled(kvm)) {
-               read_lock(&kvm->mmu_lock);
-               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
-               read_unlock(&kvm->mmu_lock);
+       if (!is_tdp_mmu_enabled(kvm))
+               return;
+
+       if (kvm_memslots_have_rmaps(kvm)) {
+               write_lock(&kvm->mmu_lock);
+               kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+               write_unlock(&kvm->mmu_lock);
         }
  
+       read_lock(&kvm->mmu_lock);
+       kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
+       read_unlock(&kvm->mmu_lock);
+
         /*
          * No TLB flush is necessary here. KVM will flush TLBs after
          * write-protecting and/or clearing dirty on the newly split SPTEs to
author	David Matlack <dmatlack@google.com>
	Wed, 22 Jun 2022 19:27:09 +0000 (15:27 -0400)
committer	Paolo Bonzini <pbonzini@redhat.com>
	Fri, 24 Jun 2022 08:52:00 +0000 (04:52 -0400)
Documentation/admin-guide/kernel-parameters.txt		patch \| blob \| history
arch/x86/include/asm/kvm_host.h		patch \| blob \| history
arch/x86/kvm/mmu/mmu.c		patch \| blob \| history