mm: make alloc_contig_range handle free hugetlb pages

author Oscar Salvador <osalvador@suse.de>

Wed, 5 May 2021 01:35:26 +0000 (18:35 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 5 May 2021 18:27:22 +0000 (11:27 -0700)
author Oscar Salvador <osalvador@suse.de>
Wed, 5 May 2021 01:35:26 +0000 (18:35 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 5 May 2021 18:27:22 +0000 (11:27 -0700)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h

index 628639422c5dd4d78e7493b21eb3e37608127d73..ec6a10b8860a3eba21b2e0e1b4ce76d97a527f44 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -588,6 +588,7 @@ struct huge_bootmem_page {
         struct hstate *hstate;
  };
  
+int isolate_or_dissolve_huge_page(struct page *page);
  struct page *alloc_huge_page(struct vm_area_struct *vma,
                                 unsigned long addr, int avoid_reserve);
  struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
@@ -870,6 +871,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
  #else  /* CONFIG_HUGETLB_PAGE */
  struct hstate {};
  
+static inline int isolate_or_dissolve_huge_page(struct page *page)
+{
+       return -ENOMEM;
+}
+
  static inline struct page *alloc_huge_page(struct vm_area_struct *vma,
                                            unsigned long addr,
                                            int avoid_reserve)
diff --git a/mm/compaction.c b/mm/compaction.c

index c4d8007221b74e501054619e5eaa3867bf745848..b77e1382307f1da0511dd4ed16b2f981fd179b3d 100644 (file)
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -788,7 +788,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
   * Isolate all pages that can be migrated from the range specified by
   * [low_pfn, end_pfn). The range is expected to be within same pageblock.
   * Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion,
- * or 0.
+ * -ENOMEM in case we could not allocate a page, or 0.
   * cc->migrate_pfn will contain the next pfn to scan.
   *
   * The pages are isolated on cc->migratepages list (not required to be empty),
@@ -906,6 +906,29 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                         valid_page = page;
                 }
  
+               if (PageHuge(page) && cc->alloc_contig) {
+                       ret = isolate_or_dissolve_huge_page(page);
+
+                       /*
+                        * Fail isolation in case isolate_or_dissolve_huge_page()
+                        * reports an error. In case of -ENOMEM, abort right away.
+                        */
+                       if (ret < 0) {
+                                /* Do not report -EBUSY down the chain */
+                               if (ret == -EBUSY)
+                                       ret = 0;
+                               low_pfn += (1UL << compound_order(page)) - 1;
+                               goto isolate_fail;
+                       }
+
+                       /*
+                        * Ok, the hugepage was dissolved. Now these pages are
+                        * Buddy and cannot be re-allocated because they are
+                        * isolated. Fall-through as the check below handles
+                        * Buddy pages.
+                        */
+               }
+
                 /*
                  * Skip if free. We read page order here without zone lock
                  * which is generally unsafe, but the race window is small and
@@ -1065,7 +1088,7 @@ isolate_fail_put:
                 put_page(page);
  
  isolate_fail:
-               if (!skip_on_failure)
+               if (!skip_on_failure && ret != -ENOMEM)
                         continue;
  
                 /*
@@ -1091,6 +1114,9 @@ isolate_fail:
                          */
                         next_skip_pfn += 1UL << cc->order;
                 }
+
+               if (ret == -ENOMEM)
+                       break;
         }
  
         /*
@@ -1143,7 +1169,8 @@ fatal_pending:
   * @start_pfn: The first PFN to start isolating.
   * @end_pfn:   The one-past-last PFN.
   *
- * Returns -EAGAIN when contented, -EINTR in case of a signal pending or 0.
+ * Returns -EAGAIN when contended, -EINTR in case of a signal pending, -ENOMEM
+ * in case we could not allocate a page, or 0.
   */
  int
  isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 63760be2688ed19156c98a3db4dd7375f8a0d5a9..92f3cd08946f1e061274acf28ef8f0a521858333 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2267,6 +2267,122 @@ static void restore_reserve_on_error(struct hstate *h,
         }
  }
  
+/*
+ * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * @h: struct hstate that old_page belongs to
+ * @old_page: Old page to dissolve
+ * Returns 0 on success, -ENOMEM on allocation failure, -EBUSY if page in use.
+ */
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page)
+{
+       gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+       int nid = page_to_nid(old_page);
+       struct page *new_page;
+       int ret = 0;
+
+       /*
+        * Before dissolving the page, we need to allocate a new one for the
+        * pool to remain stable. Using alloc_buddy_huge_page() allows us to
+        * avoid dealing with prep_new_huge_page() and with any of the
+        * counters. This simplifies things and lets us do the whole
+        * replacement under the lock.
+        */
+       new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
+       if (!new_page)
+               return -ENOMEM;
+
+retry:
+       spin_lock_irq(&hugetlb_lock);
+       if (!PageHuge(old_page)) {
+               /*
+                * Freed from under us. Drop new_page too; ret stays 0.
+                */
+               goto free_new;
+       } else if (page_count(old_page)) {
+               /*
+                * Someone has grabbed the page, fail for now.
+                */
+               ret = -EBUSY;
+               goto free_new;
+       } else if (!HPageFreed(old_page)) {
+               /*
+                * Page's refcount is 0 but it has not been enqueued in the
+                * freelist yet. The race window is small, so drop the lock,
+                * give others a chance to run, and check the page again.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               cond_resched();
+               goto retry;
+       } else {
+               /*
+                * Ok, old_page is still a genuine free hugepage. Remove it from
+                * the freelist and decrease the counters. These will be
+                * incremented again when calling __prep_account_new_huge_page()
+                * and enqueue_huge_page() for new_page. The counters will remain
+                * stable since this happens under the lock.
+                */
+               remove_hugetlb_page(h, old_page, false);
+
+               /*
+                * new_page needs to be initialized with the standard hugetlb
+                * state. This is normally done by prep_new_huge_page() but
+                * that takes hugetlb_lock which is already held so we need to
+                * open code it here.
+                * The refcount is dropped because the allocator gives us a
+                * referenced page but the pool requires pages with 0 refcount.
+                */
+               __prep_new_huge_page(new_page);
+               __prep_account_new_huge_page(h, nid);
+               page_ref_dec(new_page);
+               enqueue_huge_page(h, new_page);
+
+               /*
+                * Pages have been replaced, we can safely free the old one.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               update_and_free_page(h, old_page);
+       }
+
+       return ret;
+
+free_new:
+       spin_unlock_irq(&hugetlb_lock);
+       __free_pages(new_page, huge_page_order(h));
+
+       return ret;
+}
+
+int isolate_or_dissolve_huge_page(struct page *page)
+{
+       struct hstate *h;
+       struct page *head;
+
+       /*
+        * The page might have been dissolved from under our feet, so look up
+        * its head and hstate under the lock. If it is no longer a hugetlb
+        * page, return success as if we had dissolved the page ourselves.
+        */
+       spin_lock_irq(&hugetlb_lock);
+       if (PageHuge(page)) {
+               head = compound_head(page);
+               h = page_hstate(head);
+       } else {
+               spin_unlock_irq(&hugetlb_lock);
+               return 0;
+       }
+       spin_unlock_irq(&hugetlb_lock);
+
+       /*
+        * Fence off gigantic pages as there is a cyclic dependency between
+        * alloc_contig_range() and them. Return -ENOMEM as this has the effect
+        * of bailing out right away without further retrying.
+        */
+       if (hstate_is_gigantic(h))
+               return -ENOMEM;
+
+       return alloc_and_dissolve_huge_page(h, head);
+}
+
  struct page *alloc_huge_page(struct vm_area_struct *vma,
                                     unsigned long addr, int avoid_reserve)
  {
author	Oscar Salvador <osalvador@suse.de>
	Wed, 5 May 2021 01:35:26 +0000 (18:35 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 5 May 2021 18:27:22 +0000 (11:27 -0700)
include/linux/hugetlb.h		patch \| blob \| history
mm/compaction.c		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history