mm: make alloc_contig_range work at pageblock granularity

author Zi Yan <ziy@nvidia.com>

Fri, 13 May 2022 03:22:58 +0000 (20:22 -0700)

committer Andrew Morton <akpm@linux-foundation.org>

Fri, 13 May 2022 14:20:13 +0000 (07:20 -0700)
author Zi Yan <ziy@nvidia.com>
Fri, 13 May 2022 03:22:58 +0000 (20:22 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 13 May 2022 14:20:13 +0000 (07:20 -0700)
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h

index e14eddf6741a40f27be59686f329b7877cb6a16a..5456b7be38ae50dd36729a6a82e936ea812ec747 100644 (file)
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -42,7 +42,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
   */
  int
  start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-                        unsigned migratetype, int flags);
+                        int migratetype, int flags, gfp_t gfp_flags);
  
  /*
   * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
@@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
   */
  void
  undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-                       unsigned migratetype);
+                       int migratetype);
  
  /*
   * Test all pages in [start_pfn, end_pfn) are isolated or not.
diff --git a/mm/internal.h b/mm/internal.h

index ddd09245a6db0f02ccd7b6b88f6e00de2a5d93c7..a770029beb080da97e5b4c8643045103f05d5593 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -359,6 +359,9 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
                           phys_addr_t min_addr,
                           int nid, bool exact_nid);
  
+void split_free_page(struct page *free_page,
+                               int order, unsigned long split_pfn_offset);
+
  #if defined CONFIG_COMPACTION || defined CONFIG_CMA
  
  /*
@@ -422,6 +425,9 @@ isolate_freepages_range(struct compact_control *cc,
  int
  isolate_migratepages_range(struct compact_control *cc,
                            unsigned long low_pfn, unsigned long end_pfn);
+
+int __alloc_contig_migrate_range(struct compact_control *cc,
+                                       unsigned long start, unsigned long end);
  #endif
  int find_suitable_fallback(struct free_area *area, unsigned int order,
                         int migratetype, bool only_stealable, bool *can_steal);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c

index e99fd60548f5fc3d78770356ab771aa73f2cb462..945191708ef6ac6501561d10797a968931f149d3 100644 (file)
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1837,7 +1837,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
         /* set above range as isolated */
         ret = start_isolate_page_range(start_pfn, end_pfn,
                                        MIGRATE_MOVABLE,
-                                      MEMORY_OFFLINE | REPORT_FAILURE);
+                                      MEMORY_OFFLINE | REPORT_FAILURE,
+                                      GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
         if (ret) {
                 reason = "failure to isolate range";
                 goto failed_removal_pcplists_disabled;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 0756f046b6449baeb5dfdebe592db3008ec49d83..0c7252ed14a0cbfff6c20fb4725f151a97c0f33b 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1094,6 +1094,43 @@ done_merging:
                 page_reporting_notify_free(order);
  }
  
+/**
+ * split_free_page() -- split a free page at split_pfn_offset
+ * @free_page:         the original free page
+ * @order:             the order of the page
+ * @split_pfn_offset:  split offset within the page, in pages from its first pfn
+ *
+ * Used when the free page crosses two pageblocks with different migratetypes
+ * at split_pfn_offset within the page. Under zone->lock, the page is removed
+ * from its free list and re-freed in pieces, each with the migratetype of the
+ * pageblock holding its first pfn. Otherwise, the function achieves nothing.
+ */
+void split_free_page(struct page *free_page,
+                               int order, unsigned long split_pfn_offset)
+{
+       struct zone *zone = page_zone(free_page);
+       unsigned long free_page_pfn = page_to_pfn(free_page);
+       unsigned long pfn;
+       unsigned long flags;
+       int free_page_order;
+
+       spin_lock_irqsave(&zone->lock, flags);
+       del_page_from_free_list(free_page, zone, order);
+       for (pfn = free_page_pfn;
+            pfn < free_page_pfn + (1UL << order);) {
+               int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+               free_page_order = ffs(split_pfn_offset) - 1;
+               __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
+                               mt, FPI_NONE);
+               pfn += 1UL << free_page_order;
+               split_pfn_offset -= (1UL << free_page_order);
+               /* first part is done; what remains of the page is the second part */
+               if (split_pfn_offset == 0)
+                       split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
+       }
+       spin_unlock_irqrestore(&zone->lock, flags);
+}
  /*
   * A bad page could be due to a number of fields. Instead of multiple branches,
   * try and check multiple fields with one check. The caller must do a detailed
@@ -8951,7 +8988,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
  #endif
  
  /* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(struct compact_control *cc,
+int __alloc_contig_migrate_range(struct compact_control *cc,
                                         unsigned long start, unsigned long end)
  {
         /* This function is based on compact_zone() from compaction.c. */
@@ -9034,7 +9071,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
                        unsigned migratetype, gfp_t gfp_mask)
  {
         unsigned long outer_start, outer_end;
-       unsigned int order;
+       int order;
         int ret = 0;
  
         struct compact_control cc = {
@@ -9053,14 +9090,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
          * What we do here is we mark all pageblocks in range as
          * MIGRATE_ISOLATE.  Because pageblock and max order pages may
          * have different sizes, and due to the way page allocator
-        * work, we align the range to biggest of the two pages so
-        * that page allocator won't try to merge buddies from
-        * different pageblocks and change MIGRATE_ISOLATE to some
-        * other migration type.
+        * work, start_isolate_page_range() has special handling for this.
          *
          * Once the pageblocks are marked as MIGRATE_ISOLATE, we
          * migrate the pages from an unaligned range (ie. pages that
-        * we are interested in).  This will put all the pages in
+        * we are interested in). This will put all the pages in
          * range back to page allocator as MIGRATE_ISOLATE.
          *
          * When this is done, we take the pages in range from page
@@ -9074,9 +9108,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
          */
  
         ret = start_isolate_page_range(pfn_max_align_down(start),
-                                      pfn_max_align_up(end), migratetype, 0);
+                               pfn_max_align_up(end), migratetype, 0, gfp_mask);
         if (ret)
-               return ret;
+               goto done;
  
         drain_all_pages(cc.zone);
  
@@ -9096,7 +9130,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
         ret = 0;
  
         /*
-        * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+        * Pages from [start, end) are within pageblock_nr_pages
          * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
          * more, all pages in [start, end) are free in page allocator.
          * What we are going to do is to allocate all pages from
diff --git a/mm/page_isolation.c b/mm/page_isolation.c

index c2f7a8bb634d86da9b3731d7926c8cc05f8e37c3..8a0f16d2e4c358057f4f51485374806a8748e4df 100644 (file)
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -203,7 +203,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
         return -EBUSY;
  }
  
-static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+static void unset_migratetype_isolate(struct page *page, int migratetype)
  {
         struct zone *zone;
         unsigned long flags, nr_pages;
@@ -279,6 +279,166 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
         return NULL;
  }
  
+/**
+ * isolate_single_pageblock() -- tries to isolate a pageblock that might be
+ * within a free or in-use page.
+ * @boundary_pfn:              pageblock-aligned pfn that a page might cross
+ * @gfp_flags:                 GFP flags used for migrating pages
+ * @isolate_before:    isolate the pageblock before the boundary_pfn
+ *
+ * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
+ * pageblock. When not all pageblocks within a page are isolated at the same
+ * time, free page accounting can go wrong. For example, in the case of
+ * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pageblocks.
+ * [         MAX_ORDER-1         ]
+ * [  pageblock0  |  pageblock1  ]
+ * When either pageblock is isolated, a free page is not split into separate
+ * migratetype lists as it should be, and an in-use page freed later is not
+ * split by __free_one_page() either. The function handles this by splitting
+ * the free page, or by migrating the in-use page and then splitting the
+ * resulting free page. Returns 0 on success and -EBUSY otherwise.
+ */
+static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
+                       bool isolate_before)
+{
+       unsigned char saved_mt;
+       unsigned long start_pfn;
+       unsigned long isolate_pageblock;
+       unsigned long pfn;
+       struct zone *zone;
+
+       VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
+
+       if (isolate_before)
+               isolate_pageblock = boundary_pfn - pageblock_nr_pages;
+       else
+               isolate_pageblock = boundary_pfn;
+
+       /*
+        * Start scanning at the beginning of the MAX_ORDER_NR_PAGES aligned
+        * range to avoid isolating only a subset of the pageblocks of a free
+        * or in-use page bigger than a pageblock. Clamp to the zone start so
+        * that all scanned pageblocks are within the same zone.
+        */
+       zone  = page_zone(pfn_to_page(isolate_pageblock));
+       start_pfn  = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
+                                     zone->zone_start_pfn);
+
+       saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+       set_pageblock_migratetype(pfn_to_page(isolate_pageblock), MIGRATE_ISOLATE);
+
+       /*
+        * Bail out early when the to-be-isolated pageblock does not form
+        * a free or in-use page across boundary_pfn:
+        *
+        * 1. isolate before boundary_pfn: the page after is not online
+        * 2. isolate after boundary_pfn: the page before is not online
+        *
+        * This also ensures correctness. Without it, when isolating after
+        * boundary_pfn while [start_pfn, boundary_pfn) is not online,
+        * __first_valid_page() would return an unexpected NULL in the for
+        * loop below. The pageblock stays MIGRATE_ISOLATE on this path.
+        */
+       if (isolate_before) {
+               if (!pfn_to_online_page(boundary_pfn))
+                       return 0;
+       } else {
+               if (!pfn_to_online_page(boundary_pfn - 1))
+                       return 0;
+       }
+
+       for (pfn = start_pfn; pfn < boundary_pfn;) {
+               struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
+
+               VM_BUG_ON(!page);
+               pfn = page_to_pfn(page);
+               /*
+                * start_pfn is MAX_ORDER_NR_PAGES aligned (or the zone start),
+                * so the head page of any free page in [start_pfn,
+                * boundary_pfn) is always within the range.
+                */
+               if (PageBuddy(page)) {
+                       int order = buddy_order(page);
+
+                       if (pfn + (1UL << order) > boundary_pfn)
+                               split_free_page(page, order, boundary_pfn - pfn);
+                       pfn += (1UL << order);
+                       continue;
+               }
+               /*
+                * Migrate compound pages, then let the free page handling code
+                * above do the rest. If migration is not possible, just fail.
+                */
+               if (PageCompound(page)) {
+                       unsigned long nr_pages = compound_nr(page);
+                       struct page *head = compound_head(page);
+                       unsigned long head_pfn = page_to_pfn(head);
+
+                       if (head_pfn + nr_pages < boundary_pfn) {
+                               pfn = head_pfn + nr_pages;
+                               continue;
+                       }
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+                       /*
+                        * hugetlb, lru compound (THP), and movable compound pages
+                        * can be migrated. Otherwise, fail the isolation.
+                        */
+                       if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
+                               int order;
+                               unsigned long outer_pfn;
+                               int ret;
+                               struct compact_control cc = {
+                                       .nr_migratepages = 0,
+                                       .order = -1,
+                                       .zone = page_zone(pfn_to_page(head_pfn)),
+                                       .mode = MIGRATE_SYNC,
+                                       .ignore_skip_hint = true,
+                                       .no_set_skip_hint = true,
+                                       .gfp_mask = gfp_flags,
+                                       .alloc_contig = true,
+                               };
+                               INIT_LIST_HEAD(&cc.migratepages);
+
+                               ret = __alloc_contig_migrate_range(&cc, head_pfn,
+                                                       head_pfn + nr_pages);
+
+                               if (ret)
+                                       goto failed;
+                               /*
+                                * Reset pfn to the head of the free page, so
+                                * that the free page handling code above can split
+                                * the free page to the right migratetype list.
+                                *
+                                * head_pfn is not used here as a hugetlb page order
+                                * can be bigger than MAX_ORDER-1, but after it is
+                                * freed, the free page order is not. Use pfn within
+                                * the range to find the head of the free page.
+                                */
+                               order = 0;
+                               outer_pfn = pfn;
+                               while (!PageBuddy(pfn_to_page(outer_pfn))) {
+                                       if (++order >= MAX_ORDER) {
+                                               outer_pfn = pfn;
+                                               break;
+                                       }
+                                       outer_pfn &= ~0UL << order;
+                               }
+                               pfn = outer_pfn;
+                               continue;
+                       } else
+#endif
+                               goto failed;
+               }
+
+               pfn++;
+       }
+       return 0;
+failed:
+       /* restore the original migratetype of the pageblock marked above */
+       set_pageblock_migratetype(pfn_to_page(isolate_pageblock), saved_mt);
+       return -EBUSY;
+}
+
  /**
   * start_isolate_page_range() - make page-allocation-type of range of pages to
   * be MIGRATE_ISOLATE.
@@ -293,6 +453,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
   *                                      and PageOffline() pages.
   *                     REPORT_FAILURE - report details about the failure to
   *                     isolate the range
+ * @gfp_flags:         GFP flags used for migrating pages that sit across the
+ *                     range boundaries.
   *
   * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
   * the range will never be allocated. Any free pages and pages freed in the
@@ -301,6 +463,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
   * pages in the range finally, the caller have to free all pages in the range.
   * test_page_isolated() can be used for test it.
   *
+ * The function first tries to isolate the pageblocks at the beginning and end
+ * of the range, since there might be pages across the range boundaries.
+ * Afterwards, it isolates the rest of the range.
+ *
   * There is no high level synchronization mechanism that prevents two threads
   * from trying to isolate overlapping ranges. If this happens, one thread
   * will notice pageblocks in the overlapping range already set to isolate.
@@ -321,21 +487,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
   * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
   */
  int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-                            unsigned migratetype, int flags)
+                            int migratetype, int flags, gfp_t gfp_flags)
  {
         unsigned long pfn;
         struct page *page;
+       int ret;
  
         BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
         BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
  
-       for (pfn = start_pfn;
-            pfn < end_pfn;
+       /* isolate [start_pfn, start_pfn + pageblock_nr_pages) pageblock */
+       ret = isolate_single_pageblock(start_pfn, gfp_flags, false);
+       if (ret)
+               return ret;
+
+       /* isolate [end_pfn - pageblock_nr_pages, end_pfn) pageblock */
+       ret = isolate_single_pageblock(end_pfn, gfp_flags, true);
+       if (ret) {
+               unset_migratetype_isolate(pfn_to_page(start_pfn), migratetype);
+               return ret;
+       }
+
+       /* skip isolated pageblocks at the beginning and end */
+       for (pfn = start_pfn + pageblock_nr_pages;
+            pfn < end_pfn - pageblock_nr_pages;
              pfn += pageblock_nr_pages) {
                 page = __first_valid_page(pfn, pageblock_nr_pages);
                 if (page && set_migratetype_isolate(page, migratetype, flags,
                                         start_pfn, end_pfn)) {
                         undo_isolate_page_range(start_pfn, pfn, migratetype);
+                       unset_migratetype_isolate(
+                               pfn_to_page(end_pfn - pageblock_nr_pages),
+                               migratetype);
                         return -EBUSY;
                 }
         }
@@ -346,7 +529,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
   * Make isolated pages available again.
   */
  void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-                           unsigned migratetype)
+                           int migratetype)
  {
         unsigned long pfn;
         struct page *page;
author	Zi Yan <ziy@nvidia.com>
	Fri, 13 May 2022 03:22:58 +0000 (20:22 -0700)
committer	Andrew Morton <akpm@linux-foundation.org>
	Fri, 13 May 2022 14:20:13 +0000 (07:20 -0700)
include/linux/page-isolation.h		patch \| blob \| history
mm/internal.h		patch \| blob \| history
mm/memory_hotplug.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/page_isolation.c		patch \| blob \| history