hugetlb: add nodemask arg to huge page alloc, free and surplus adjust functions

author Lee Schermerhorn <lee.schermerhorn@hp.com>

Tue, 15 Dec 2009 01:58:16 +0000 (17:58 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 15 Dec 2009 16:53:12 +0000 (08:53 -0800)
author Lee Schermerhorn <lee.schermerhorn@hp.com>
Tue, 15 Dec 2009 01:58:16 +0000 (17:58 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 15 Dec 2009 16:53:12 +0000 (08:53 -0800)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index bffcf774f60b6dd261a541318720ceff9379a4b4..324d1abae8768f829f361489d1e92b1efb7a401c 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -622,48 +622,56 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
  }
  
  /*
- * common helper function for hstate_next_node_to_{alloc|free}.
- * return next node in node_online_map, wrapping at end.
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_nid_to_{alloc|free} might
+ * be outside of *nodes_allowed.  Ensure that we use an allowed
+ * node for alloc or free.
   */
-static int next_node_allowed(int nid)
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
  {
-       nid = next_node(nid, node_online_map);
+       nid = next_node(nid, *nodes_allowed);
         if (nid == MAX_NUMNODES)
-               nid = first_node(node_online_map);
+               nid = first_node(*nodes_allowed);
         VM_BUG_ON(nid >= MAX_NUMNODES);
  
         return nid;
  }
  
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+       if (!node_isset(nid, *nodes_allowed))
+               nid = next_node_allowed(nid, nodes_allowed);
+       return nid;
+}
+
  /*
- * Use a helper variable to find the next node and then
- * copy it back to next_nid_to_alloc afterwards:
- * otherwise there's a window in which a racer might
- * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
- * But we don't need to use a spin_lock here: it really
- * doesn't matter if occasionally a racer chooses the
- * same nid as we do.  Move nid forward in the mask even
- * if we just successfully allocated a hugepage so that
- * the next caller gets hugepages on the next node.
+ * Returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool, and advances the
+ * saved next node from which to allocate, handling wrap at the end
+ * of the node mask.
   */
-static int hstate_next_node_to_alloc(struct hstate *h)
+static int hstate_next_node_to_alloc(struct hstate *h,
+                                       nodemask_t *nodes_allowed)
  {
-       int nid, next_nid;
+       int nid;
+
+       VM_BUG_ON(!nodes_allowed);
+
+       nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+       h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
  
-       nid = h->next_nid_to_alloc;
-       next_nid = next_node_allowed(nid);
-       h->next_nid_to_alloc = next_nid;
         return nid;
  }
  
-static int alloc_fresh_huge_page(struct hstate *h)
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
  {
         struct page *page;
         int start_nid;
         int next_nid;
         int ret = 0;
  
-       start_nid = hstate_next_node_to_alloc(h);
+       start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
         next_nid = start_nid;
  
         do {
@@ -672,7 +680,7 @@ static int alloc_fresh_huge_page(struct hstate *h)
                         ret = 1;
                         break;
                 }
-               next_nid = hstate_next_node_to_alloc(h);
+               next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
         } while (next_nid != start_nid);
  
         if (ret)
@@ -684,18 +692,20 @@ static int alloc_fresh_huge_page(struct hstate *h)
  }
  
  /*
- * helper for free_pool_huge_page() - return the next node
- * from which to free a huge page.  Advance the next node id
- * whether or not we find a free huge page to free so that the
- * next attempt to free addresses the next node.
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page.  Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
   */
-static int hstate_next_node_to_free(struct hstate *h)
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
  {
-       int nid, next_nid;
+       int nid;
+
+       VM_BUG_ON(!nodes_allowed);
+
+       nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+       h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
  
-       nid = h->next_nid_to_free;
-       next_nid = next_node_allowed(nid);
-       h->next_nid_to_free = next_nid;
         return nid;
  }
  
@@ -705,13 +715,14 @@ static int hstate_next_node_to_free(struct hstate *h)
   * balanced over allowed nodes.
   * Called with hugetlb_lock locked.
   */
-static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+                                                        bool acct_surplus)
  {
         int start_nid;
         int next_nid;
         int ret = 0;
  
-       start_nid = hstate_next_node_to_free(h);
+       start_nid = hstate_next_node_to_free(h, nodes_allowed);
         next_nid = start_nid;
  
         do {
@@ -735,7 +746,7 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
                         ret = 1;
                         break;
                 }
-               next_nid = hstate_next_node_to_free(h);
+               next_nid = hstate_next_node_to_free(h, nodes_allowed);
         } while (next_nid != start_nid);
  
         return ret;
@@ -937,7 +948,7 @@ static void return_unused_surplus_pages(struct hstate *h,
          * on-line nodes for us and will handle the hstate accounting.
          */
         while (nr_pages--) {
-               if (!free_pool_huge_page(h, 1))
+               if (!free_pool_huge_page(h, &node_online_map, 1))
                         break;
         }
  }
@@ -1047,7 +1058,8 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
                 void *addr;
  
                 addr = __alloc_bootmem_node_nopanic(
-                               NODE_DATA(hstate_next_node_to_alloc(h)),
+                               NODE_DATA(hstate_next_node_to_alloc(h,
+                                                       &node_online_map)),
                                 huge_page_size(h), huge_page_size(h), 0);
  
                 if (addr) {
@@ -1102,7 +1114,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
                 if (h->order >= MAX_ORDER) {
                         if (!alloc_bootmem_huge_page(h))
                                 break;
-               } else if (!alloc_fresh_huge_page(h))
+               } else if (!alloc_fresh_huge_page(h, &node_online_map))
                         break;
         }
         h->max_huge_pages = i;
@@ -1144,14 +1156,15 @@ static void __init report_hugepages(void)
  }
  
  #ifdef CONFIG_HIGHMEM
-static void try_to_free_low(struct hstate *h, unsigned long count)
+static void try_to_free_low(struct hstate *h, unsigned long count,
+                                               nodemask_t *nodes_allowed)
  {
         int i;
  
         if (h->order >= MAX_ORDER)
                 return;
  
-       for (i = 0; i < MAX_NUMNODES; ++i) {
+       for_each_node_mask(i, *nodes_allowed) {
                 struct page *page, *next;
                 struct list_head *freel = &h->hugepage_freelists[i];
                 list_for_each_entry_safe(page, next, freel, lru) {
@@ -1167,7 +1180,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
         }
  }
  #else
-static inline void try_to_free_low(struct hstate *h, unsigned long count)
+static inline void try_to_free_low(struct hstate *h, unsigned long count,
+                                               nodemask_t *nodes_allowed)
  {
  }
  #endif
@@ -1177,7 +1191,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
   * balanced by operating on them in a round-robin fashion.
   * Returns 1 if an adjustment was made.
   */
-static int adjust_pool_surplus(struct hstate *h, int delta)
+static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
+                               int delta)
  {
         int start_nid, next_nid;
         int ret = 0;
@@ -1185,9 +1200,9 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
         VM_BUG_ON(delta != -1 && delta != 1);
  
         if (delta < 0)
-               start_nid = hstate_next_node_to_alloc(h);
+               start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
         else
-               start_nid = hstate_next_node_to_free(h);
+               start_nid = hstate_next_node_to_free(h, nodes_allowed);
         next_nid = start_nid;
  
         do {
@@ -1197,7 +1212,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
                          * To shrink on this node, there must be a surplus page
                          */
                         if (!h->surplus_huge_pages_node[nid]) {
-                               next_nid = hstate_next_node_to_alloc(h);
+                               next_nid = hstate_next_node_to_alloc(h,
+                                                               nodes_allowed);
                                 continue;
                         }
                 }
@@ -1207,7 +1223,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
                          */
                         if (h->surplus_huge_pages_node[nid] >=
                                                 h->nr_huge_pages_node[nid]) {
-                               next_nid = hstate_next_node_to_free(h);
+                               next_nid = hstate_next_node_to_free(h,
+                                                               nodes_allowed);
                                 continue;
                         }
                 }
@@ -1222,7 +1239,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
  }
  
  #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
+static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
+                                               nodemask_t *nodes_allowed)
  {
         unsigned long min_count, ret;
  
@@ -1242,7 +1260,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
          */
         spin_lock(&hugetlb_lock);
         while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
-               if (!adjust_pool_surplus(h, -1))
+               if (!adjust_pool_surplus(h, nodes_allowed, -1))
                         break;
         }
  
@@ -1253,7 +1271,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
                  * and reducing the surplus.
                  */
                 spin_unlock(&hugetlb_lock);
-               ret = alloc_fresh_huge_page(h);
+               ret = alloc_fresh_huge_page(h, nodes_allowed);
                 spin_lock(&hugetlb_lock);
                 if (!ret)
                         goto out;
@@ -1277,13 +1295,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
          */
         min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
         min_count = max(count, min_count);
-       try_to_free_low(h, min_count);
+       try_to_free_low(h, min_count, nodes_allowed);
         while (min_count < persistent_huge_pages(h)) {
-               if (!free_pool_huge_page(h, 0))
+               if (!free_pool_huge_page(h, nodes_allowed, 0))
                         break;
         }
         while (count < persistent_huge_pages(h)) {
-               if (!adjust_pool_surplus(h, 1))
+               if (!adjust_pool_surplus(h, nodes_allowed, 1))
                         break;
         }
  out:
@@ -1329,7 +1347,7 @@ static ssize_t nr_hugepages_store(struct kobject *kobj,
         if (err)
                 return 0;
  
-       h->max_huge_pages = set_max_huge_pages(h, input);
+       h->max_huge_pages = set_max_huge_pages(h, input, &node_online_map);
  
         return count;
  }
@@ -1571,7 +1589,8 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
         proc_doulongvec_minmax(table, write, buffer, length, ppos);
  
         if (write)
-               h->max_huge_pages = set_max_huge_pages(h, tmp);
+               h->max_huge_pages = set_max_huge_pages(h, tmp,
+                                                       &node_online_map);
  
         return 0;
  }
author	Lee Schermerhorn <lee.schermerhorn@hp.com>
	Tue, 15 Dec 2009 01:58:16 +0000 (17:58 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 15 Dec 2009 16:53:12 +0000 (08:53 -0800)