mm: remove per-zone hashtable of bitlock waitqueues

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 26 Oct 2016 17:15:30 +0000 (10:15 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 27 Oct 2016 16:27:57 +0000 (09:27 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 26 Oct 2016 17:15:30 +0000 (10:15 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 27 Oct 2016 16:27:57 +0000 (09:27 -0700)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 7f2ae99e5daf39406fa5b166f6a1a75c464d1fcd..0f088f3a2fed8df6435819a52360940310272462 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -440,33 +440,7 @@ struct zone {
         seqlock_t               span_seqlock;
  #endif
  
-       /*
-        * wait_table           -- the array holding the hash table
-        * wait_table_hash_nr_entries   -- the size of the hash table array
-        * wait_table_bits      -- wait_table_size == (1 << wait_table_bits)
-        *
-        * The purpose of all these is to keep track of the people
-        * waiting for a page to become available and make them
-        * runnable again when possible. The trouble is that this
-        * consumes a lot of space, especially when so few things
-        * wait on pages at a given time. So instead of using
-        * per-page waitqueues, we use a waitqueue hash table.
-        *
-        * The bucket discipline is to sleep on the same queue when
-        * colliding and wake all in that wait queue when removing.
-        * When something wakes, it must check to be sure its page is
-        * truly available, a la thundering herd. The cost of a
-        * collision is great, but given the expected load of the
-        * table, they should be so rare as to be outweighed by the
-        * benefits from the saved space.
-        *
-        * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
-        * primary users of these fields, and in mm/page_alloc.c
-        * free_area_init_core() performs the initialization of them.
-        */
-       wait_queue_head_t       *wait_table;
-       unsigned long           wait_table_hash_nr_entries;
-       unsigned long           wait_table_bits;
+       int initialized;
  
         /* Write-intensive fields used from the page allocator */
         ZONE_PADDING(_pad1_)
@@ -546,7 +520,7 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
  
  static inline bool zone_is_initialized(struct zone *zone)
  {
-       return !!zone->wait_table;
+       return zone->initialized;
  }
  
  static inline bool zone_is_empty(struct zone *zone)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 94732d1ab00ab9d9afdebad41642e279249776cb..42d4027f9e2656b763aa3050e38c2ae1803e1890 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly;
  DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
  DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
  
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+       const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+       unsigned long val = (unsigned long)word << shift | bit;
+
+       return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
  void __init sched_init(void)
  {
         int i, j;
         unsigned long alloc_size = 0, ptr;
  
+       for (i = 0; i < WAIT_TABLE_SIZE; i++)
+               init_waitqueue_head(bit_wait_table + i);
+
  #ifdef CONFIG_FAIR_GROUP_SCHED
         alloc_size += 2 * nr_cpu_ids * sizeof(void **);
  #endif
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c

index 4f7053579fe3c9e473bfb9854f959a874e9566ea..9453efe9b25a64bd4aefceae8e5dffef54df64f2 100644 (file)
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit)
  }
  EXPORT_SYMBOL(wake_up_bit);
  
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
-       const int shift = BITS_PER_LONG == 32 ? 5 : 6;
-       const struct zone *zone = page_zone(virt_to_page(word));
-       unsigned long val = (unsigned long)word << shift | bit;
-
-       return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
  /*
   * Manipulate the atomic_t address to produce a better bit waitqueue table hash
   * index (we're keying off bit -1, but that would produce a horrible hash
diff --git a/mm/filemap.c b/mm/filemap.c

index 849f459ad0780e27bc256ff13fd52fa8c9007661..c7fe2f16503f9f906e3f01be0155818ef86c522e 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc);
   */
  wait_queue_head_t *page_waitqueue(struct page *page)
  {
-       const struct zone *zone = page_zone(page);
-
-       return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
+       return bit_waitqueue(page, 0);
  }
  EXPORT_SYMBOL(page_waitqueue);
  
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c

index 962927309b6e963fa05a0acf09608c2ec6cef761..b18dab401be6e9e82bc213568a2bd7f6af378389 100644 (file)
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
         unsigned long i, pfn, end_pfn, nr_pages;
         int node = pgdat->node_id;
         struct page *page;
-       struct zone *zone;
  
         nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
         page = virt_to_page(pgdat);
@@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
         for (i = 0; i < nr_pages; i++, page++)
                 get_page_bootmem(node, page, NODE_INFO);
  
-       zone = &pgdat->node_zones[0];
-       for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
-               if (zone_is_initialized(zone)) {
-                       nr_pages = zone->wait_table_hash_nr_entries
-                               * sizeof(wait_queue_head_t);
-                       nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
-                       page = virt_to_page(zone->wait_table);
-
-                       for (i = 0; i < nr_pages; i++, page++)
-                               get_page_bootmem(node, page, NODE_INFO);
-               }
-       }
-
         pfn = pgdat->node_start_pfn;
         end_pfn = pgdat_end_pfn(pgdat);
  
@@ -2158,20 +2144,6 @@ void try_offline_node(int nid)
          */
         node_set_offline(nid);
         unregister_one_node(nid);
-
-       /* free waittable in each zone */
-       for (i = 0; i < MAX_NR_ZONES; i++) {
-               struct zone *zone = pgdat->node_zones + i;
-
-               /*
-                * wait_table may be allocated from boot memory,
-                * here only free if it's allocated by vmalloc.
-                */
-               if (is_vmalloc_addr(zone->wait_table)) {
-                       vfree(zone->wait_table);
-                       zone->wait_table = NULL;
-               }
-       }
  }
  EXPORT_SYMBOL(try_offline_node);
  
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 2b3bf6767d5441a876890dce16d9de2d60f1e2bc..de7c6e43b1c96e06202f8b8bf3fae901111b2027 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4976,72 +4976,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
  #endif
  }
  
-/*
- * Helper functions to size the waitqueue hash table.
- * Essentially these want to choose hash table sizes sufficiently
- * large so that collisions trying to wait on pages are rare.
- * But in fact, the number of active page waitqueues on typical
- * systems is ridiculously low, less than 200. So this is even
- * conservative, even though it seems large.
- *
- * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
- * waitqueues, i.e. the size of the waitq table given the number of pages.
- */
-#define PAGES_PER_WAITQUEUE    256
-
-#ifndef CONFIG_MEMORY_HOTPLUG
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
-       unsigned long size = 1;
-
-       pages /= PAGES_PER_WAITQUEUE;
-
-       while (size < pages)
-               size <<= 1;
-
-       /*
-        * Once we have dozens or even hundreds of threads sleeping
-        * on IO we've got bigger problems than wait queue collision.
-        * Limit the size of the wait table to a reasonable size.
-        */
-       size = min(size, 4096UL);
-
-       return max(size, 4UL);
-}
-#else
-/*
- * A zone's size might be changed by hot-add, so it is not possible to determine
- * a suitable size for its wait_table.  So we use the maximum size now.
- *
- * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
- *
- *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
- *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
- *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
- *
- * The maximum entries are prepared when a zone's memory is (512K + 256) pages
- * or more by the traditional way. (See above).  It equals:
- *
- *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
- *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
- *    powerpc (64K page size)             : =  (32G +16M)byte.
- */
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
-       return 4096UL;
-}
-#endif
-
-/*
- * This is an integer logarithm so that shifts can be used later
- * to extract the more random high bits from the multiplicative
- * hash function before the remainder is taken.
- */
-static inline unsigned long wait_table_bits(unsigned long size)
-{
-       return ffz(~size);
-}
-
  /*
   * Initially all pages are reserved - free ones are freed
   * up by free_all_bootmem() once the early boot process is
@@ -5304,49 +5238,6 @@ void __init setup_per_cpu_pageset(void)
                         alloc_percpu(struct per_cpu_nodestat);
  }
  
-static noinline __ref
-int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
-{
-       int i;
-       size_t alloc_size;
-
-       /*
-        * The per-page waitqueue mechanism uses hashed waitqueues
-        * per zone.
-        */
-       zone->wait_table_hash_nr_entries =
-                wait_table_hash_nr_entries(zone_size_pages);
-       zone->wait_table_bits =
-               wait_table_bits(zone->wait_table_hash_nr_entries);
-       alloc_size = zone->wait_table_hash_nr_entries
-                                       * sizeof(wait_queue_head_t);
-
-       if (!slab_is_available()) {
-               zone->wait_table = (wait_queue_head_t *)
-                       memblock_virt_alloc_node_nopanic(
-                               alloc_size, zone->zone_pgdat->node_id);
-       } else {
-               /*
-                * This case means that a zone whose size was 0 gets new memory
-                * via memory hot-add.
-                * But it may be the case that a new node was hot-added.  In
-                * this case vmalloc() will not be able to use this new node's
-                * memory - this wait_table must be initialized to use this new
-                * node itself as well.
-                * To use this new node's memory, further consideration will be
-                * necessary.
-                */
-               zone->wait_table = vmalloc(alloc_size);
-       }
-       if (!zone->wait_table)
-               return -ENOMEM;
-
-       for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
-               init_waitqueue_head(zone->wait_table + i);
-
-       return 0;
-}
-
  static __meminit void zone_pcp_init(struct zone *zone)
  {
         /*
@@ -5367,10 +5258,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
                                         unsigned long size)
  {
         struct pglist_data *pgdat = zone->zone_pgdat;
-       int ret;
-       ret = zone_wait_table_init(zone, size);
-       if (ret)
-               return ret;
+
         pgdat->nr_zones = zone_idx(zone) + 1;
  
         zone->zone_start_pfn = zone_start_pfn;
@@ -5382,6 +5270,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
                         zone_start_pfn, (zone_start_pfn + size));
  
         zone_init_free_lists(zone);
+       zone->initialized = 1;
  
         return 0;
  }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 26 Oct 2016 17:15:30 +0000 (10:15 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 27 Oct 2016 16:27:57 +0000 (09:27 -0700)
include/linux/mmzone.h		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/wait.c		patch \| blob \| history
mm/filemap.c		patch \| blob \| history
mm/memory_hotplug.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history