mm: workingset: tell cache transitions from workingset thrashing

author Johannes Weiner <hannes@cmpxchg.org>

Fri, 26 Oct 2018 22:06:04 +0000 (15:06 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 26 Oct 2018 23:26:32 +0000 (16:26 -0700)
author Johannes Weiner <hannes@cmpxchg.org>
Fri, 26 Oct 2018 22:06:04 +0000 (15:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Oct 2018 23:26:32 +0000 (16:26 -0700)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 7bbeba21f6a3f3c52ff8f952d0395a2df65be67f..ba51d5bf7af106aa5215ed103e08d3ea92a63c3e 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -163,6 +163,7 @@ enum node_stat_item {
         NR_ISOLATED_FILE,       /* Temporary isolated pages from file lru */
         WORKINGSET_REFAULT,
         WORKINGSET_ACTIVATE,
+       WORKINGSET_RESTORE,
         WORKINGSET_NODERECLAIM,
         NR_ANON_MAPPED, /* Mapped anonymous pages */
         NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h

index 74bee8cecf4ce864322157d3e93defe200ec0173..4d99504f649604126c7a0c3b5229db7dfb2366f3 100644 (file)
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -69,13 +69,14 @@
   */
  enum pageflags {
         PG_locked,              /* Page is locked. Don't touch. */
-       PG_error,
         PG_referenced,
         PG_uptodate,
         PG_dirty,
         PG_lru,
         PG_active,
+       PG_workingset,
         PG_waiters,             /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
+       PG_error,
         PG_slab,
         PG_owner_priv_1,        /* Owner use. If pagecache, fs may use*/
         PG_arch_1,
@@ -280,6 +281,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
  PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
  PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
         TESTCLEARFLAG(Active, active, PF_HEAD)
+PAGEFLAG(Workingset, workingset, PF_HEAD)
+       TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
  __PAGEFLAG(Slab, slab, PF_NO_TAIL)
  __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
  PAGEFLAG(Checked, checked, PF_NO_COMPOUND)        /* Used by some filesystems */
diff --git a/include/linux/swap.h b/include/linux/swap.h

index 8e2c11e692baf9bba6380d63888cfbf5a206cf0b..b93740d72e78eb5cbea0b53cdbb29b24348d3bd3 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -296,7 +296,7 @@ struct vma_swap_readahead {
  
  /* linux/mm/workingset.c */
  void *workingset_eviction(struct address_space *mapping, struct page *page);
-bool workingset_refault(void *shadow);
+void workingset_refault(struct page *page, void *shadow);
  void workingset_activation(struct page *page);
  
  /* Do not use directly, use workingset_lookup_update */
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h

index a81cffb76d890185f09ddcf08496a7dd432233d6..a1675d43777e8feb9165401af1b7c4a26d76f306 100644 (file)
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -88,6 +88,7 @@
         {1UL << PG_dirty,               "dirty"         },              \
         {1UL << PG_lru,                 "lru"           },              \
         {1UL << PG_active,              "active"        },              \
+       {1UL << PG_workingset,          "workingset"    },              \
         {1UL << PG_slab,                "slab"          },              \
         {1UL << PG_owner_priv_1,        "owner_priv_1"  },              \
         {1UL << PG_arch_1,              "arch_1"        },              \
diff --git a/mm/filemap.c b/mm/filemap.c

index de6fed2a08156c065f21e4161a5786f7cde71310..7997adce5a29a2a515107c06d1ecadefd2151472 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -915,12 +915,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                  * data from the working set, only to cache data that will
                  * get overwritten with something else, is a waste of memory.
                  */
-               if (!(gfp_mask & __GFP_WRITE) &&
-                   shadow && workingset_refault(shadow)) {
-                       SetPageActive(page);
-                       workingset_activation(page);
-               } else
-                       ClearPageActive(page);
+               WARN_ON_ONCE(PageActive(page));
+               if (!(gfp_mask & __GFP_WRITE) && shadow)
+                       workingset_refault(page, shadow);
                 lru_cache_add(page);
         }
         return ret;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index deed97fba97976da9e79da8b45fb7e09eee9efa5..8ea1b36bd452d795da8ac2714634c6bc70a45b99 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2369,6 +2369,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
                          (1L << PG_mlocked) |
                          (1L << PG_uptodate) |
                          (1L << PG_active) |
+                        (1L << PG_workingset) |
                          (1L << PG_locked) |
                          (1L << PG_unevictable) |
                          (1L << PG_dirty)));
diff --git a/mm/migrate.c b/mm/migrate.c

index 84381b55b2bd5c535bd181b7670a69f37bb084a4..1ea27b343ccd1add7fd798fd46bd99a988af25b7 100644 (file)
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -685,6 +685,8 @@ void migrate_page_states(struct page *newpage, struct page *page)
                 SetPageActive(newpage);
         } else if (TestClearPageUnevictable(page))
                 SetPageUnevictable(newpage);
+       if (PageWorkingset(page))
+               SetPageWorkingset(newpage);
         if (PageChecked(page))
                 SetPageChecked(newpage);
         if (PageMappedToDisk(page))
diff --git a/mm/swap_state.c b/mm/swap_state.c

index ecee9c6c4cc1725028fd723452f3655c0799a8a7..0d6a7f268d2e6c1abe6fb550a9e14206b26188ff 100644 (file)
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -448,6 +448,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                         /*
                          * Initiate read into locked page and return.
                          */
+                       SetPageWorkingset(new_page);
                         lru_cache_add_anon(new_page);
                         *new_page_allocated = true;
                         return new_page;
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 961401c46334cf815b1d92d7755314577aabd3e2..87e9fef341d2d05638958771b4840aba8a18ef2a 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2145,6 +2145,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
                 }
  
                 ClearPageActive(page);  /* we are de-activating */
+               SetPageWorkingset(page);
                 list_add(&page->lru, &l_inactive);
         }
  
diff --git a/mm/vmstat.c b/mm/vmstat.c

index 2cec2fa4c8aeda07a05ccee1374c9f456d59785b..d918f6192d15ab857dd42748e81ae79d42be797c 100644 (file)
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1145,6 +1145,7 @@ const char * const vmstat_text[] = {
         "nr_isolated_file",
         "workingset_refault",
         "workingset_activate",
+       "workingset_restore",
         "workingset_nodereclaim",
         "nr_anon_pages",
         "nr_mapped",
diff --git a/mm/workingset.c b/mm/workingset.c

index 7d5fa0dd2b38047ac60a3339bc8179ee0e8b3583..99b7f7c09b13850198894d819f3f26252adcffae 100644 (file)
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -121,7 +121,7 @@
   * the only thing eating into inactive list space is active pages.
   *
   *
- *             Activating refaulting pages
+ *             Refaulting inactive pages
   *
   * All that is known about the active list is that the pages have been
   * accessed more than once in the past.  This means that at any given
@@ -134,6 +134,10 @@
   * used less frequently than the refaulting page - or even not used at
   * all anymore.
   *
+ * That means if inactive cache is refaulting with a suitable refault
+ * distance, we assume the cache workingset is transitioning and put
+ * pressure on the current active list.
+ *
   * If this is wrong and demotion kicks in, the pages which are truly
   * used more frequently will be reactivated while the less frequently
   * used once will be evicted from memory.
@@ -141,6 +145,14 @@
   * But if this is right, the stale pages will be pushed out of memory
   * and the used pages get to stay in cache.
   *
+ *             Refaulting active pages
+ *
+ * If on the other hand the refaulting pages have recently been
+ * deactivated, it means that the active list is no longer protecting
+ * actively used cache from reclaim. The cache is NOT transitioning to
+ * a different workingset; the existing workingset is thrashing in the
+ * space allocated to the page cache.
+ *
   *
   *             Implementation
   *
@@ -156,8 +168,7 @@
   */
  
  #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
-                        NODES_SHIFT +  \
-                        MEM_CGROUP_ID_SHIFT)
+                        1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
  #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
  
  /*
@@ -170,23 +181,28 @@
   */
  static unsigned int bucket_order __read_mostly;
  
-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+                        bool workingset)
  {
         eviction >>= bucket_order;
         eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
         eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+       eviction = (eviction << 1) | workingset;
         eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
  
         return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
  }
  
  static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
-                         unsigned long *evictionp)
+                         unsigned long *evictionp, bool *workingsetp)
  {
         unsigned long entry = (unsigned long)shadow;
         int memcgid, nid;
+       bool workingset;
  
         entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+       workingset = entry & 1;
+       entry >>= 1;
         nid = entry & ((1UL << NODES_SHIFT) - 1);
         entry >>= NODES_SHIFT;
         memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
         *memcgidp = memcgid;
         *pgdat = NODE_DATA(nid);
         *evictionp = entry << bucket_order;
+       *workingsetp = workingset;
  }
  
  /**
@@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
   */
  void *workingset_eviction(struct address_space *mapping, struct page *page)
  {
-       struct mem_cgroup *memcg = page_memcg(page);
         struct pglist_data *pgdat = page_pgdat(page);
+       struct mem_cgroup *memcg = page_memcg(page);
         int memcgid = mem_cgroup_id(memcg);
         unsigned long eviction;
         struct lruvec *lruvec;
@@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
  
         lruvec = mem_cgroup_lruvec(pgdat, memcg);
         eviction = atomic_long_inc_return(&lruvec->inactive_age);
-       return pack_shadow(memcgid, pgdat, eviction);
+       return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
  }
  
  /**
   * workingset_refault - evaluate the refault of a previously evicted page
+ * @page: the freshly allocated replacement page
   * @shadow: shadow entry of the evicted page
   *
   * Calculates and evaluates the refault distance of the previously
   * evicted page in the context of the node it was allocated in.
- *
- * Returns %true if the page should be activated, %false otherwise.
   */
-bool workingset_refault(void *shadow)
+void workingset_refault(struct page *page, void *shadow)
  {
         unsigned long refault_distance;
+       struct pglist_data *pgdat;
         unsigned long active_file;
         struct mem_cgroup *memcg;
         unsigned long eviction;
         struct lruvec *lruvec;
         unsigned long refault;
-       struct pglist_data *pgdat;
+       bool workingset;
         int memcgid;
  
-       unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
+       unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
  
         rcu_read_lock();
         /*
@@ -263,41 +280,51 @@ bool workingset_refault(void *shadow)
          * configurations instead.
          */
         memcg = mem_cgroup_from_id(memcgid);
-       if (!mem_cgroup_disabled() && !memcg) {
-               rcu_read_unlock();
-               return false;
-       }
+       if (!mem_cgroup_disabled() && !memcg)
+               goto out;
         lruvec = mem_cgroup_lruvec(pgdat, memcg);
         refault = atomic_long_read(&lruvec->inactive_age);
         active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
  
         /*
-        * The unsigned subtraction here gives an accurate distance
-        * across inactive_age overflows in most cases.
+        * Calculate the refault distance
          *
-        * There is a special case: usually, shadow entries have a
-        * short lifetime and are either refaulted or reclaimed along
-        * with the inode before they get too old.  But it is not
-        * impossible for the inactive_age to lap a shadow entry in
-        * the field, which can then can result in a false small
-        * refault distance, leading to a false activation should this
-        * old entry actually refault again.  However, earlier kernels
-        * used to deactivate unconditionally with *every* reclaim
-        * invocation for the longest time, so the occasional
-        * inappropriate activation leading to pressure on the active
-        * list is not a problem.
+        * The unsigned subtraction here gives an accurate distance
+        * across inactive_age overflows in most cases. There is a
+        * special case: usually, shadow entries have a short lifetime
+        * and are either refaulted or reclaimed along with the inode
+        * before they get too old.  But it is not impossible for the
+        * inactive_age to lap a shadow entry in the field, which can
+        * then result in a false small refault distance, leading to a
+        * false activation should this old entry actually refault
+        * again.  However, earlier kernels used to deactivate
+        * unconditionally with *every* reclaim invocation for the
+        * longest time, so the occasional inappropriate activation
+        * leading to pressure on the active list is not a problem.
          */
         refault_distance = (refault - eviction) & EVICTION_MASK;
  
         inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
  
-       if (refault_distance <= active_file) {
-               inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
-               rcu_read_unlock();
-               return true;
+       /*
+        * Compare the distance to the existing workingset size. We
+        * don't act on pages that couldn't stay resident even if all
+        * the memory was available to the page cache.
+        */
+       if (refault_distance > active_file)
+               goto out;
+
+       SetPageActive(page);
+       atomic_long_inc(&lruvec->inactive_age);
+       inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+
+       /* Page was active prior to eviction */
+       if (workingset) {
+               SetPageWorkingset(page);
+               inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
         }
+out:
         rcu_read_unlock();
-       return false;
  }
  
  /**
author	Johannes Weiner <hannes@cmpxchg.org>
	Fri, 26 Oct 2018 22:06:04 +0000 (15:06 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 26 Oct 2018 23:26:32 +0000 (16:26 -0700)
include/linux/mmzone.h		patch | blob | history
include/linux/page-flags.h		patch | blob | history
include/linux/swap.h		patch | blob | history
include/trace/events/mmflags.h		patch | blob | history
mm/filemap.c		patch | blob | history
mm/huge_memory.c		patch | blob | history
mm/migrate.c		patch | blob | history
mm/swap_state.c		patch | blob | history
mm/vmscan.c		patch | blob | history
mm/vmstat.c		patch | blob | history
mm/workingset.c		patch | blob | history