vfs: keep inodes with page cache off the inode shrinker LRU

author Johannes Weiner <hannes@cmpxchg.org>

Tue, 9 Nov 2021 02:31:24 +0000 (18:31 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 9 Nov 2021 18:02:48 +0000 (10:02 -0800)
author Johannes Weiner <hannes@cmpxchg.org>
Tue, 9 Nov 2021 02:31:24 +0000 (18:31 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 9 Nov 2021 18:02:48 +0000 (10:02 -0800)
diff --git a/fs/inode.c b/fs/inode.c

index ed0cab8a32db109c6851ddcf567d8de83d1c5c22..a49695f57e1eaa2f047c93b77f40f9269ea32aa5 100644 (file)
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -428,11 +428,20 @@ void ihold(struct inode *inode)
  }
  EXPORT_SYMBOL(ihold);
  
-static void inode_lru_list_add(struct inode *inode)
+static void __inode_add_lru(struct inode *inode, bool rotate)
  {
+       if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
+               return;
+       if (atomic_read(&inode->i_count))
+               return;
+       if (!(inode->i_sb->s_flags & SB_ACTIVE))
+               return;
+       if (!mapping_shrinkable(&inode->i_data))
+               return;
+
         if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
                 this_cpu_inc(nr_unused);
-       else
+       else if (rotate)
                 inode->i_state |= I_REFERENCED;
  }
  
@@ -443,16 +452,11 @@ static void inode_lru_list_add(struct inode *inode)
   */
  void inode_add_lru(struct inode *inode)
  {
-       if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
-                               I_FREEING | I_WILL_FREE)) &&
-           !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
-               inode_lru_list_add(inode);
+       __inode_add_lru(inode, false);
  }
  
-
  static void inode_lru_list_del(struct inode *inode)
  {
-
         if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
                 this_cpu_dec(nr_unused);
  }
@@ -728,10 +732,6 @@ again:
  /*
   * Isolate the inode from the LRU in preparation for freeing it.
   *
- * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed.  If the inode has metadata buffers attached to
- * mapping->private_list then try to remove them.
- *
   * If the inode has the I_REFERENCED flag set, then it means that it has been
   * used recently - the flag is set in iput_final(). When we encounter such an
   * inode, clear the flag and move it to the back of the LRU so it gets another
@@ -747,31 +747,39 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
         struct inode    *inode = container_of(item, struct inode, i_lru);
  
         /*
-        * we are inverting the lru lock/inode->i_lock here, so use a trylock.
-        * If we fail to get the lock, just skip it.
+        * We are inverting the lru lock/inode->i_lock here, so use a
+        * trylock. If we fail to get the lock, just skip it.
          */
         if (!spin_trylock(&inode->i_lock))
                 return LRU_SKIP;
  
         /*
-        * Referenced or dirty inodes are still in use. Give them another pass
-        * through the LRU as we canot reclaim them now.
+        * Inodes can get referenced, redirtied, or repopulated while
+        * they're already on the LRU, and this can make them
+        * unreclaimable for a while. Remove them lazily here; iput,
+        * sync, or the last page cache deletion will requeue them.
          */
         if (atomic_read(&inode->i_count) ||
-           (inode->i_state & ~I_REFERENCED)) {
+           (inode->i_state & ~I_REFERENCED) ||
+           !mapping_shrinkable(&inode->i_data)) {
                 list_lru_isolate(lru, &inode->i_lru);
                 spin_unlock(&inode->i_lock);
                 this_cpu_dec(nr_unused);
                 return LRU_REMOVED;
         }
  
-       /* recently referenced inodes get one more pass */
+       /* Recently referenced inodes get one more pass */
         if (inode->i_state & I_REFERENCED) {
                 inode->i_state &= ~I_REFERENCED;
                 spin_unlock(&inode->i_lock);
                 return LRU_ROTATE;
         }
  
+       /*
+        * On highmem systems, mapping_shrinkable() permits dropping
+        * page cache in order to free up struct inodes: lowmem might
+        * be under pressure before the cache inside the highmem zone.
+        */
         if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
                 __iget(inode);
                 spin_unlock(&inode->i_lock);
@@ -1638,7 +1646,7 @@ static void iput_final(struct inode *inode)
         if (!drop &&
             !(inode->i_state & I_DONTCACHE) &&
             (sb->s_flags & SB_ACTIVE)) {
-               inode_add_lru(inode);
+               __inode_add_lru(inode, true);
                 spin_unlock(&inode->i_lock);
                 return;
         }
diff --git a/fs/internal.h b/fs/internal.h

index 3cd065c8a66b4ceb99d255f4c0fa82314fa218d9..2854ff29f1167b0e895fb2c40bb69d502b6080b1 100644 (file)
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -149,7 +149,6 @@ extern int vfs_open(const struct path *, struct file *);
   * inode.c
   */
  extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
-extern void inode_add_lru(struct inode *inode);
  extern int dentry_needs_remove_privs(struct dentry *dentry);
  
  /*
diff --git a/include/linux/fs.h b/include/linux/fs.h

index 226de651f52e6f9f01efdfe2aa374dab7a220895..de35a2640e705b7300be82b37b3e5865ca2d2eb0 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3193,6 +3193,7 @@ static inline void remove_inode_hash(struct inode *inode)
  }
  
  extern void inode_sb_list_add(struct inode *inode);
+extern void inode_add_lru(struct inode *inode);
  
  extern int sb_set_blocksize(struct super_block *, int);
  extern int sb_min_blocksize(struct super_block *, int);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h

index 62db6b0176b95782666e5e1a7369dea10715ed6d..5c74a45ff97a02168c302d85fb856569bf21a785 100644 (file)
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -23,6 +23,56 @@ static inline bool mapping_empty(struct address_space *mapping)
         return xa_empty(&mapping->i_pages);
  }
  
+/*
+ * mapping_shrinkable - test if page cache state allows inode reclaim
+ * @mapping: the page cache mapping
+ *
+ * This checks the mapping's cache state for the purpose of inode
+ * reclaim and LRU management.
+ *
+ * The caller is expected to hold the i_lock, but is not required to
+ * hold the i_pages lock, which usually protects cache state. That's
+ * because the i_lock and the list_lru lock that protect the inode and
+ * its LRU state don't nest inside the irq-safe i_pages lock.
+ *
+ * Cache deletions are performed under the i_lock, which ensures that
+ * when an inode goes empty, it will reliably get queued on the LRU.
+ *
+ * Cache additions do not acquire the i_lock and may race with this
+ * check, in which case we'll report the inode as shrinkable when it
+ * has cache pages. This is okay: the shrinker also checks the
+ * refcount and the referenced bit, which will be elevated or set in
+ * the process of adding new cache pages to an inode.
+ */
+static inline bool mapping_shrinkable(struct address_space *mapping)
+{
+       void *head;
+
+       /*
+        * On highmem systems, there could be lowmem pressure from the
+        * inodes before there is highmem pressure from the page
+        * cache. Make inodes shrinkable regardless of cache state.
+        */
+       if (IS_ENABLED(CONFIG_HIGHMEM))
+               return true;
+
+       /* Cache completely empty? Shrink away. */
+       head = rcu_access_pointer(mapping->i_pages.xa_head);
+       if (!head)
+               return true;
+
+       /*
+        * The xarray stores single offset-0 entries directly in the
+        * head pointer, which allows non-resident page cache entries
+        * to escape the shadow shrinker's list of xarray nodes. The
+        * inode shrinker needs to pick them up under memory pressure.
+        */
+       if (!xa_is_node(head) && xa_is_value(head))
+               return true;
+
+       return false;
+}
+
  /*
   * Bits in mapping->flags.
   */
diff --git a/mm/filemap.c b/mm/filemap.c

index b6140debc2da37c3e43c8ef026734a301e06a203..06de9a17499fb59f8b5046b355279a5037d8b5f6 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -262,9 +262,13 @@ void delete_from_page_cache(struct page *page)
         struct address_space *mapping = page_mapping(page);
  
         BUG_ON(!PageLocked(page));
+       spin_lock(&mapping->host->i_lock);
         xa_lock_irq(&mapping->i_pages);
         __delete_from_page_cache(page, NULL);
         xa_unlock_irq(&mapping->i_pages);
+       if (mapping_shrinkable(mapping))
+               inode_add_lru(mapping->host);
+       spin_unlock(&mapping->host->i_lock);
  
         page_cache_free_page(mapping, page);
  }
@@ -340,6 +344,7 @@ void delete_from_page_cache_batch(struct address_space *mapping,
         if (!pagevec_count(pvec))
                 return;
  
+       spin_lock(&mapping->host->i_lock);
         xa_lock_irq(&mapping->i_pages);
         for (i = 0; i < pagevec_count(pvec); i++) {
                 trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
@@ -348,6 +353,9 @@ void delete_from_page_cache_batch(struct address_space *mapping,
         }
         page_cache_delete_batch(mapping, pvec);
         xa_unlock_irq(&mapping->i_pages);
+       if (mapping_shrinkable(mapping))
+               inode_add_lru(mapping->host);
+       spin_unlock(&mapping->host->i_lock);
  
         for (i = 0; i < pagevec_count(pvec); i++)
                 page_cache_free_page(mapping, pvec->pages[i]);
diff --git a/mm/truncate.c b/mm/truncate.c

index 714eaf19821d7f4f251dde2237828a06c379b0da..cc83a3f7c1ad34330c8b53f7b25c3e3f6c78dcda 100644 (file)
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -45,9 +45,13 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
  static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
                                void *entry)
  {
+       spin_lock(&mapping->host->i_lock);
         xa_lock_irq(&mapping->i_pages);
         __clear_shadow_entry(mapping, index, entry);
         xa_unlock_irq(&mapping->i_pages);
+       if (mapping_shrinkable(mapping))
+               inode_add_lru(mapping->host);
+       spin_unlock(&mapping->host->i_lock);
  }
  
  /*
@@ -73,8 +77,10 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
                 return;
  
         dax = dax_mapping(mapping);
-       if (!dax)
+       if (!dax) {
+               spin_lock(&mapping->host->i_lock);
                 xa_lock_irq(&mapping->i_pages);
+       }
  
         for (i = j; i < pagevec_count(pvec); i++) {
                 struct page *page = pvec->pages[i];
@@ -93,8 +99,12 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
                 __clear_shadow_entry(mapping, index, page);
         }
  
-       if (!dax)
+       if (!dax) {
                 xa_unlock_irq(&mapping->i_pages);
+               if (mapping_shrinkable(mapping))
+                       inode_add_lru(mapping->host);
+               spin_unlock(&mapping->host->i_lock);
+       }
         pvec->nr = j;
  }
  
@@ -567,6 +577,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
         if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
                 return 0;
  
+       spin_lock(&mapping->host->i_lock);
         xa_lock_irq(&mapping->i_pages);
         if (PageDirty(page))
                 goto failed;
@@ -574,6 +585,9 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
         BUG_ON(page_has_private(page));
         __delete_from_page_cache(page, NULL);
         xa_unlock_irq(&mapping->i_pages);
+       if (mapping_shrinkable(mapping))
+               inode_add_lru(mapping->host);
+       spin_unlock(&mapping->host->i_lock);
  
         if (mapping->a_ops->freepage)
                 mapping->a_ops->freepage(page);
@@ -582,6 +596,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
         return 1;
  failed:
         xa_unlock_irq(&mapping->i_pages);
+       spin_unlock(&mapping->host->i_lock);
         return 0;
  }
  
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 41f5f6007c30b8bfaec702f446350c754d04b80b..26f07518d7a4712977f73c0247f9501093e5d0c7 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1190,6 +1190,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
         BUG_ON(!PageLocked(page));
         BUG_ON(mapping != page_mapping(page));
  
+       if (!PageSwapCache(page))
+               spin_lock(&mapping->host->i_lock);
         xa_lock_irq(&mapping->i_pages);
         /*
          * The non racy check for a busy page.
@@ -1258,6 +1260,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                         shadow = workingset_eviction(page, target_memcg);
                 __delete_from_page_cache(page, shadow);
                 xa_unlock_irq(&mapping->i_pages);
+               if (mapping_shrinkable(mapping))
+                       inode_add_lru(mapping->host);
+               spin_unlock(&mapping->host->i_lock);
  
                 if (freepage != NULL)
                         freepage(page);
@@ -1267,6 +1272,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
  
  cannot_free:
         xa_unlock_irq(&mapping->i_pages);
+       if (!PageSwapCache(page))
+               spin_unlock(&mapping->host->i_lock);
         return 0;
  }
  
diff --git a/mm/workingset.c b/mm/workingset.c

index d5b81e4f4cbe8cdc1c1434a21ffc743d322b2227..23df60ce2e109282ed6456a704b54ad6bda1daea 100644 (file)
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -543,6 +543,13 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
                 goto out;
         }
  
+       if (!spin_trylock(&mapping->host->i_lock)) {
+               xa_unlock(&mapping->i_pages);
+               spin_unlock_irq(lru_lock);
+               ret = LRU_RETRY;
+               goto out;
+       }
+
         list_lru_isolate(lru, item);
         __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
  
@@ -562,6 +569,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
  
  out_invalid:
         xa_unlock_irq(&mapping->i_pages);
+       if (mapping_shrinkable(mapping))
+               inode_add_lru(mapping->host);
+       spin_unlock(&mapping->host->i_lock);
         ret = LRU_REMOVED_RETRY;
  out:
         cond_resched();
author	Johannes Weiner <hannes@cmpxchg.org>
	Tue, 9 Nov 2021 02:31:24 +0000 (18:31 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 9 Nov 2021 18:02:48 +0000 (10:02 -0800)
fs/inode.c		patch \| blob \| history
fs/internal.h		patch \| blob \| history
include/linux/fs.h		patch \| blob \| history
include/linux/pagemap.h		patch \| blob \| history
mm/filemap.c		patch \| blob \| history
mm/truncate.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history
mm/workingset.c		patch \| blob \| history