*/
struct inode *cur_inode;
struct file_ra_state ra;
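+ /*
+ * Start offset (inclusive) of the next page cache range to
+ * truncate, advanced past each extent once its data is sent.
+ */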
+ u64 page_cache_clear_start;
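+ /* Set if the inode had no cached pages when it was first opened. */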
+ bool clean_page_cache;
/*
* We process inodes by their increasing order, so if before an
const u64 offset,
const u64 len)
{
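+ /* Exclusive end offset of the file range to send. */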
+ const u64 end = offset + len;
u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
}
memset(&sctx->ra, 0, sizeof(struct file_ra_state));
file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
+
+ /*
+ * It's very likely there are no pages from this inode in the page
+ * cache, so after reading extents and sending their data, we clean
+ * the page cache to avoid trashing the page cache (adding pressure
+ * to the page cache and forcing eviction of other data more useful
+ * for applications).
+ *
+ * We decide if we should clean the page cache simply by checking
+ * if the inode's mapping nrpages is 0 when we first open it, and
+ * not by using something like filemap_range_has_page() before
+ * reading an extent. When we ask the readahead code to read a
+ * given file range, it may (and almost always does) read pages
+ * from beyond that range (see the documentation for
+ * page_cache_sync_readahead()). So filemap_range_has_page() would
+ * not be reliable: after reading the first extent, future calls
+ * to it would return true because readahead on a previous extent
+ * resulted in reading pages of the current extent as well.
+ */
+ sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
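+ /*
+ * Truncation works on whole pages, so begin clearing at the
+ * start of the page that contains @offset.
+ */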
+ sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
}
while (sent < len) {
return ret;
sent += size;
}
+
+ if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
+ /*
+ * Always operate only on ranges that are a multiple of the page
+ * size. This is not only to prevent zeroing parts of a page in
+ * the case of subpage sector size, but also to guarantee we evict
+ * pages, as passing a range that is smaller than page size does
+ * not evict the respective page (only zeroes part of its content).
+ *
+ * Always start from the end offset of the last range cleared.
+ * This is because the readahead code may (and very often does)
+ * read pages beyond the range we request for readahead. Suppose
+ * we have an extent layout like this:
+ *
+ * [ extent A ] [ extent B ] [ extent C ]
+ *
+ * When we ask page_cache_sync_readahead() to read extent A, it
+ * may also trigger reads for pages of extent B. If we are doing
+ * an incremental send and extent B has not changed between the
+ * parent and send snapshots, some or all of its pages may end
+ * up being read and placed in the page cache. So when truncating
+ * the page cache we always start from the end offset of the
+ * previously processed extent up to the end of the current
+ * extent.
+ */
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ end - 1);
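+ /* The next truncation resumes at the end of this extent. */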
+ sctx->page_cache_clear_start = end;
+ }
+
return 0;
}
return ret;
}
+static void close_current_inode(struct send_ctx *sctx)
+{
+ u64 i_size;
+
+ if (sctx->cur_inode == NULL)
+ return;
+
+ i_size = i_size_read(sctx->cur_inode);
+
+ /*
+ * If we are doing an incremental send, we may have extents between the
+ * last processed extent and the i_size that have not been processed
+ * because they haven't changed, but we may have read some of their
+ * pages through readahead; see the comments at send_extent_data().
+ */
+ if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ round_up(i_size, PAGE_SIZE) - 1);
+
+ iput(sctx->cur_inode);
+ sctx->cur_inode = NULL;
+}
+
static int changed_inode(struct send_ctx *sctx,
enum btrfs_compare_tree_result result)
{
u64 left_gen = 0;
u64 right_gen = 0;
- iput(sctx->cur_inode);
- sctx->cur_inode = NULL;
+ close_current_inode(sctx);
sctx->cur_ino = key->objectid;
sctx->cur_inode_new_gen = 0;
name_cache_free(sctx);
- iput(sctx->cur_inode);
+ close_current_inode(sctx);
kfree(sctx);
}