btrfs: zoned: add a dedicated data relocation block group
author    Johannes Thumshirn <johannes.thumshirn@wdc.com>
          Wed, 8 Sep 2021 16:19:26 +0000 (01:19 +0900)
committer David Sterba <dsterba@suse.com>
          Tue, 26 Oct 2021 17:08:01 +0000 (19:08 +0200)

Relocation in a zoned filesystem can fail with a transaction abort with
error -22 (EINVAL). This happens because the relocation code assumes that
the extents we relocate the data to have the same size as the source
extents and ensures this by preallocating the destination extents.

But in a zoned filesystem we currently can't preallocate the extents, as
this would break the sequential write rule imposed by sequential write
required zones. Therefore it can happen that writeback kicks in while
we're still adding pages to a delalloc range and starts writing out dirty
pages.

This then creates destination extents that are smaller than the source
extents, triggering the following safety check in get_new_location():

 1034         if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
 1035                 ret = -EINVAL;
 1036                 goto out;
 1037         }
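
To make the failure concrete, here is a minimal sketch with made-up sizes
(illustrative helper and values only, not actual kernel code):

  /* Mirrors the check in get_new_location() quoted above. */
  static int check_relocated_extent_size(u64 num_bytes, u64 disk_num_bytes)
  {
          if (num_bytes != disk_num_bytes)
                  return -EINVAL;
          return 0;
  }

  /*
   * The source extent is 1 MiB, but writeback flushed the first 768 KiB
   * of the delalloc range early, so the first destination extent is only
   * 768 KiB on disk:
   *
   *   check_relocated_extent_size(SZ_1M, 768 * SZ_1K) == -EINVAL
   *
   * and relocation turns that into a transaction abort with error -22.
   */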

Temporarily create a dedicated block group for the relocation process, so
no non-relocation data writes can interfere with the relocation writes.
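
Condensed, the rule the allocator enforces is the following (this is only
a restatement of the do_allocation_zoned() hunk further down, with
shortened names):

  /* data_reloc_bg is the start of the dedicated block group (0 if unset),
   * bg_start the block group currently considered by the allocator. */
  if (data_reloc_bg &&
      ((for_data_reloc && bg_start != data_reloc_bg) ||
       (!for_data_reloc && bg_start == data_reloc_bg)))
          skip = true;    /* relocation data only in its block group,
                             everything else only outside of it */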

This is needed so that we can switch the relocation process on a zoned
filesystem from the REQ_OP_ZONE_APPEND writes we use for data to a scheme
like on a non-zoned filesystem using REQ_OP_WRITE and preallocation.
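
The switch itself is not part of this patch. As a rough sketch of the
direction only (hypothetical helper name; it assumes the decision can be
made per inode via btrfs_is_data_reloc_root(), which this patch already
uses in btrfs_reserve_extent()):

  /* Sketch, not the code in this patch: once relocation writes land in
   * the dedicated block group, the data relocation inode can skip zone
   * append and use plain sequential REQ_OP_WRITE bios instead. */
  static bool sketch_use_zone_append(struct btrfs_inode *inode)
  {
          if (!btrfs_is_zoned(inode->root->fs_info))
                  return false;
          if (btrfs_is_data_reloc_root(inode->root))
                  return false;   /* REQ_OP_WRITE into preallocated extents */
          return true;            /* other data keeps REQ_OP_ZONE_APPEND */
  }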

Fixes: c2bb2da10e1e ("btrfs: zoned: enable relocation on a zoned filesystem")
Reviewed-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/block-group.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/zoned.c
fs/btrfs/zoned.h

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 1302bf8d0be1d692c77f5fbf88a78e64006c1277..46fdef7bbe20c65be3f1e24815ad9ecef383bf9f 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -903,6 +903,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        spin_unlock(&cluster->refill_lock);
 
        btrfs_clear_treelog_bg(block_group);
+       btrfs_clear_data_reloc_bg(block_group);
 
        path = btrfs_alloc_path();
        if (!path) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f6e624098d531dc14d6779427ec22734e11d13b2..41f1718a83df922d4e7241276ca0af5888de3d7d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1018,6 +1018,13 @@ struct btrfs_fs_info {
        spinlock_t treelog_bg_lock;
        u64 treelog_bg;
 
+       /*
+        * Start of the dedicated data relocation block group, protected by
+        * relocation_bg_lock.
+        */
+       spinlock_t relocation_bg_lock;
+       u64 data_reloc_bg;
+
        spinlock_t zone_active_bgs_lock;
        struct list_head zone_active_bgs;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d63c5e776a964dfd4d514e63784d36b6086c9975..be382276d24f321cfb9aeb7684f9f3abfc72b272 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2885,6 +2885,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
        spin_lock_init(&fs_info->unused_bgs_lock);
        spin_lock_init(&fs_info->treelog_bg_lock);
        spin_lock_init(&fs_info->zone_active_bgs_lock);
+       spin_lock_init(&fs_info->relocation_bg_lock);
        rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->unused_bg_unpin_mutex);
        mutex_init(&fs_info->reclaim_bgs_lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 74ac37c5f21810187c6e3a46058f847753fa47bf..9b7cbb669a589c649e006e2955b6966f2240914c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3497,6 +3497,9 @@ struct find_free_extent_ctl {
        /* Allocation is called for tree-log */
        bool for_treelog;
 
+       /* Allocation is called for data relocation */
+       bool for_data_reloc;
+
        /* RAID index, converted from flags */
        int index;
 
@@ -3758,6 +3761,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
        u64 avail;
        u64 bytenr = block_group->start;
        u64 log_bytenr;
+       u64 data_reloc_bytenr;
        int ret = 0;
        bool skip;
 
@@ -3775,6 +3779,19 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
        if (skip)
                return 1;
 
+       /*
+        * Do not allow non-relocation blocks in the dedicated relocation block
+        * group, and vice versa.
+        */
+       spin_lock(&fs_info->relocation_bg_lock);
+       data_reloc_bytenr = fs_info->data_reloc_bg;
+       if (data_reloc_bytenr &&
+           ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
+            (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
+               skip = true;
+       spin_unlock(&fs_info->relocation_bg_lock);
+       if (skip)
+               return 1;
        /* Check RO and no space case before trying to activate it */
        spin_lock(&block_group->lock);
        if (block_group->ro ||
@@ -3790,10 +3807,14 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
        spin_lock(&space_info->lock);
        spin_lock(&block_group->lock);
        spin_lock(&fs_info->treelog_bg_lock);
+       spin_lock(&fs_info->relocation_bg_lock);
 
        ASSERT(!ffe_ctl->for_treelog ||
               block_group->start == fs_info->treelog_bg ||
               fs_info->treelog_bg == 0);
+       ASSERT(!ffe_ctl->for_data_reloc ||
+              block_group->start == fs_info->data_reloc_bg ||
+              fs_info->data_reloc_bg == 0);
 
        if (block_group->ro) {
                ret = 1;
@@ -3810,6 +3831,16 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
                goto out;
        }
 
+       /*
+        * Do not allow currently used block group to be the data relocation
+        * dedicated block group.
+        */
+       if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
+           (block_group->used || block_group->reserved)) {
+               ret = 1;
+               goto out;
+       }
+
        WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
        avail = block_group->zone_capacity - block_group->alloc_offset;
        if (avail < num_bytes) {
@@ -3828,6 +3859,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
        if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
                fs_info->treelog_bg = block_group->start;
 
+       if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
+               fs_info->data_reloc_bg = block_group->start;
+
        ffe_ctl->found_offset = start + block_group->alloc_offset;
        block_group->alloc_offset += num_bytes;
        spin_lock(&ctl->tree_lock);
@@ -3844,6 +3878,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 out:
        if (ret && ffe_ctl->for_treelog)
                fs_info->treelog_bg = 0;
+       if (ret && ffe_ctl->for_data_reloc)
+               fs_info->data_reloc_bg = 0;
+       spin_unlock(&fs_info->relocation_bg_lock);
        spin_unlock(&fs_info->treelog_bg_lock);
        spin_unlock(&block_group->lock);
        spin_unlock(&space_info->lock);
@@ -4112,6 +4149,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
                                ffe_ctl->hint_byte = fs_info->treelog_bg;
                        spin_unlock(&fs_info->treelog_bg_lock);
                }
+               if (ffe_ctl->for_data_reloc) {
+                       spin_lock(&fs_info->relocation_bg_lock);
+                       if (fs_info->data_reloc_bg)
+                               ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+                       spin_unlock(&fs_info->relocation_bg_lock);
+               }
                return 0;
        default:
                BUG();
@@ -4245,6 +4288,8 @@ search:
                if (unlikely(block_group->ro)) {
                        if (ffe_ctl->for_treelog)
                                btrfs_clear_treelog_bg(block_group);
+                       if (ffe_ctl->for_data_reloc)
+                               btrfs_clear_data_reloc_bg(block_group);
                        continue;
                }
 
@@ -4438,6 +4483,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
        u64 flags;
        int ret;
        bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+       bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
 
        flags = get_alloc_profile_by_root(root, is_data);
 again:
@@ -4451,6 +4497,7 @@ again:
        ffe_ctl.delalloc = delalloc;
        ffe_ctl.hint_byte = hint_byte;
        ffe_ctl.for_treelog = for_treelog;
+       ffe_ctl.for_data_reloc = for_data_reloc;
 
        ret = find_free_extent(root, ins, &ffe_ctl);
        if (!ret && !is_data) {
@@ -4470,8 +4517,8 @@ again:
 
                        sinfo = btrfs_find_space_info(fs_info, flags);
                        btrfs_err(fs_info,
-                       "allocation failed flags %llu, wanted %llu tree-log %d",
-                                 flags, num_bytes, for_treelog);
+       "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
+                                 flags, num_bytes, for_treelog, for_data_reloc);
                        if (sinfo)
                                btrfs_dump_space_info(fs_info, sinfo,
                                                      num_bytes, 1);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 28a06c2d80adb8b9bb8fd7003e5cf46d10f4a9ec..c7fe3e11e6853b9870b55a5e0674f9182aa42202 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1954,3 +1954,13 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
 out:
        btrfs_put_block_group(block_group);
 }
+
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
+{
+       struct btrfs_fs_info *fs_info = bg->fs_info;
+
+       spin_lock(&fs_info->relocation_bg_lock);
+       if (fs_info->data_reloc_bg == bg->start)
+               fs_info->data_reloc_bg = 0;
+       spin_unlock(&fs_info->relocation_bg_lock);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 9c512402d7f42a8b7bde9c0c0665a89496d578aa..e53ab7b96437e79264d50372876bfc8f674bbf5a 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -75,6 +75,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
                             int raid_index);
 void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
                             u64 length);
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
                                     struct blk_zone *zone)
@@ -229,6 +230,8 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
 static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
                                           u64 logical, u64 length) { }
 
+static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)