#include "zoned.h"
#include "inode-item.h"
+#define MAX_CONFLICT_INODES 10
+
/* magic values for the inode_only field in btrfs_log_inode:
*
* LOG_INODE_ALL means to log everything
enum {
LOG_INODE_ALL,
LOG_INODE_EXISTS,
- LOG_OTHER_INODE,
- LOG_OTHER_INODE_ALL,
};
/*
struct list_head list;
};
-static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- u64 ino, u64 parent)
+static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ino_list *curr;
+ struct btrfs_ino_list *next;
+
+ list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
+ list_del(&curr->list);
+ kfree(curr);
+ }
+}
+
+static int add_conflicting_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 ino, u64 parent,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- LIST_HEAD(inode_list);
- int ret = 0;
+ struct inode *inode;
+
+ /*
+ * It's rare to have a lot of conflicting inodes, in practice it is not
+ * common to have more than 1 or 2. We don't want to collect too many,
+ * as we could end up logging too many inodes (even if only in
+ * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
+ * commits.
+ */
+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
+ return BTRFS_LOG_FORCE_COMMIT;
+
+ inode = btrfs_iget(root->fs_info->sb, ino, root);
+ /*
+ * If the other inode that had a conflicting dir entry was deleted in
+ * the current transaction, we need to log its parent directory.
+ * We can't simply ignore it and let the current inode's reference cause
+ * an unlink of the conflicting inode when replaying the log - because
+ * the conflicting inode may be a deleted subvolume/snapshot or it may
+ * be a directory that had subvolumes/snapshots inside it (or had one
+ * or more subdirectories with subvolumes/snapshots, etc). If that's the
+ * case, then when logging the parent directory we will fallback to a
+ * transaction commit because the parent directory will have a
+ * last_unlink_trans that matches the current transaction.
+ */
+ if (IS_ERR(inode)) {
+ int ret = PTR_ERR(inode);
+
+ if (ret != -ENOENT)
+ return ret;
+
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
+
+ return 0;
+ }
+
+ /*
+ * If the inode was already logged skip it - otherwise we can hit an
+ * infinite loop. Example:
+ *
+ * From the commit root (previous transaction) we have the following
+ * inodes:
+ *
+ * inode 257 a directory
+ * inode 258 with references "zz" and "zz_link" on inode 257
+ * inode 259 with reference "a" on inode 257
+ *
+ * And in the current (uncommitted) transaction we have:
+ *
+ * inode 257 a directory, unchanged
+ * inode 258 with references "a" and "a2" on inode 257
+ * inode 259 with reference "zz_link" on inode 257
+ * inode 261 with reference "zz" on inode 257
+ *
+ * When logging inode 261 the following infinite loop could
+ * happen if we don't skip already logged inodes:
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 261
+ * on reference "zz", and log it;
+ *
+ * - we detect inode 259 as a conflicting inode, with inode 258
+ * on reference "a", and log it;
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 259
+ * on reference "zz_link", and log it - again! After this we
+ * repeat the above steps forever.
+ *
+ * Here we can use need_log_inode() because we only need to log the
+ * inode in LOG_INODE_EXISTS mode and rename operations update the log,
+ * so that the log ends up with the new name and without the old name.
+ */
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
+ btrfs_add_delayed_iput(inode);
+ return 0;
+ }
+
+ btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
return -ENOMEM;
ino_elem->ino = ino;
ino_elem->parent = parent;
- list_add_tail(&ino_elem->list, &inode_list);
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
- while (!list_empty(&inode_list)) {
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_key key;
- struct inode *inode;
+ return 0;
+}
- ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
- list);
- ino = ino_elem->ino;
- parent = ino_elem->parent;
- list_del(&ino_elem->list);
- kfree(ino_elem);
- if (ret)
- continue;
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
- btrfs_release_path(path);
+ /*
+ * Conflicting inodes are logged by the first call to btrfs_log_inode(),
+ * otherwise we could have unbounded recursion of btrfs_log_inode()
+ * calls. This check guarantees we can have only 1 level of recursion.
+ */
+ if (ctx->logging_conflict_inodes)
+ return 0;
+
+ ctx->logging_conflict_inodes = true;
+
+ /*
+ * New conflicting inodes may be found and added to the list while we
+ * are logging a conflicting inode, so keep iterating while the list is
+ * not empty.
+ */
+ while (!list_empty(&ctx->conflict_inodes)) {
+ struct btrfs_ino_list *curr;
+ struct inode *inode;
+ u64 ino;
+ u64 parent;
+
+ curr = list_first_entry(&ctx->conflict_inodes,
+ struct btrfs_ino_list, list);
+ ino = curr->ino;
+ parent = curr->parent;
+ list_del(&curr->list);
+ kfree(curr);
inode = btrfs_iget(fs_info->sb, ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
- * directory.
+ * directory. See the comment at add_conflicting_inode().
*/
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- if (ret == -ENOENT) {
- inode = btrfs_iget(fs_info->sb, parent, root);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- } else {
- ret = btrfs_log_inode(trans,
- BTRFS_I(inode),
- LOG_OTHER_INODE_ALL,
- ctx);
- btrfs_add_delayed_iput(inode);
- }
+ if (ret != -ENOENT)
+ break;
+
+ inode = btrfs_iget(fs_info->sb, parent, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ break;
}
+
+ /*
+ * Always log the directory, we cannot make this
+ * conditional on need_log_inode() because the directory
+ * might have been logged in LOG_INODE_EXISTS mode or
+ * the dir index of the conflicting inode is not in a
+ * dir index key range logged for the directory. So we
+ * must make sure the deletion is recorded.
+ */
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
+ LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
continue;
}
+
/*
- * If the inode was already logged skip it - otherwise we can
- * hit an infinite loop. Example:
- *
- * From the commit root (previous transaction) we have the
- * following inodes:
- *
- * inode 257 a directory
- * inode 258 with references "zz" and "zz_link" on inode 257
- * inode 259 with reference "a" on inode 257
- *
- * And in the current (uncommitted) transaction we have:
- *
- * inode 257 a directory, unchanged
- * inode 258 with references "a" and "a2" on inode 257
- * inode 259 with reference "zz_link" on inode 257
- * inode 261 with reference "zz" on inode 257
- *
- * When logging inode 261 the following infinite loop could
- * happen if we don't skip already logged inodes:
- *
- * - we detect inode 258 as a conflicting inode, with inode 261
- * on reference "zz", and log it;
+ * Here we can use need_log_inode() because we only need to log
+ * the inode in LOG_INODE_EXISTS mode and rename operations
+ * update the log, so that the log ends up with the new name and
+ * without the old name.
*
- * - we detect inode 259 as a conflicting inode, with inode 258
- * on reference "a", and log it;
- *
- * - we detect inode 258 as a conflicting inode, with inode 259
- * on reference "zz_link", and log it - again! After this we
- * repeat the above steps forever.
- */
- spin_lock(&BTRFS_I(inode)->lock);
- /*
- * Check the inode's logged_trans only instead of
- * btrfs_inode_in_log(). This is because the last_log_commit of
- * the inode is not updated when we only log that it exists (see
- * btrfs_log_inode()).
+ * We did this check at add_conflicting_inode(), but here we do
+ * it again because if some other task logged the inode after
+ * that, we can avoid doing it again.
*/
- if (BTRFS_I(inode)->logged_trans == trans->transid) {
- spin_unlock(&BTRFS_I(inode)->lock);
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
btrfs_add_delayed_iput(inode);
continue;
}
- spin_unlock(&BTRFS_I(inode)->lock);
+
/*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
- if (ret) {
- btrfs_add_delayed_iput(inode);
- continue;
- }
-
- key.objectid = ino;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- btrfs_add_delayed_iput(inode);
- continue;
- }
-
- while (true) {
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
- u64 other_ino = 0;
- u64 other_parent = 0;
-
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- break;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid != ino ||
- (key.type != BTRFS_INODE_REF_KEY &&
- key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = 0;
- break;
- }
-
- ret = btrfs_check_ref_name_override(leaf, slot, &key,
- BTRFS_I(inode), &other_ino,
- &other_parent);
- if (ret < 0)
- break;
- if (ret > 0) {
- ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
- if (!ino_elem) {
- ret = -ENOMEM;
- break;
- }
- ino_elem->ino = other_ino;
- ino_elem->parent = other_parent;
- list_add_tail(&ino_elem->list, &inode_list);
- ret = 0;
- }
- path->slots[0]++;
- }
+ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
}
+ ctx->logging_conflict_inodes = false;
+ if (ret)
+ free_conflicting_inodes(ctx);
+
return ret;
}
struct btrfs_path *path,
struct btrfs_path *dst_path,
const u64 logged_isize,
- const bool recursive_logging,
const int inode_only,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
break;
} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ (inode->generation == trans->transid ||
+ ctx->logging_conflict_inodes)) {
u64 other_ino = 0;
u64 other_parent = 0;
return ret;
ins_nr = 0;
- ret = log_conflicting_inodes(trans, root, path,
- ctx, other_ino, other_parent);
+ btrfs_release_path(path);
+ ret = add_conflicting_inode(trans, root,
+ other_ino,
+ other_parent, ctx);
if (ret)
return ret;
- btrfs_release_path(path);
goto next_key;
}
} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
- bool recursive_logging = false;
bool inode_item_dropped = true;
- const bool orig_logged_before = ctx->logged_before;
path = btrfs_alloc_path();
if (!path)
goto out;
}
- if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
- recursive_logging = true;
- if (inode_only == LOG_OTHER_INODE)
- inode_only = LOG_INODE_EXISTS;
- else
- inode_only = LOG_INODE_ALL;
- mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
- } else {
- mutex_lock(&inode->log_mutex);
- }
+ mutex_lock(&inode->log_mutex);
/*
* For symlinks, we must always log their content, which is stored in an
ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
- recursive_logging, inode_only, ctx,
+ inode_only, ctx,
&need_log_inode_item);
if (ret)
goto out_unlock;
btrfs_free_path(path);
btrfs_free_path(dst_path);
- if (recursive_logging)
- ctx->logged_before = orig_logged_before;
+ if (ret)
+ free_conflicting_inodes(ctx);
+ else
+ ret = log_conflicting_inodes(trans, inode->root, ctx);
return ret;
}
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+ ASSERT(list_empty(&ctx.conflict_inodes));
out:
/*
* If an error happened mark the log for a full commit because it's not