btrfs: initial fsverity support

author Boris Burkov <boris@bur.io>

Wed, 30 Jun 2021 20:01:49 +0000 (13:01 -0700)

committer David Sterba <dsterba@suse.com>

Mon, 23 Aug 2021 11:19:09 +0000 (13:19 +0200)
author Boris Burkov <boris@bur.io>
Wed, 30 Jun 2021 20:01:49 +0000 (13:01 -0700)
committer David Sterba <dsterba@suse.com>
Mon, 23 Aug 2021 11:19:09 +0000 (13:19 +0200)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile

index cec88a66bd6c19d643eff8327ef581a87808c037..3dcf9bcc232613735690f763ea98daf7ca62a062 100644 (file)
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -36,6 +36,7 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
  btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
  btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
  btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
+btrfs-$(CONFIG_FS_VERITY) += verity.o
  
  btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
         tests/extent-buffer-tests.o tests/btrfs-tests.o \
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h

index 1093b00130bec99fced28406b2b2f85706f82a8e..76ee1452c57bae757b81cc189f2c4f31d61d593d 100644 (file)
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,13 @@ enum {
          * the file range, inode's io_tree).
          */
         BTRFS_INODE_NO_DELALLOC_FLUSH,
+       /*
+        * Set when we are working on enabling verity for a file. Computing and
+        * writing the whole Merkle tree can take a while so we want to prevent
+        * races where two separate tasks attempt to simultaneously start verity
+        * on the same file.
+        */
+       BTRFS_INODE_VERITY_IN_PROGRESS,
  };
  
  /* in memory btrfs inode */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index 9e3b7a56a78f48817e2fdb1c656ce472fd6a9412..f17be4b023cb8c3e50d02b634193417b1a5024ce 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -281,7 +281,8 @@ struct btrfs_super_block {
  
  #define BTRFS_FEATURE_COMPAT_RO_SUPP                   \
         (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |      \
-        BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID)
+        BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
+        BTRFS_FEATURE_COMPAT_RO_VERITY)
  
  #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET       0ULL
  #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR     0ULL
@@ -1512,7 +1513,9 @@ do {                                                                   \
          BTRFS_INODE_COMPRESS |                                         \
          BTRFS_INODE_ROOT_ITEM_INIT)
  
-#define BTRFS_INODE_RO_FLAG_MASK                                       (0)
+#define BTRFS_INODE_RO_VERITY          (1U << 0)
+
+#define BTRFS_INODE_RO_FLAG_MASK       (BTRFS_INODE_RO_VERITY)
  
  struct btrfs_map_token {
         struct extent_buffer *eb;
@@ -3791,6 +3794,30 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
         return signal_pending(current);
  }
  
+/* verity.c */
+#ifdef CONFIG_FS_VERITY
+
+extern const struct fsverity_operations btrfs_verityops;
+int btrfs_drop_verity_items(struct btrfs_inode *inode);
+
+BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
+                  encryption, 8);
+BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
+                  size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
+                        struct btrfs_verity_descriptor_item, encryption, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
+                        struct btrfs_verity_descriptor_item, size, 64);
+
+#else
+
+static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+       return 0;
+}
+
+#endif
+
  /* Sanity test specific functions */
  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
  void btrfs_test_destroy_inode(struct inode *inode);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c

index 66888b10b00db5937055d445955bd1f9f7d1b91a..96de6e70d06c654e394dcb9f2df17987706a9d7e 100644 (file)
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -13,6 +13,7 @@
  #include <linux/pagevec.h>
  #include <linux/prefetch.h>
  #include <linux/cleancache.h>
+#include <linux/fsverity.h>
  #include "misc.h"
  #include "extent_io.h"
  #include "extent-io-tree.h"
@@ -2247,18 +2248,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
         return bitset;
  }
  
-/*
- * helper function to set a given page up to date if all the
- * extents in the tree for that page are up to date
- */
-static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
-{
-       u64 start = page_offset(page);
-       u64 end = start + PAGE_SIZE - 1;
-       if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
-               SetPageUptodate(page);
-}
-
  int free_io_failure(struct extent_io_tree *failure_tree,
                     struct extent_io_tree *io_tree,
                     struct io_failure_record *rec)
@@ -2690,7 +2679,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
                start + len <= page_offset(page) + PAGE_SIZE);
  
         if (uptodate) {
-               btrfs_page_set_uptodate(fs_info, page, start, len);
+               if (fsverity_active(page->mapping->host) &&
+                   !PageError(page) &&
+                   !PageUptodate(page) &&
+                   start < i_size_read(page->mapping->host) &&
+                   !fsverity_verify_page(page)) {
+                       btrfs_page_set_error(fs_info, page, start, len);
+               } else {
+                       btrfs_page_set_uptodate(fs_info, page, start, len);
+               }
         } else {
                 btrfs_page_clear_uptodate(fs_info, page, start, len);
                 btrfs_page_set_error(fs_info, page, start, len);
@@ -3105,7 +3102,7 @@ readpage_ok:
                 /* Update page status and unlock */
                 end_page_read(page, uptodate, start, len);
                 endio_readpage_release_extent(&processed, BTRFS_I(inode),
-                                             start, end, uptodate);
+                                             start, end, PageUptodate(page));
         }
         /* Release the last extent */
         endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -3706,7 +3703,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
                 /* the get_extent function already copied into the page */
                 if (test_range_bit(tree, cur, cur_end,
                                    EXTENT_UPTODATE, 1, NULL)) {
-                       check_page_uptodate(tree, page);
                         unlock_extent(tree, cur, cur + iosize - 1);
                         end_page_read(page, true, cur, iosize);
                         cur = cur + iosize;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index d3f2623a2af0a1bd76eca7f63487cc129a829754..7ff577005d0febbf6be14c21a2d1c81f3e600c50 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -16,6 +16,7 @@
  #include <linux/btrfs.h>
  #include <linux/uio.h>
  #include <linux/iversion.h>
+#include <linux/fsverity.h>
  #include "ctree.h"
  #include "disk-io.h"
  #include "transaction.h"
@@ -3615,7 +3616,13 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
  
  static int btrfs_file_open(struct inode *inode, struct file *filp)
  {
+       int ret;
+
         filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+
+       ret = fsverity_file_open(inode, filp);
+       if (ret)
+               return ret;
         return generic_file_open(inode, filp);
  }
  
@@ -3644,6 +3651,9 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
         struct inode *inode = file_inode(iocb->ki_filp);
         ssize_t ret;
  
+       if (fsverity_active(inode))
+               return 0;
+
         if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
                 return 0;
  
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index cd5a67ba7e71035643afd572c56c3ead82957da9..766cd35be33d4126925d88d216a50c4921a35cf1 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
  #include <linux/sched/mm.h>
  #include <linux/iomap.h>
  #include <asm/unaligned.h>
+#include <linux/fsverity.h>
  #include "misc.h"
  #include "ctree.h"
  #include "disk-io.h"
@@ -5560,6 +5561,7 @@ void btrfs_evict_inode(struct inode *inode)
         trace_btrfs_inode_evict(inode);
  
         if (!root) {
+               fsverity_cleanup_inode(inode);
                 clear_inode(inode);
                 return;
         }
@@ -5642,6 +5644,7 @@ no_delete:
          * to retry these periodically in the future.
          */
         btrfs_remove_delayed_node(BTRFS_I(inode));
+       fsverity_cleanup_inode(inode);
         clear_inode(inode);
  }
  
@@ -9250,6 +9253,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
         struct inode *inode = d_inode(path->dentry);
         u32 blocksize = inode->i_sb->s_blocksize;
         u32 bi_flags = BTRFS_I(inode)->flags;
+       u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
  
         stat->result_mask |= STATX_BTIME;
         stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
@@ -9262,6 +9266,8 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
                 stat->attributes |= STATX_ATTR_IMMUTABLE;
         if (bi_flags & BTRFS_INODE_NODUMP)
                 stat->attributes |= STATX_ATTR_NODUMP;
+       if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+               stat->attributes |= STATX_ATTR_VERITY;
  
         stat->attributes_mask |= (STATX_ATTR_APPEND |
                                   STATX_ATTR_COMPRESSED |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c

index 17aefb5f08ea016c634b30b4619263b4435bd187..85c8b5a87a6a961b49503fbbeec7a32836496f40 100644 (file)
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -27,6 +27,7 @@
  #include <linux/uaccess.h>
  #include <linux/iversion.h>
  #include <linux/fileattr.h>
+#include <linux/fsverity.h>
  #include "ctree.h"
  #include "disk-io.h"
  #include "export.h"
@@ -107,6 +108,7 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
  {
         unsigned int iflags = 0;
         u32 flags = binode->flags;
+       u32 ro_flags = binode->ro_flags;
  
         if (flags & BTRFS_INODE_SYNC)
                 iflags |= FS_SYNC_FL;
@@ -122,6 +124,8 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
                 iflags |= FS_DIRSYNC_FL;
         if (flags & BTRFS_INODE_NODATACOW)
                 iflags |= FS_NOCOW_FL;
+       if (ro_flags & BTRFS_INODE_RO_VERITY)
+               iflags |= FS_VERITY_FL;
  
         if (flags & BTRFS_INODE_NOCOMPRESS)
                 iflags |= FS_NOCOMP_FL;
@@ -149,10 +153,12 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
                 new_fl |= S_NOATIME;
         if (binode->flags & BTRFS_INODE_DIRSYNC)
                 new_fl |= S_DIRSYNC;
+       if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
+               new_fl |= S_VERITY;
  
         set_mask_bits(&inode->i_flags,
-                     S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
-                     new_fl);
+                     S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
+                     S_VERITY, new_fl);
  }
  
  /*
@@ -5020,6 +5026,10 @@ long btrfs_ioctl(struct file *file, unsigned int
                 return btrfs_ioctl_get_subvol_rootref(file, argp);
         case BTRFS_IOC_INO_LOOKUP_USER:
                 return btrfs_ioctl_ino_lookup_user(file, argp);
+       case FS_IOC_ENABLE_VERITY:
+               return fsverity_ioctl_enable(file, (const void __user *)argp);
+       case FS_IOC_MEASURE_VERITY:
+               return fsverity_ioctl_measure(file, argp);
         }
  
         return -ENOTTY;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c

index 0d2e3ab2fc3144e037c025cd11e812255bd3b04f..2bdc544b4c951df36ce438f48e02de1cdefff042 100644 (file)
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1353,6 +1353,9 @@ static int btrfs_fill_super(struct super_block *sb,
         sb->s_op = &btrfs_super_ops;
         sb->s_d_op = &btrfs_dentry_operations;
         sb->s_export_op = &btrfs_export_ops;
+#ifdef CONFIG_FS_VERITY
+       sb->s_vop = &btrfs_verityops;
+#endif
         sb->s_xattr = btrfs_xattr_handlers;
         sb->s_time_gran = 1;
  #ifdef CONFIG_BTRFS_FS_POSIX_ACL
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c

index d9d53a255ef97546d00e9bb2f4cefc64d40fea42..bfe5e27617b072042af3471483f348d75d44cf10 100644 (file)
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -267,6 +267,9 @@ BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
  #ifdef CONFIG_BTRFS_DEBUG
  BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
  #endif
+#ifdef CONFIG_FS_VERITY
+BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
+#endif
  
  static struct attribute *btrfs_supported_feature_attrs[] = {
         BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -284,6 +287,9 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
         BTRFS_FEAT_ATTR_PTR(raid1c34),
  #ifdef CONFIG_BTRFS_DEBUG
         BTRFS_FEAT_ATTR_PTR(zoned),
+#endif
+#ifdef CONFIG_FS_VERITY
+       BTRFS_FEAT_ATTR_PTR(verity),
  #endif
         NULL
  };
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c

new file mode 100644 (file)

index 0000000..ac4c2ca
--- /dev/null
+++ b/fs/btrfs/verity.c
@@ -0,0 +1,738 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/iversion.h>
+#include <linux/fsverity.h>
+#include <linux/sched/mm.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+
+/*
+ * Implementation of the interface defined in struct fsverity_operations.
+ *
+ * The main question is how and where to store the verity descriptor and the
+ * Merkle tree. We store both in dedicated btree items in the filesystem tree,
+ * together with the rest of the inode metadata. This means we'll need to do
+ * extra work to encrypt them once encryption is supported in btrfs, but btrfs
+ * has a lot of careful code around i_size and it seems better to make a new key
+ * type than try and adjust all of our expectations for i_size.
+ *
+ * Note that this differs from the implementation in ext4 and f2fs, where
+ * this data is stored as if it were in the file, but past EOF. However, btrfs
+ * does not have a widespread mechanism for caching opaque metadata pages, so we
+ * do pretend that the Merkle tree pages themselves are past EOF for the
+ * purposes of caching them (as opposed to creating a virtual inode).
+ *
+ * fs verity items are stored under two different key types on disk.
+ * The descriptor items:
+ * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
+ *
+ * At offset 0, we store a btrfs_verity_descriptor_item which tracks the
+ * size of the descriptor item and some extra data for encryption.
+ * Starting at offset 1, these hold the generic fs verity descriptor.
+ * The latter are opaque to btrfs, we just read and write them as a blob for
+ * the higher level verity code.  The most common descriptor size is 256 bytes.
+ *
+ * The merkle tree items:
+ * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
+ *
+ * These also start at offset 0, and correspond to the merkle tree bytes.
+ * So when fsverity asks for page 0 of the merkle tree, we pull up one page
+ * starting at offset 0 for this key type.  These are also opaque to btrfs,
+ * we're blindly storing whatever fsverity sends down.
+ */
+
+#define MERKLE_START_ALIGN                     65536
+
+/*
+ * Compute the logical file offset where we cache the Merkle tree.
+ *
+ * @inode:  inode of the verity file
+ *
+ * For the purposes of caching the Merkle tree pages, as required by
+ * fs-verity, it is convenient to do size computations in terms of a file
+ * offset, rather than in terms of page indices.
+ *
+ * Use 64K to be sure it's past the last page in the file, even with 64K pages.
+ * That rounding operation itself can overflow loff_t, so we do it in u64 and
+ * check.
+ *
+ * Returns the file offset on success, negative error code on failure.
+ */
+static loff_t merkle_file_pos(const struct inode *inode)
+{
+       u64 sz = inode->i_size;
+       u64 rounded = round_up(sz, MERKLE_START_ALIGN);
+
+       if (rounded > inode->i_sb->s_maxbytes)
+               return -EFBIG;
+
+       return rounded;
+}
+
+/*
+ * Drop all the items for this inode with this key_type.
+ *
+ * @inode:     inode to drop items for
+ * @key_type:  type of items to drop (BTRFS_VERITY_DESC_ITEM or
+ *             BTRFS_VERITY_MERKLE_ITEM)
+ *
+ * Before doing a verity enable we cleanup any existing verity items.
+ * This is also used to clean up if a verity enable failed half way through.
+ *
+ * Returns number of dropped items on success, negative error code on failure.
+ */
+static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = inode->root;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int count = 0;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       while (1) {
+               /* 1 for the item being dropped */
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       goto out;
+               }
+
+               /*
+                * Walk backwards through all the items until we find one that
+                * isn't from our key type or objectid
+                */
+               key.objectid = btrfs_ino(inode);
+               key.type = key_type;
+               key.offset = (u64)-1;
+
+               ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+               if (ret > 0) {
+                       ret = 0;
+                       /* No more keys of this type, we're done */
+                       if (path->slots[0] == 0)
+                               break;
+                       path->slots[0]--;
+               } else if (ret < 0) {
+                       btrfs_end_transaction(trans);
+                       goto out;
+               }
+
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+               /* No more keys of this type, we're done */
+               if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+                       break;
+
+               /*
+                * This shouldn't be a performance sensitive function because
+                * it's not used as part of truncate.  If it ever becomes
+                * perf sensitive, change this to walk forward and bulk delete
+                * items
+                */
+               ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
+               if (ret) {
+                       btrfs_end_transaction(trans);
+                       goto out;
+               }
+               count++;
+               btrfs_release_path(path);
+               btrfs_end_transaction(trans);
+       }
+       ret = count;
+       btrfs_end_transaction(trans);
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Drop all verity items
+ *
+ * @inode:  inode to drop verity items for
+ *
+ * In most contexts where we are dropping verity items, we want to do it for all
+ * the types of verity items, not a particular one.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+       int ret;
+
+       ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY);
+       if (ret < 0)
+               return ret;
+       ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY);
+       if (ret < 0)
+               return ret;
+
+       return 0;
+}
+
+/*
+ * Insert and write inode items with a given key type and offset.
+ *
+ * @inode:     inode to insert for
+ * @key_type:  key type to insert
+ * @offset:    item offset to insert at
+ * @src:       source data to write
+ * @len:       length of source data to write
+ *
+ * Write len bytes from src into items of up to 2K length.
+ * The inserted items will have key (ino, key_type, offset + off) where off is
+ * consecutively increasing from 0 up to the last item ending at offset + len.
+ *
+ * Returns 0 on success and a negative error code on failure.
+ */
+static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+                          const char *src, u64 len)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_path *path;
+       struct btrfs_root *root = inode->root;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       unsigned long copy_bytes;
+       unsigned long src_offset = 0;
+       void *data;
+       int ret = 0;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       while (len > 0) {
+               /* 1 for the new item being inserted */
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
+               }
+
+               key.objectid = btrfs_ino(inode);
+               key.type = key_type;
+               key.offset = offset;
+
+               /*
+                * Insert 2K at a time mostly to be friendly for smaller leaf
+                * size filesystems
+                */
+               copy_bytes = min_t(u64, len, 2048);
+
+               ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes);
+               if (ret) {
+                       btrfs_end_transaction(trans);
+                       break;
+               }
+
+               leaf = path->nodes[0];
+
+               data = btrfs_item_ptr(leaf, path->slots[0], void);
+               write_extent_buffer(leaf, src + src_offset,
+                                   (unsigned long)data, copy_bytes);
+               offset += copy_bytes;
+               src_offset += copy_bytes;
+               len -= copy_bytes;
+
+               btrfs_release_path(path);
+               btrfs_end_transaction(trans);
+       }
+
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Read inode items of the given key type and offset from the btree.
+ *
+ * @inode:      inode to read items of
+ * @key_type:   key type to read
+ * @offset:     item offset to read from
+ * @dest:       Buffer to read into. This parameter has slightly tricky
+ *              semantics.  If it is NULL, the function will not do any copying
+ *              and will just return the size of all the items up to len bytes.
+ *              If dest_page is passed, then the function will kmap_local the
+ *              page and ignore dest, but it must still be non-NULL to avoid the
+ *              counting-only behavior.
+ * @len:        length in bytes to read
+ * @dest_page:  copy into this page instead of the dest buffer
+ *
+ * Helper function to read items from the btree.  This returns the number of
+ * bytes read or < 0 for errors.  We can return short reads if the items don't
+ * exist on disk or aren't big enough to fill the desired length.  Supports
+ * reading into a provided buffer (dest) or into the page cache
+ *
+ * Returns number of bytes read or a negative error code on failure.
+ */
+static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+                         char *dest, u64 len, struct page *dest_page)
+{
+       struct btrfs_path *path;
+       struct btrfs_root *root = inode->root;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       u64 item_end;
+       u64 copy_end;
+       int copied = 0;
+       u32 copy_offset;
+       unsigned long copy_bytes;
+       unsigned long dest_offset = 0;
+       void *data;
+       char *kaddr = dest;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       if (dest_page)
+               path->reada = READA_FORWARD;
+
+       key.objectid = btrfs_ino(inode);
+       key.type = key_type;
+       key.offset = offset;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0) {
+               goto out;
+       } else if (ret > 0) {
+               ret = 0;
+               if (path->slots[0] == 0)
+                       goto out;
+               path->slots[0]--;
+       }
+
+       while (len > 0) {
+               leaf = path->nodes[0];
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+               if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+                       break;
+
+               item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset;
+
+               if (copied > 0) {
+                       /*
+                        * Once we've copied something, we want all of the items
+                        * to be sequential
+                        */
+                       if (key.offset != offset)
+                               break;
+               } else {
+                       /*
+                        * Our initial offset might be in the middle of an
+                        * item.  Make sure it all makes sense.
+                        */
+                       if (key.offset > offset)
+                               break;
+                       if (item_end <= offset)
+                               break;
+               }
+
+               /* desc = NULL to just sum all the item lengths */
+               if (!dest)
+                       copy_end = item_end;
+               else
+                       copy_end = min(offset + len, item_end);
+
+               /* Number of bytes in this item we want to copy */
+               copy_bytes = copy_end - offset;
+
+               /* Offset from the start of item for copying */
+               copy_offset = offset - key.offset;
+
+               if (dest) {
+                       if (dest_page)
+                               kaddr = kmap_local_page(dest_page);
+
+                       data = btrfs_item_ptr(leaf, path->slots[0], void);
+                       read_extent_buffer(leaf, kaddr + dest_offset,
+                                          (unsigned long)data + copy_offset,
+                                          copy_bytes);
+
+                       if (dest_page)
+                               kunmap_local(kaddr);
+               }
+
+               offset += copy_bytes;
+               dest_offset += copy_bytes;
+               len -= copy_bytes;
+               copied += copy_bytes;
+
+               path->slots[0]++;
+               if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+                       /*
+                        * We've reached the last slot in this leaf and we need
+                        * to go to the next leaf.
+                        */
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0) {
+                               break;
+                       } else if (ret > 0) {
+                               ret = 0;
+                               break;
+                       }
+               }
+       }
+out:
+       btrfs_free_path(path);
+       if (!ret)
+               ret = copied;
+       return ret;
+}
+
+/*
+ * Rollback in-progress verity if we encounter an error.
+ *
+ * @inode:  inode verity had an error for
+ *
+ * We try to handle recoverable errors while enabling verity by rolling it back
+ * and just failing the operation, rather than having an fs level error no
+ * matter what. However, any error in rollback is unrecoverable.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int rollback_verity(struct btrfs_inode *inode)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = inode->root;
+       int ret;
+
+       ASSERT(inode_is_locked(&inode->vfs_inode));
+       truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size);
+       clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+       ret = btrfs_drop_verity_items(inode);
+       if (ret) {
+               btrfs_handle_fs_error(root->fs_info, ret,
+                               "failed to drop verity items in rollback %llu",
+                               (u64)inode->vfs_inode.i_ino);
+               goto out;
+       }
+
+       /* 1 for updating the inode flag */
+       trans = btrfs_start_transaction(root, 1);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               btrfs_handle_fs_error(root->fs_info, ret,
+                       "failed to start transaction in verity rollback %llu",
+                       (u64)inode->vfs_inode.i_ino);
+               goto out;
+       }
+       inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
+       btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+       ret = btrfs_update_inode(trans, root, inode);
+       if (ret) {
+               btrfs_abort_transaction(trans, ret);
+               goto out;
+       }
+       btrfs_end_transaction(trans);
+out:
+       return ret;
+}
+
+/*
+ * Finalize making the file a valid verity file
+ *
+ * @inode:      inode to be marked as verity
+ * @desc:       contents of the verity descriptor to write (not NULL)
+ * @desc_size:  size of the verity descriptor
+ *
+ * Do the actual work of finalizing verity after successfully writing the Merkle
+ * tree:
+ *
+ * - write out the descriptor items
+ * - mark the inode with the verity flag
+ * - mark the ro compat bit
+ * - clear the in progress bit
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int finish_verity(struct btrfs_inode *inode, const void *desc,
+                        size_t desc_size)
+{
+       struct btrfs_trans_handle *trans = NULL;
+       struct btrfs_root *root = inode->root;
+       struct btrfs_verity_descriptor_item item;
+       int ret;
+
+       /* Write out the descriptor item */
+       memset(&item, 0, sizeof(item));
+       btrfs_set_stack_verity_descriptor_size(&item, desc_size);
+       ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0,
+                             (const char *)&item, sizeof(item));
+       if (ret)
+               goto out;
+
+       /* Write out the descriptor itself */
+       ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1,
+                             desc, desc_size);
+       if (ret)
+               goto out;
+
+       /* 1 for updating the inode flag */
+       trans = btrfs_start_transaction(root, 1);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out;
+       }
+       inode->ro_flags |= BTRFS_INODE_RO_VERITY;
+       btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+       ret = btrfs_update_inode(trans, root, inode);
+       if (ret)
+               goto end_trans;
+       clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+       btrfs_set_fs_compat_ro(root->fs_info, VERITY);
+end_trans:
+       btrfs_end_transaction(trans);
+out:
+       return ret;
+
+}
+
+/*
+ * fsverity op that begins enabling verity.
+ *
+ * @filp:  file to enable verity on
+ *
+ * Begin enabling fsverity for the file. We drop any existing verity items
+ * and set the in progress bit.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int btrfs_begin_enable_verity(struct file *filp)
+{
+       struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+       int ret;
+
+       ASSERT(inode_is_locked(file_inode(filp)));
+
+       if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
+               return -EBUSY;
+
+       ret = btrfs_drop_verity_items(inode);
+       if (ret)
+               return ret;
+
+       set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+
+       return 0;
+}
+
+/*
+ * fsverity op that ends enabling verity.
+ *
+ * @filp:              file we are finishing enabling verity on
+ * @desc:              verity descriptor to write out (NULL in error conditions)
+ * @desc_size:         size of the verity descriptor (variable with signatures)
+ * @merkle_tree_size:  size of the merkle tree in bytes
+ *
+ * If desc is null, then VFS is signaling an error occurred during verity
+ * enable, and we should try to rollback. Otherwise, attempt to finish verity.
+ *
+ * Returns 0 on success, negative error code on error.
+ */
+static int btrfs_end_enable_verity(struct file *filp, const void *desc,
+                                  size_t desc_size, u64 merkle_tree_size)
+{
+       struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+       int ret = 0;
+       int rollback_ret;
+
+       ASSERT(inode_is_locked(file_inode(filp)));
+
+       if (desc == NULL)
+               goto rollback;
+
+       ret = finish_verity(inode, desc, desc_size);
+       if (ret)
+               goto rollback;
+       return ret;
+
+rollback:
+       rollback_ret = rollback_verity(inode);
+       if (rollback_ret)
+               btrfs_err(inode->root->fs_info,
+                         "failed to rollback verity items: %d", rollback_ret);
+       return ret;
+}
+
+/*
+ * fsverity op that gets the struct fsverity_descriptor.
+ *
+ * @inode:     inode to get the descriptor of
+ * @buf:       output buffer for the descriptor contents
+ * @buf_size:  size of the output buffer. 0 to query the size
+ *
+ * fsverity does a two pass setup for reading the descriptor, in the first pass
+ * it calls with buf_size = 0 to query the size of the descriptor, and then in
+ * the second pass it actually reads the descriptor off disk.
+ *
+ * Returns the size on success or a negative error code on failure.
+ */
+static int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+                                      size_t buf_size)
+{
+       u64 true_size;
+       int ret = 0;
+       struct btrfs_verity_descriptor_item item;
+
+       memset(&item, 0, sizeof(item));
+       ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0,
+                            (char *)&item, sizeof(item), NULL);
+       if (ret < 0)
+               return ret;
+
+       if (item.reserved[0] != 0 || item.reserved[1] != 0)
+               return -EUCLEAN;
+
+       true_size = btrfs_stack_verity_descriptor_size(&item);
+       if (true_size > INT_MAX)
+               return -EUCLEAN;
+
+       if (buf_size == 0)
+               return true_size;
+       if (buf_size < true_size)
+               return -ERANGE;
+
+       ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1,
+                            buf, buf_size, NULL);
+       if (ret < 0)
+               return ret;
+       if (ret != true_size)
+               return -EIO;
+
+       return true_size;
+}
+
+/*
+ * fsverity op that reads and caches a merkle tree page.
+ *
+ * @inode:         inode to read a merkle tree page for
+ * @index:         page index relative to the start of the merkle tree
+ * @num_ra_pages:  number of pages to readahead. Optional, we ignore it
+ *
+ * The Merkle tree is stored in the filesystem btree, but its pages are cached
+ * with a logical position past EOF in the inode's mapping.
+ *
+ * Returns the page we read, or an ERR_PTR on error.
+ */
+static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
+                                               pgoff_t index,
+                                               unsigned long num_ra_pages)
+{
+       struct page *page;
+       u64 off = (u64)index << PAGE_SHIFT;
+       loff_t merkle_pos = merkle_file_pos(inode);
+       int ret;
+
+       if (merkle_pos < 0)
+               return ERR_PTR(merkle_pos);
+       if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE)
+               return ERR_PTR(-EFBIG);
+       index += merkle_pos >> PAGE_SHIFT;
+again:
+       page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
+       if (page) {
+               if (PageUptodate(page))
+                       return page;
+
+               lock_page(page);
+               /*
+                * We only insert uptodate pages, so !Uptodate has to be
+                * an error
+                */
+               if (!PageUptodate(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       return ERR_PTR(-EIO);
+               }
+               unlock_page(page);
+               return page;
+       }
+
+       page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
+       if (!page)
+               return ERR_PTR(-ENOMEM);
+
+       /*
+        * Merkle item keys are indexed from byte 0 in the merkle tree.
+        * They have the form:
+        *
+        * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ]
+        */
+       ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
+                            page_address(page), PAGE_SIZE, page);
+       if (ret < 0) {
+               put_page(page);
+               return ERR_PTR(ret);
+       }
+       if (ret < PAGE_SIZE)
+               memzero_page(page, ret, PAGE_SIZE - ret);
+
+       SetPageUptodate(page);
+       ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS);
+
+       if (!ret) {
+               /* Inserted and ready for fsverity */
+               unlock_page(page);
+       } else {
+               put_page(page);
+               /* Did someone race us into inserting this page? */
+               if (ret == -EEXIST)
+                       goto again;
+               page = ERR_PTR(ret);
+       }
+       return page;
+}
+
+/*
+ * fsverity op that writes a Merkle tree block into the btree.
+ *
+ * @inode:          inode to write a Merkle tree block for
+ * @buf:            Merkle tree data block to write
+ * @index:          index of the block in the Merkle tree
+ * @log_blocksize:  log base 2 of the Merkle tree block size
+ *
+ * Note that the block size could be different from the page size, so it is not
+ * safe to assume that index is a page index.
+ *
+ * Returns 0 on success or negative error code on failure
+ */
+static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
+                                       u64 index, int log_blocksize)
+{
+       u64 off = index << log_blocksize;
+       u64 len = 1ULL << log_blocksize;
+       loff_t merkle_pos = merkle_file_pos(inode);
+
+       if (merkle_pos < 0)
+               return merkle_pos;
+       if (merkle_pos > inode->i_sb->s_maxbytes - off - len)
+               return -EFBIG;
+
+       return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY,
+                              off, buf, len);
+}
+
+const struct fsverity_operations btrfs_verityops = {
+       .begin_enable_verity     = btrfs_begin_enable_verity,
+       .end_enable_verity       = btrfs_end_enable_verity,
+       .get_verity_descriptor   = btrfs_get_verity_descriptor,
+       .read_merkle_tree_page   = btrfs_read_merkle_tree_page,
+       .write_merkle_tree_block = btrfs_write_merkle_tree_block,
+};
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h

index 22cd037123fadba03271eef6879064e703beec26..d7d3cfead056354ab28db9699fb4ab87b1d3afef 100644 (file)
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -288,6 +288,7 @@ struct btrfs_ioctl_fs_info_args {
   * first mount when booting older kernel versions.
   */
  #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID  (1ULL << 1)
+#define BTRFS_FEATURE_COMPAT_RO_VERITY                 (1ULL << 2)
  
  #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF   (1ULL << 0)
  #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL  (1ULL << 1)
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h

index ccdb40fe40dc254e120e86007b586cee548c2735..e1c4c732aabac20140cfb2924745b9a67448966c 100644 (file)
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -118,6 +118,29 @@
  #define BTRFS_INODE_REF_KEY            12
  #define BTRFS_INODE_EXTREF_KEY         13
  #define BTRFS_XATTR_ITEM_KEY           24
+
+/*
+ * fs verity items are stored under two different key types on disk.
+ * The descriptor items:
+ * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
+ *
+ * At offset 0, we store a btrfs_verity_descriptor_item which tracks the size
+ * of the descriptor item and some extra data for encryption.
+ * Starting at offset 1, these hold the generic fs verity descriptor.  The
+ * latter are opaque to btrfs, we just read and write them as a blob for the
+ * higher level verity code.  The most common descriptor size is 256 bytes.
+ *
+ * The merkle tree items:
+ * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
+ *
+ * These also start at offset 0, and correspond to the merkle tree bytes.  When
+ * fsverity asks for page 0 of the merkle tree, we pull up one page starting at
+ * offset 0 for this key type.  These are also opaque to btrfs, we're blindly
+ * storing whatever fsverity sends down.
+ */
+#define BTRFS_VERITY_DESC_ITEM_KEY     36
+#define BTRFS_VERITY_MERKLE_ITEM_KEY   37
+
  #define BTRFS_ORPHAN_ITEM_KEY          48
  /* reserve 2-15 close to the inode for later flexibility */
  
@@ -991,4 +1014,16 @@ struct btrfs_qgroup_limit_item {
         __le64 rsv_excl;
  } __attribute__ ((__packed__));
  
+struct btrfs_verity_descriptor_item {
+       /* Size of the verity descriptor in bytes */
+       __le64 size;
+       /*
+        * When we implement support for fscrypt, we will need to encrypt the
+        * Merkle tree for encrypted verity files. These 128 bits are for the
+        * eventual storage of an fscrypt initialization vector.
+        */
+       __le64 reserved[2];
+       __u8 encryption;
+} __attribute__ ((__packed__));
+
  #endif /* _BTRFS_CTREE_H_ */
author	Boris Burkov <boris@bur.io>
	Wed, 30 Jun 2021 20:01:49 +0000 (13:01 -0700)
committer	David Sterba <dsterba@suse.com>
	Mon, 23 Aug 2021 11:19:09 +0000 (13:19 +0200)
fs/btrfs/Makefile		patch \| blob \| history
fs/btrfs/btrfs_inode.h		patch \| blob \| history
fs/btrfs/ctree.h		patch \| blob \| history
fs/btrfs/extent_io.c		patch \| blob \| history
fs/btrfs/file.c		patch \| blob \| history
fs/btrfs/inode.c		patch \| blob \| history
fs/btrfs/ioctl.c		patch \| blob \| history
fs/btrfs/super.c		patch \| blob \| history
fs/btrfs/sysfs.c		patch \| blob \| history
fs/btrfs/verity.c	[new file with mode: 0644]	patch \| blob
include/uapi/linux/btrfs.h		patch \| blob \| history
include/uapi/linux/btrfs_tree.h		patch \| blob \| history