From: Linus Torvalds Date: Sun, 14 Jun 2020 16:47:25 +0000 (-0700) Subject: Merge tag 'for-5.8-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave... X-Git-Tag: baikal/mips/sdk5.9~13367 X-Git-Url: https://git.baikalelectronics.ru/?a=commitdiff_plain;h=0362d4351b99ca74a29b897635bd92e5bfc1f158;p=kernel.git Merge tag 'for-5.8-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux Pull btrfs updates from David Sterba: "This reverts the direct io port to iomap infrastructure of btrfs merged in the first pull request. We found problems in invalidate page that don't seem to be fixable as regressions or without changing iomap code that would not affect other filesystems. There are four reverts in total, but three of them are followup cleanups needed to revert fbc996e547bd cleanly. The result is the buffer head based implementation of direct io. Reverts are not great, but under current circumstances I don't see better options" * tag 'for-5.8-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: Revert "btrfs: switch to iomap_dio_rw() for dio" Revert "fs: remove dio_end_io()" Revert "btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK" Revert "btrfs: split btrfs_direct_IO to read and write part" --- 0362d4351b99ca74a29b897635bd92e5bfc1f158 diff --cc fs/btrfs/inode.c index 31ac8c682f198,12b5d61f23bb1..d04c82c884181 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@@ -7816,18 -7789,150 +7789,148 @@@ static void btrfs_submit_direct(struct out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); - return BLK_QC_T_NONE; } - const struct iomap_ops btrfs_dio_iomap_ops = { - .iomap_begin = btrfs_dio_iomap_begin, - .iomap_end = btrfs_dio_iomap_end, - }; + static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) + { + int seg; + int i; + unsigned int blocksize_mask = fs_info->sectorsize - 1; + ssize_t retval = -EINVAL; - const struct iomap_dio_ops btrfs_dops = { - .submit_io = 
btrfs_submit_direct, - }; + if (offset & blocksize_mask) + goto out; + + if (iov_iter_alignment(iter) & blocksize_mask) + goto out; + + /* If this is a write we don't need to check anymore */ + if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter)) + return 0; + /* + * Check to make sure we don't have duplicate iov_base's in this + * iovec, if so return EINVAL, otherwise we'll get csum errors + * when reading back. + */ + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + if (iter->iov[seg].iov_base == iter->iov[i].iov_base) + goto out; + } + } + retval = 0; + out: + return retval; + } + + static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) + { + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_dio_data dio_data = { 0 }; + struct extent_changeset *data_reserved = NULL; + loff_t offset = iocb->ki_pos; + size_t count = 0; + int flags = 0; + bool wakeup = true; + bool relock = false; + ssize_t ret; + + if (check_direct_IO(fs_info, iter, offset)) + return 0; + + inode_dio_begin(inode); + + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so + * we need to flush the dirty pages again to make absolutely sure + * that any outstanding dirty pages are on disk. + */ + count = iov_iter_count(iter); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, offset, + offset + count - 1); + + if (iov_iter_rw(iter) == WRITE) { + /* + * If the write DIO is beyond the EOF, we need to update + * the isize, but it is protected by i_mutex. So we can + * not unlock the i_mutex in this case. 
+ */ + if (offset + count <= inode->i_size) { + dio_data.overwrite = 1; + inode_unlock(inode); + relock = true; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + offset, count); + if (ret) + goto out; + + /* + * We need to know how many extents we reserved so that we can + * do the accounting properly if we go over the number we + * originally calculated. Abuse current->journal_info for this. + */ + dio_data.reserve = round_up(count, + fs_info->sectorsize); + dio_data.unsubmitted_oe_range_start = (u64)offset; + dio_data.unsubmitted_oe_range_end = (u64)offset; + current->journal_info = &dio_data; + down_read(&BTRFS_I(inode)->dio_sem); + } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags)) { + inode_dio_end(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; + } + + ret = __blockdev_direct_IO(iocb, inode, + fs_info->fs_devices->latest_bdev, + iter, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); + if (iov_iter_rw(iter) == WRITE) { + up_read(&BTRFS_I(inode)->dio_sem); + current->journal_info = NULL; + if (ret < 0 && ret != -EIOCBQUEUED) { + if (dio_data.reserve) + btrfs_delalloc_release_space(inode, data_reserved, + offset, dio_data.reserve, true); + /* + * On error we might have left some ordered extents + * without submitting corresponding bios for them, so + * clean them up to avoid other tasks getting them + * and waiting for them to complete forever. 
+ */ + if (dio_data.unsubmitted_oe_range_start < + dio_data.unsubmitted_oe_range_end) + __endio_write_update_ordered(inode, + dio_data.unsubmitted_oe_range_start, + dio_data.unsubmitted_oe_range_end - + dio_data.unsubmitted_oe_range_start, + false); + } else if (ret >= 0 && (size_t)ret < count) + btrfs_delalloc_release_space(inode, data_reserved, + offset, count - (size_t)ret, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), count); + } + out: + if (wakeup) + inode_dio_end(inode); + if (relock) + inode_lock(inode); + + extent_changeset_free(data_reserved); + return ret; + } -#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) - static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { @@@ -10121,8 -10241,8 +10224,8 @@@ static const struct address_space_opera .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, - .readpages = btrfs_readpages, + .readahead = btrfs_readahead, - .direct_IO = noop_direct_IO, + .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION