git.baikalelectronics.ru Git - kernel.git/commitdiff
dm writecache: improve performance of large linear writes on SSDs
author Mikulas Patocka <mpatocka@redhat.com>
Wed, 15 Jan 2020 09:35:22 +0000 (04:35 -0500)
committer Mike Snitzer <snitzer@redhat.com>
Thu, 16 Jan 2020 18:34:17 +0000 (13:34 -0500)
When dm-writecache is used with SSD as a cache device, it would submit a
separate bio for each written block. The I/Os would be merged by the disk
scheduler, but this merging degrades performance.

Improve dm-writecache performance by submitting larger bios - this is
possible as long as there is consecutive free space on the cache
device.

Benchmark (arm64 with 64k page size, using /dev/ram0 as a cache device):

fio --bs=512k --iodepth=32 --size=400M --direct=1 \
    --filename=/dev/mapper/cache --rw=randwrite --numjobs=1 --name=test

block old new
size MiB/s MiB/s
---------------------
512 181 700
1k 347 1256
2k 644 2020
4k 1183 2759
8k 1852 3333
16k 2469 3509
32k 2974 3670
64k 3404 3810

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
drivers/md/dm-writecache.c

index 9b0a3bf6a4a18dbf00918238effc4810d961d138..b9e27e37a94373c1cbc8ca52f3b64c0d72d46228 100644 (file)
@@ -625,7 +625,7 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry
        wc->freelist_size++;
 }
 
-static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
+static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
 {
        struct wc_entry *e;
 
@@ -634,6 +634,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
                if (unlikely(!wc->current_free))
                        return NULL;
                e = wc->current_free;
+               if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+                       return NULL;
                next = rb_next(&e->rb_node);
                rb_erase(&e->rb_node, &wc->freetree);
                if (unlikely(!next))
@@ -643,6 +645,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
                if (unlikely(list_empty(&wc->freelist)))
                        return NULL;
                e = container_of(wc->freelist.next, struct wc_entry, lru);
+               if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+                       return NULL;
                list_del(&e->lru);
        }
        wc->freelist_size--;
@@ -1193,7 +1197,7 @@ read_next_block:
                                        goto bio_copy;
                                }
                        }
-                       e = writecache_pop_from_freelist(wc);
+                       e = writecache_pop_from_freelist(wc, (sector_t)-1);
                        if (unlikely(!e)) {
                                writecache_wait_on_freelist(wc);
                                continue;
@@ -1205,9 +1209,26 @@ bio_copy:
                        if (WC_MODE_PMEM(wc)) {
                                bio_copy_block(wc, bio, memory_data(wc, e));
                        } else {
-                               dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+                               unsigned bio_size = wc->block_size;
+                               sector_t start_cache_sec = cache_sector(wc, e);
+                               sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
+
+                               while (bio_size < bio->bi_iter.bi_size) {
+                                       struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+                                       if (!f)
+                                               break;
+                                       write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
+                                                                       (bio_size >> SECTOR_SHIFT), wc->seq_count);
+                                       writecache_insert_entry(wc, f);
+                                       wc->uncommitted_blocks++;
+                                       bio_size += wc->block_size;
+                                       current_cache_sec += wc->block_size >> SECTOR_SHIFT;
+                               }
+
                                bio_set_dev(bio, wc->ssd_dev->bdev);
-                               bio->bi_iter.bi_sector = cache_sector(wc, e);
+                               bio->bi_iter.bi_sector = start_cache_sec;
+                               dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+
                                if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
                                        wc->uncommitted_blocks = 0;
                                        queue_work(wc->writeback_wq, &wc->flush_work);