Skip to content

Commit 2cfa582

Browse files
committed
Merge tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - Various DM persistent-data library improvements and fixes that benefit both the DM thinp and cache targets. - A few small DM kcopyd efficiency improvements. - Significant zoned related block core, DM core and DM zoned target changes that culminate with adding zoned append emulation (which is required to properly fix DM crypt's zoned support). - Various DM writecache target changes that improve efficiency. Adds an optional "metadata_only" feature that only promotes bios flagged with REQ_META. But the most significant improvement is writecache's ability to pause writeback, for a configurable time, if/when the working set is larger than the cache (and the cache is full) -- this ensures performance is no worse than the slower origin device. * tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (35 commits) dm writecache: make writeback pause configurable dm writecache: pause writeback if cache full and origin being written directly dm io tracker: factor out IO tracker dm btree remove: assign new_root only when removal succeeds dm zone: fix dm_revalidate_zones() memory allocation dm ps io affinity: remove redundant continue statement dm writecache: add optional "metadata_only" parameter dm writecache: add "cleaner" and "max_age" to Documentation dm writecache: write at least 4k when committing dm writecache: flush origin device when writing and cache is full dm writecache: have ssd writeback wait if the kcopyd workqueue is busy dm writecache: use list_move instead of list_del/list_add in writecache_writeback() dm writecache: commit just one block, not a full page dm writecache: remove unused gfp_t argument from wc_add_block() dm crypt: Fix zoned block device support dm: introduce zone append emulation dm: rearrange core declarations for extended use from dm-zone.c block: introduce BIO_ZONE_WRITE_LOCKED bio flag block: introduce bio zone helpers block: improve handling of all zones reset operation ...
2 parents dbe69e4 + 5c0de3d commit 2cfa582

38 files changed

+2548
-622
lines changed

Documentation/admin-guide/device-mapper/writecache.rst

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ first sector should contain valid superblock from previous invocation.
1212
Constructor parameters:
1313

1414
1. type of the cache device - "p" or "s"
15-
1615
- p - persistent memory
1716
- s - SSD
1817
2. the underlying device that will be cached
@@ -21,7 +20,6 @@ Constructor parameters:
2120
size)
2221
5. the number of optional parameters (the parameters with an argument
2322
count as two)
24-
2523
start_sector n (default: 0)
2624
offset from the start of cache device in 512-byte sectors
2725
high_watermark n (default: 50)
@@ -53,6 +51,27 @@ Constructor parameters:
5351

5452
- some underlying devices perform better with fua, some
5553
with nofua. The user should test it
54+
cleaner
55+
when this option is activated (either in the constructor
56+
arguments or by a message), the cache will not promote
57+
new writes (however, writes to already cached blocks are
58+
promoted, to avoid data corruption due to misordered
59+
writes) and it will gradually writeback any cached
60+
data. The userspace can then monitor the cleaning
61+
process with "dmsetup status". When the number of cached
62+
blocks drops to zero, userspace can unload the
63+
dm-writecache target and replace it with dm-linear or
64+
other targets.
65+
max_age n
66+
specifies the maximum age of a block in milliseconds. If
67+
a block is stored in the cache for too long, it will be
68+
written to the underlying device and cleaned up.
69+
metadata_only
70+
only metadata is promoted to the cache. This option
71+
improves performance for heavier REQ_META workloads.
72+
pause_writeback n (default: 3000)
73+
pause writeback if there was some write I/O redirected to
74+
the origin volume in the last n milliseconds
5675

5776
Status:
5877
1. error indicator - 0 if there was no error, otherwise error number
@@ -77,3 +96,5 @@ Messages:
7796
5. resume the device, so that it will use the linear
7897
target
7998
6. the cache device is now inactive and it can be deleted
99+
cleaner
100+
See above "cleaner" constructor documentation.

block/blk-zoned.c

Lines changed: 92 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -161,18 +161,89 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
161161
}
162162
EXPORT_SYMBOL_GPL(blkdev_report_zones);
163163

164-
static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
165-
sector_t sector,
166-
sector_t nr_sectors)
164+
static inline unsigned long *blk_alloc_zone_bitmap(int node,
165+
unsigned int nr_zones)
167166
{
168-
if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
169-
return false;
167+
return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
168+
GFP_NOIO, node);
169+
}
170170

171+
static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
172+
void *data)
173+
{
171174
/*
172-
* REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors
173-
* of the applicable zone range is the entire disk.
175+
* For an all-zones reset, ignore conventional, empty, read-only
176+
* and offline zones.
174177
*/
175-
return !sector && nr_sectors == get_capacity(bdev->bd_disk);
178+
switch (zone->cond) {
179+
case BLK_ZONE_COND_NOT_WP:
180+
case BLK_ZONE_COND_EMPTY:
181+
case BLK_ZONE_COND_READONLY:
182+
case BLK_ZONE_COND_OFFLINE:
183+
return 0;
184+
default:
185+
set_bit(idx, (unsigned long *)data);
186+
return 0;
187+
}
188+
}
189+
190+
static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
191+
gfp_t gfp_mask)
192+
{
193+
struct request_queue *q = bdev_get_queue(bdev);
194+
sector_t capacity = get_capacity(bdev->bd_disk);
195+
sector_t zone_sectors = blk_queue_zone_sectors(q);
196+
unsigned long *need_reset;
197+
struct bio *bio = NULL;
198+
sector_t sector = 0;
199+
int ret;
200+
201+
need_reset = blk_alloc_zone_bitmap(q->node, q->nr_zones);
202+
if (!need_reset)
203+
return -ENOMEM;
204+
205+
ret = bdev->bd_disk->fops->report_zones(bdev->bd_disk, 0,
206+
q->nr_zones, blk_zone_need_reset_cb,
207+
need_reset);
208+
if (ret < 0)
209+
goto out_free_need_reset;
210+
211+
ret = 0;
212+
while (sector < capacity) {
213+
if (!test_bit(blk_queue_zone_no(q, sector), need_reset)) {
214+
sector += zone_sectors;
215+
continue;
216+
}
217+
218+
bio = blk_next_bio(bio, 0, gfp_mask);
219+
bio_set_dev(bio, bdev);
220+
bio->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC;
221+
bio->bi_iter.bi_sector = sector;
222+
sector += zone_sectors;
223+
224+
/* This may take a while, so be nice to others */
225+
cond_resched();
226+
}
227+
228+
if (bio) {
229+
ret = submit_bio_wait(bio);
230+
bio_put(bio);
231+
}
232+
233+
out_free_need_reset:
234+
kfree(need_reset);
235+
return ret;
236+
}
237+
238+
static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
239+
{
240+
struct bio bio;
241+
242+
bio_init(&bio, NULL, 0);
243+
bio_set_dev(&bio, bdev);
244+
bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
245+
246+
return submit_bio_wait(&bio);
176247
}
177248

178249
/**
@@ -200,7 +271,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
200271
sector_t capacity = get_capacity(bdev->bd_disk);
201272
sector_t end_sector = sector + nr_sectors;
202273
struct bio *bio = NULL;
203-
int ret;
274+
int ret = 0;
204275

205276
if (!blk_queue_is_zoned(q))
206277
return -EOPNOTSUPP;
@@ -222,20 +293,21 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
222293
if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
223294
return -EINVAL;
224295

296+
/*
297+
* In the case of a zone reset operation over all zones,
298+
* REQ_OP_ZONE_RESET_ALL can be used with devices supporting this
299+
* command. For other devices, we emulate this command behavior by
300+
* identifying the zones needing a reset.
301+
*/
302+
if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
303+
if (!blk_queue_zone_resetall(q))
304+
return blkdev_zone_reset_all_emulated(bdev, gfp_mask);
305+
return blkdev_zone_reset_all(bdev, gfp_mask);
306+
}
307+
225308
while (sector < end_sector) {
226309
bio = blk_next_bio(bio, 0, gfp_mask);
227310
bio_set_dev(bio, bdev);
228-
229-
/*
230-
* Special case for the zone reset operation that reset all
231-
* zones, this is useful for applications like mkfs.
232-
*/
233-
if (op == REQ_OP_ZONE_RESET &&
234-
blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) {
235-
bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
236-
break;
237-
}
238-
239311
bio->bi_opf = op | REQ_SYNC;
240312
bio->bi_iter.bi_sector = sector;
241313
sector += zone_sectors;
@@ -396,13 +468,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
396468
return ret;
397469
}
398470

399-
static inline unsigned long *blk_alloc_zone_bitmap(int node,
400-
unsigned int nr_zones)
401-
{
402-
return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
403-
GFP_NOIO, node);
404-
}
405-
406471
void blk_queue_free_zone_bitmaps(struct request_queue *q)
407472
{
408473
kfree(q->conv_zones_bitmap);

drivers/md/Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ ifeq ($(CONFIG_DM_UEVENT),y)
9292
dm-mod-objs += dm-uevent.o
9393
endif
9494

95+
ifeq ($(CONFIG_BLK_DEV_ZONED),y)
96+
dm-mod-objs += dm-zone.o
97+
endif
98+
9599
ifeq ($(CONFIG_DM_VERITY_FEC),y)
96100
dm-verity-objs += dm-verity-fec.o
97101
endif

drivers/md/dm-cache-target.c

Lines changed: 6 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "dm-bio-prison-v2.h"
99
#include "dm-bio-record.h"
1010
#include "dm-cache-metadata.h"
11+
#include "dm-io-tracker.h"
1112

1213
#include <linux/dm-io.h>
1314
#include <linux/dm-kcopyd.h>
@@ -39,77 +40,6 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
3940

4041
/*----------------------------------------------------------------*/
4142

42-
struct io_tracker {
43-
spinlock_t lock;
44-
45-
/*
46-
* Sectors of in-flight IO.
47-
*/
48-
sector_t in_flight;
49-
50-
/*
51-
* The time, in jiffies, when this device became idle (if it is
52-
* indeed idle).
53-
*/
54-
unsigned long idle_time;
55-
unsigned long last_update_time;
56-
};
57-
58-
static void iot_init(struct io_tracker *iot)
59-
{
60-
spin_lock_init(&iot->lock);
61-
iot->in_flight = 0ul;
62-
iot->idle_time = 0ul;
63-
iot->last_update_time = jiffies;
64-
}
65-
66-
static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
67-
{
68-
if (iot->in_flight)
69-
return false;
70-
71-
return time_after(jiffies, iot->idle_time + jifs);
72-
}
73-
74-
static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
75-
{
76-
bool r;
77-
78-
spin_lock_irq(&iot->lock);
79-
r = __iot_idle_for(iot, jifs);
80-
spin_unlock_irq(&iot->lock);
81-
82-
return r;
83-
}
84-
85-
static void iot_io_begin(struct io_tracker *iot, sector_t len)
86-
{
87-
spin_lock_irq(&iot->lock);
88-
iot->in_flight += len;
89-
spin_unlock_irq(&iot->lock);
90-
}
91-
92-
static void __iot_io_end(struct io_tracker *iot, sector_t len)
93-
{
94-
if (!len)
95-
return;
96-
97-
iot->in_flight -= len;
98-
if (!iot->in_flight)
99-
iot->idle_time = jiffies;
100-
}
101-
102-
static void iot_io_end(struct io_tracker *iot, sector_t len)
103-
{
104-
unsigned long flags;
105-
106-
spin_lock_irqsave(&iot->lock, flags);
107-
__iot_io_end(iot, len);
108-
spin_unlock_irqrestore(&iot->lock, flags);
109-
}
110-
111-
/*----------------------------------------------------------------*/
112-
11343
/*
11444
* Represents a chunk of future work. 'input' allows continuations to pass
11545
* values between themselves, typically error values.
@@ -470,7 +400,7 @@ struct cache {
470400
struct batcher committer;
471401
struct work_struct commit_ws;
472402

473-
struct io_tracker tracker;
403+
struct dm_io_tracker tracker;
474404

475405
mempool_t migration_pool;
476406

@@ -866,15 +796,15 @@ static void accounted_begin(struct cache *cache, struct bio *bio)
866796
if (accountable_bio(cache, bio)) {
867797
pb = get_per_bio_data(bio);
868798
pb->len = bio_sectors(bio);
869-
iot_io_begin(&cache->tracker, pb->len);
799+
dm_iot_io_begin(&cache->tracker, pb->len);
870800
}
871801
}
872802

873803
static void accounted_complete(struct cache *cache, struct bio *bio)
874804
{
875805
struct per_bio_data *pb = get_per_bio_data(bio);
876806

877-
iot_io_end(&cache->tracker, pb->len);
807+
dm_iot_io_end(&cache->tracker, pb->len);
878808
}
879809

880810
static void accounted_request(struct cache *cache, struct bio *bio)
@@ -1642,7 +1572,7 @@ enum busy {
16421572

16431573
static enum busy spare_migration_bandwidth(struct cache *cache)
16441574
{
1645-
bool idle = iot_idle_for(&cache->tracker, HZ);
1575+
bool idle = dm_iot_idle_for(&cache->tracker, HZ);
16461576
sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
16471577
cache->sectors_per_block;
16481578

@@ -2603,7 +2533,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
26032533

26042534
batcher_init(&cache->committer, commit_op, cache,
26052535
issue_op, cache, cache->wq);
2606-
iot_init(&cache->tracker);
2536+
dm_iot_init(&cache->tracker);
26072537

26082538
init_rwsem(&cache->background_work_lock);
26092539
prevent_background_work(cache);

0 commit comments

Comments
 (0)