Skip to content

Commit f7394b2

Browse files
damien-lemoal authored and Sasha Levin committed
dm: Fix dm-zoned-reclaim zone write pointer alignment
commit b76b840 upstream. The zone reclaim processing of the dm-zoned device mapper uses blkdev_issue_zeroout() to align the write pointer of a zone being used for reclaiming another zone, to write the valid data blocks from the zone being reclaimed at the same position relative to the zone start in the reclaim target zone. The first call to blkdev_issue_zeroout() will try to use hardware offload using a REQ_OP_WRITE_ZEROES operation if the device reports a non-zero max_write_zeroes_sectors queue limit. If this operation fails because of the lack of hardware support, blkdev_issue_zeroout() falls back to using a regular write operation with the zero-page as buffer. Currently, such REQ_OP_WRITE_ZEROES failure is automatically handled by the block layer zone write plugging code which will execute a report zones operation to ensure that the write pointer of the target zone of the failed operation has not changed and to "rewind" the zone write pointer offset of the target zone as it was advanced when the write zero operation was submitted. So the REQ_OP_WRITE_ZEROES failure does not cause any issue and blkdev_issue_zeroout() works as expected. However, since the automatic recovery of zone write pointers by the zone write plugging code can potentially cause deadlocks with queue freeze operations, a different recovery must be implemented in preparation for the removal of zone write plugging report zones based recovery. Do this by introducing the new function blk_zone_issue_zeroout(). This function first calls blkdev_issue_zeroout() with the flag BLKDEV_ZERO_NOFALLBACK to intercept failures on the first execution which attempts to use the device hardware offload with the REQ_OP_WRITE_ZEROES operation. If this attempt fails, a report zone operation is issued to restore the zone write pointer offset of the target zone to the correct position and blkdev_issue_zeroout() is called again without the BLKDEV_ZERO_NOFALLBACK flag. 
The report zones operation performing this recovery is implemented using the helper function disk_zone_sync_wp_offset() which calls the gendisk report_zones file operation with the callback disk_report_zones_cb(). This callback updates the target write pointer offset of the target zone using the new function disk_zone_wplug_sync_wp_offset(). dmz_reclaim_align_wp() is modified to change its call to blkdev_issue_zeroout() to a call to blk_zone_issue_zeroout() without any other change needed as the two functions are functionally equivalent. Fixes: dd291d7 ("block: Introduce zone write plugging") Cc: [email protected] Signed-off-by: Damien Le Moal <[email protected]> Reviewed-by: Christoph Hellwig <[email protected]> Acked-by: Mike Snitzer <[email protected]> Reviewed-by: Martin K. Petersen <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Jens Axboe <[email protected]> Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent beb2dbd commit f7394b2

File tree

3 files changed

+124
-26
lines changed

3 files changed

+124
-26
lines changed

block/blk-zoned.c

Lines changed: 118 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,30 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
115115
}
116116
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
117117

118+
/*
 * Private data handed to disk_report_zones_cb(): the disk being reported on,
 * plus an optional caller-supplied report_zones callback (@user_cb) and the
 * opaque pointer (@user_data) that is forwarded to it.
 */
struct disk_report_zones_cb_args {
119+
struct gendisk *disk;
120+
report_zones_cb user_cb;
121+
void *user_data;
122+
};
123+
124+
static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
125+
struct blk_zone *zone);
126+
127+
/*
 * report_zones callback wrapper. @data must point to a
 * struct disk_report_zones_cb_args. For each reported zone, synchronize the
 * write pointer offset of the matching zone write plug (if the disk has a
 * zone write plug hash table), then chain to the user callback when one was
 * supplied.
 *
 * Returns 0 when there is no user callback, otherwise the value returned by
 * the user callback.
 */
static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx,
128+
void *data)
129+
{
130+
struct disk_report_zones_cb_args *args = data;
131+
struct gendisk *disk = args->disk;
132+
133+
/* Only touch zone write plug state when the disk has a plug hash table. */
if (disk->zone_wplugs_hash)
134+
disk_zone_wplug_sync_wp_offset(disk, zone);
135+
136+
/* Nothing to chain to: the zone was handled successfully. */
if (!args->user_cb)
137+
return 0;
138+
139+
return args->user_cb(zone, idx, args->user_data);
140+
}
141+
118142
/**
119143
* blkdev_report_zones - Get zones information
120144
* @bdev: Target block device
@@ -707,6 +731,58 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
707731
spin_unlock_irqrestore(&zwplug->lock, flags);
708732
}
709733

734+
/*
 * Derive the write pointer offset of a reported zone, relative to the zone
 * start, from the zone condition:
 *  - implicitly open, explicitly open and closed zones: zone->wp - zone->start
 *  - full zones: zone->len
 *  - empty zones: 0
 *  - any other condition: UINT_MAX, used as the "no valid write pointer" value
 */
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
735+
{
736+
switch (zone->cond) {
737+
case BLK_ZONE_COND_IMP_OPEN:
738+
case BLK_ZONE_COND_EXP_OPEN:
739+
case BLK_ZONE_COND_CLOSED:
740+
return zone->wp - zone->start;
741+
case BLK_ZONE_COND_FULL:
742+
return zone->len;
743+
case BLK_ZONE_COND_EMPTY:
744+
return 0;
745+
case BLK_ZONE_COND_NOT_WP:
746+
case BLK_ZONE_COND_OFFLINE:
747+
case BLK_ZONE_COND_READONLY:
748+
default:
749+
/*
750+
* Conventional, offline and read-only zones do not have a valid
751+
* write pointer.
752+
*/
753+
return UINT_MAX;
754+
}
755+
}
756+
757+
/*
 * Synchronize the write pointer offset of the zone write plug covering
 * @zone with the write pointer position reported in @zone.
 *
 * Does nothing if no zone write plug is found for zone->start. The plug
 * offset is only rewritten when the plug is flagged BLK_ZONE_WPLUG_ERROR.
 * The plug obtained with disk_get_zone_wplug() is released with
 * disk_put_zone_wplug() before returning.
 */
static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
758+
struct blk_zone *zone)
759+
{
760+
struct blk_zone_wplug *zwplug;
761+
unsigned long flags;
762+
763+
zwplug = disk_get_zone_wplug(disk, zone->start);
764+
if (!zwplug)
765+
return;
766+
767+
/*
 * NOTE(review): the tail of disk_zone_wplug_set_wp_offset() visible in the
 * surrounding diff context ends with spin_unlock_irqrestore(&zwplug->lock).
 * If that helper also acquires zwplug->lock itself, calling it below with
 * the lock already held would self-deadlock. Confirm against the full
 * definition of the helper in this tree.
 */
spin_lock_irqsave(&zwplug->lock, flags);
768+
if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
769+
disk_zone_wplug_set_wp_offset(disk, zwplug,
770+
blk_zone_wp_offset(zone));
771+
spin_unlock_irqrestore(&zwplug->lock, flags);
772+
773+
disk_put_zone_wplug(zwplug);
774+
}
775+
776+
/*
 * Run the disk's report_zones operation for a single zone starting at
 * @sector, using disk_report_zones_cb() with no user callback, so that the
 * only effect of the report is the zone write plug synchronization done by
 * that callback.
 *
 * Returns the value returned by disk->fops->report_zones(). The caller in
 * blk_zone_issue_zeroout() treats a return value of 1 as success.
 */
static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector)
777+
{
778+
/* user_cb and user_data are left zero-initialized: no chained callback. */
struct disk_report_zones_cb_args args = {
779+
.disk = disk,
780+
};
781+
782+
return disk->fops->report_zones(disk, sector, 1,
783+
disk_report_zones_cb, &args);
784+
}
785+
710786
static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
711787
unsigned int wp_offset)
712788
{
@@ -1284,29 +1360,6 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
12841360
disk_put_zone_wplug(zwplug);
12851361
}
12861362

1287-
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
1288-
{
1289-
switch (zone->cond) {
1290-
case BLK_ZONE_COND_IMP_OPEN:
1291-
case BLK_ZONE_COND_EXP_OPEN:
1292-
case BLK_ZONE_COND_CLOSED:
1293-
return zone->wp - zone->start;
1294-
case BLK_ZONE_COND_FULL:
1295-
return zone->len;
1296-
case BLK_ZONE_COND_EMPTY:
1297-
return 0;
1298-
case BLK_ZONE_COND_NOT_WP:
1299-
case BLK_ZONE_COND_OFFLINE:
1300-
case BLK_ZONE_COND_READONLY:
1301-
default:
1302-
/*
1303-
* Conventional, offline and read-only zones do not have a valid
1304-
* write pointer.
1305-
*/
1306-
return UINT_MAX;
1307-
}
1308-
}
1309-
13101363
static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
13111364
unsigned int idx, void *data)
13121365
{
@@ -1876,6 +1929,48 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
18761929
}
18771930
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
18781931

1932+
/**
1933+
* blk_zone_issue_zeroout - zero-fill a block range in a zone
1934+
* @bdev: blockdev to write
1935+
* @sector: start sector
1936+
* @nr_sects: number of sectors to write
1937+
* @gfp_mask: memory allocation flags (for bio_alloc)
1938+
*
1939+
* Description:
1940+
* Zero-fill a block range in a zone (@sector must be equal to the zone write
1941+
* pointer), handling potential errors due to the (initially unknown) lack of
1942+
* hardware offload (See blkdev_issue_zeroout()).
 *
 * Return:
 * -EIO (with a one-time warning) if @bdev is not a zoned device. Otherwise
 * the result of the first blkdev_issue_zeroout() call unless that result is
 * -EOPNOTSUPP. In the -EOPNOTSUPP case: a negative error (or -EIO) if the
 * zone write pointer resynchronization fails, else the result of the second
 * blkdev_issue_zeroout() call issued without BLKDEV_ZERO_NOFALLBACK.
1943+
*/
1944+
int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
1945+
sector_t nr_sects, gfp_t gfp_mask)
1946+
{
1947+
int ret;
1948+
1949+
if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
1950+
return -EIO;
1951+
1952+
/* First attempt: no fallback, so a missing offload surfaces as an error. */
ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
1953+
BLKDEV_ZERO_NOFALLBACK);
1954+
/* Success, or a failure other than -EOPNOTSUPP: nothing to recover here. */
if (ret != -EOPNOTSUPP)
1955+
return ret;
1956+
1957+
/*
1958+
* The failed call to blkdev_issue_zeroout() advanced the zone write
1959+
* pointer. Undo this using a report zone to update the zone write
1960+
* pointer to the correct current value.
1961+
*/
1962+
ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector);
1963+
/*
 * One zone was requested: any return other than 1 is treated as a failure,
 * and a non-negative unexpected value is mapped to -EIO.
 */
if (ret != 1)
1964+
return ret < 0 ? ret : -EIO;
1965+
1966+
/*
1967+
* Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
1968+
* regular write with zero-pages.
1969+
*/
1970+
return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
1971+
}
1972+
EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
1973+
18791974
#ifdef CONFIG_BLK_DEBUG_FS
18801975

18811976
int queue_zone_wplugs_show(void *data, struct seq_file *m)

drivers/md/dm-zoned-reclaim.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,9 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
7676
* pointer and the requested position.
7777
*/
7878
nr_blocks = block - wp_block;
79-
ret = blkdev_issue_zeroout(dev->bdev,
80-
dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
81-
dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
79+
ret = blk_zone_issue_zeroout(dev->bdev,
80+
dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
81+
dmz_blk2sect(nr_blocks), GFP_NOIO);
8282
if (ret) {
8383
dmz_dev_err(dev,
8484
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",

include/linux/blkdev.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1386,6 +1386,9 @@ static inline bool bdev_is_zone_start(struct block_device *bdev,
13861386
return bdev_offset_from_zone_start(bdev, sector) == 0;
13871387
}
13881388

1389+
int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
1390+
sector_t nr_sects, gfp_t gfp_mask);
1391+
13891392
static inline int queue_dma_alignment(const struct request_queue *q)
13901393
{
13911394
return q->limits.dma_alignment;

0 commit comments

Comments
 (0)