Skip to content

Commit 343d8a3

Browse files
naota authored and kdave committed
btrfs: zoned: prevent allocation from previous data relocation BG
After commit 5f0addf ("btrfs: zoned: use dedicated lock for data relocation"), we observe IO errors on, e.g., btrfs/232 like below: [09.0][T4038707] WARNING: CPU: 3 PID: 4038707 at fs/btrfs/extent-tree.c:2381 btrfs_cross_ref_exist+0xfc/0x120 [btrfs] <snip> [09.9][T4038707] Call Trace: [09.5][T4038707] <TASK> [09.3][T4038707] run_delalloc_nocow+0x7f1/0x11a0 [btrfs] [09.6][T4038707] ? test_range_bit+0x174/0x320 [btrfs] [09.2][T4038707] ? fallback_to_cow+0x980/0x980 [btrfs] [09.3][T4038707] ? find_lock_delalloc_range+0x33e/0x3e0 [btrfs] [09.5][T4038707] btrfs_run_delalloc_range+0x445/0x1320 [btrfs] [09.2][T4038707] ? test_range_bit+0x320/0x320 [btrfs] [09.4][T4038707] ? lock_downgrade+0x6a0/0x6a0 [09.2][T4038707] ? orc_find.part.0+0x1ed/0x300 [09.5][T4038707] ? __module_address.part.0+0x25/0x300 [09.0][T4038707] writepage_delalloc+0x159/0x310 [btrfs] <snip> [09.4][ C3] sd 10:0:1:0: [sde] tag#2620 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s [09.5][ C3] sd 10:0:1:0: [sde] tag#2620 Sense Key : Illegal Request [current] [09.9][ C3] sd 10:0:1:0: [sde] tag#2620 Add. Sense: Unaligned write command [09.5][ C3] sd 10:0:1:0: [sde] tag#2620 CDB: Write(16) 8a 00 00 00 00 00 02 f3 63 87 00 00 00 2c 00 00 [09.4][ C3] critical target error, dev sde, sector 396041272 op 0x1:(WRITE) flags 0x800 phys_seg 3 prio class 0 [09.9][ C3] BTRFS error (device dm-1): bdev /dev/mapper/dml_102_2 errs: wr 1, rd 0, flush 0, corrupt 0, gen 0 The IO errors occur when we allocate a regular extent in the previous data relocation block group. On zoned btrfs, we use a dedicated block group to relocate a data extent. Thus, we allocate relocating data extents (pre-alloc) only from the dedicated block group and vice versa. Once the free space in the dedicated block group gets tight, a relocating extent may not fit into the block group. In that case, we need to switch the dedicated block group to the next one. Then, the previous one is now freed up for allocating a regular extent. 
The BG no longer has enough free space to allocate the relocating extent, but there is still room to allocate a smaller extent. This is where the problem happens. By allocating a regular extent while nocow IOs for the relocation are still ongoing, we will issue WRITE IOs (for relocation) and ZONE APPEND IOs (for the regular writes) at the same time. These mixed IOs confuse the write pointer and cause the unaligned write errors. This commit introduces a new bit 'zoned_data_reloc_ongoing' to the btrfs_block_group. We set this bit before releasing the dedicated block group, and no extents are allocated from a block group having this bit set. This bit is similar to setting block_group->ro, but is different from it by allowing nocow writes to start. Once all the nocow IO for relocation is done (hooked from btrfs_finish_ordered_io), we reset the bit to release the block group for further allocation. Fixes: c2707a2 ("btrfs: zoned: add a dedicated data relocation block group") CC: [email protected] # 5.16+ Signed-off-by: Naohiro Aota <[email protected]> Reviewed-by: David Sterba <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent 650c9ca commit 343d8a3

File tree

5 files changed

+53
-2
lines changed

5 files changed

+53
-2
lines changed

fs/btrfs/block-group.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ struct btrfs_block_group {
104104
unsigned int relocating_repair:1;
105105
unsigned int chunk_item_inserted:1;
106106
unsigned int zone_is_active:1;
107+
unsigned int zoned_data_reloc_ongoing:1;
107108

108109
int disk_cache_state;
109110

fs/btrfs/extent-tree.c

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3832,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
38323832
block_group->start == fs_info->data_reloc_bg ||
38333833
fs_info->data_reloc_bg == 0);
38343834

3835-
if (block_group->ro) {
3835+
if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
38363836
ret = 1;
38373837
goto out;
38383838
}
@@ -3894,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
38943894
out:
38953895
if (ret && ffe_ctl->for_treelog)
38963896
fs_info->treelog_bg = 0;
3897-
if (ret && ffe_ctl->for_data_reloc)
3897+
if (ret && ffe_ctl->for_data_reloc &&
3898+
fs_info->data_reloc_bg == block_group->start) {
3899+
/*
3900+
* Do not allow further allocations from this block group.
3901+
* Compared to increasing the ->ro, setting the
3902+
* ->zoned_data_reloc_ongoing flag still allows nocow
3903+
* writers to come in. See btrfs_inc_nocow_writers().
3904+
*
3905+
* We need to disable an allocation to avoid an allocation of
3906+
* regular (non-relocation data) extent. With mix of relocation
3907+
* extents and regular extents, we can dispatch WRITE commands
3908+
* (for relocation extents) and ZONE APPEND commands (for
3909+
* regular extents) at the same time to the same zone, which
3910+
* easily break the write pointer.
3911+
*/
3912+
block_group->zoned_data_reloc_ongoing = 1;
38983913
fs_info->data_reloc_bg = 0;
3914+
}
38993915
spin_unlock(&fs_info->relocation_bg_lock);
39003916
spin_unlock(&fs_info->treelog_bg_lock);
39013917
spin_unlock(&block_group->lock);

fs/btrfs/inode.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3195,6 +3195,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
31953195
ordered_extent->file_offset,
31963196
ordered_extent->file_offset +
31973197
logical_len);
3198+
btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3199+
ordered_extent->disk_num_bytes);
31983200
} else {
31993201
BUG_ON(root == fs_info->tree_root);
32003202
ret = insert_ordered_extent_file_extent(trans, ordered_extent);

fs/btrfs/zoned.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2140,3 +2140,30 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
21402140
factor = div64_u64(used * 100, total);
21412141
return factor >= fs_info->bg_reclaim_threshold;
21422142
}
2143+
2144+
/*
 * Re-enable allocation from a former data relocation block group once its
 * relocation writes have completed.
 *
 * @fs_info: filesystem the extent belongs to; nothing is done unless
 *           btrfs_is_zoned() reports a zoned filesystem.
 * @logical: logical start address of the finished extent; used to look up
 *           the owning block group.
 * @length:  length of the finished extent in bytes.
 *
 * The block group's zoned_data_reloc_ongoing bit is cleared only when it is
 * currently set and the end of the finished range (logical + length)
 * coincides with block_group->start + block_group->alloc_offset.  The check
 * and the update are done under block_group->lock, and the reference taken
 * by the lookup is dropped before returning.
 */
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
2145+
u64 length)
2146+
{
2147+
struct btrfs_block_group *block_group;
2148+
2149+
if (!btrfs_is_zoned(fs_info))
2150+
return;
2151+
2152+
block_group = btrfs_lookup_block_group(fs_info, logical);
2153+
/* It should be called on a previous data relocation block group. */
/*
 * NOTE(review): the NULL check on block_group exists only inside ASSERT();
 * the spin_lock() below dereferences it unconditionally.  Confirm the lookup
 * cannot fail here when assertions are compiled out.
 */
2154+
ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
2155+
2156+
spin_lock(&block_group->lock);
/* Nothing to release if the block group is not marked as being relocated into. */
2157+
if (!block_group->zoned_data_reloc_ongoing)
2158+
goto out;
2159+
2160+
/* All relocation extents are written. */
2161+
if (block_group->start + block_group->alloc_offset == logical + length) {
2162+
/* Now, release this block group for further allocations. */
2163+
block_group->zoned_data_reloc_ongoing = 0;
2164+
}
2165+
2166+
out:
2167+
spin_unlock(&block_group->lock);
2168+
btrfs_put_block_group(block_group);
2169+
}

fs/btrfs/zoned.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
7777
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
7878
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
7979
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
80+
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
81+
u64 length);
8082
#else /* CONFIG_BLK_DEV_ZONED */
8183
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
8284
struct blk_zone *zone)
@@ -243,6 +245,9 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
243245
{
244246
return false;
245247
}
248+
249+
/* Empty stub for the !CONFIG_BLK_DEV_ZONED branch: does nothing. */
static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
249+
u64 logical, u64 length) { }
246251
#endif
247252

248253
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)

0 commit comments

Comments
 (0)