Skip to content

Commit 332581b

Browse files
naotakdave
authored and committed
btrfs: zoned: do not zone finish data relocation block group
When multiple writes happen at once, we may need to sacrifice a currently active block group to be zone finished for a new allocation. We choose a block group with the least free space left, and zone finish it. To do the finishing, we need to send IOs for already allocated region and wait for them and on-going IOs. Otherwise, these IOs fail because the zone is already finished at the time the IO reach a device. However, if a block group dedicated to the data relocation is zone finished, there is a chance that finishing it before an ongoing write IO reaches the device. That is because there is timing gap between an allocation is done (block_group->reservations == 0, as pre-allocation is done) and an ordered extent is created when the relocation IO starts. Thus, if we finish the zone between them, we can fail the IOs. We cannot simply use "fs_info->data_reloc_bg == block_group->start" to avoid the zone finishing. Because, the data_reloc_bg may already switch to a new block group, while there are still ongoing write IOs to the old data_reloc_bg. So, this patch reworks the BLOCK_GROUP_FLAG_ZONED_DATA_RELOC bit to indicate there is a data relocation allocation and/or ongoing write to the block group. The bit is set on allocation and cleared in end_io function of the last IO for the currently allocated region. To change the timing of the bit setting also solves the issue that the bit being left even after there is no IO going on. With the current code, if the data_reloc_bg switches after the last IO to the current data_reloc_bg, the bit is set at this timing and there is no one clearing that bit. As a result, that block group is kept unallocatable for anything. 
Fixes: 343d8a3 ("btrfs: zoned: prevent allocation from previous data relocation BG") Fixes: 74e91b1 ("btrfs: zoned: zone finish unused block group") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
1 parent e7f1326 commit 332581b

File tree

2 files changed

+36
-23
lines changed

2 files changed

+36
-23
lines changed

fs/btrfs/extent-tree.c

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3738,7 +3738,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
37383738
fs_info->data_reloc_bg == 0);
37393739

37403740
if (block_group->ro ||
3741-
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
3741+
(!ffe_ctl->for_data_reloc &&
3742+
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
37423743
ret = 1;
37433744
goto out;
37443745
}
@@ -3781,8 +3782,26 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
37813782
if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
37823783
fs_info->treelog_bg = block_group->start;
37833784

3784-
if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
3785-
fs_info->data_reloc_bg = block_group->start;
3785+
if (ffe_ctl->for_data_reloc) {
3786+
if (!fs_info->data_reloc_bg)
3787+
fs_info->data_reloc_bg = block_group->start;
3788+
/*
3789+
* Do not allow allocations from this block group, unless it is
3790+
* for data relocation. Compared to increasing the ->ro, setting
3791+
* the ->zoned_data_reloc_ongoing flag still allows nocow
3792+
* writers to come in. See btrfs_inc_nocow_writers().
3793+
*
3794+
* We need to disable an allocation to avoid an allocation of
3795+
* regular (non-relocation data) extent. With mix of relocation
3796+
* extents and regular extents, we can dispatch WRITE commands
3797+
* (for relocation extents) and ZONE APPEND commands (for
3798+
* regular extents) at the same time to the same zone, which
3799+
* easily break the write pointer.
3800+
*
3801+
* Also, this flag avoids this block group to be zone finished.
3802+
*/
3803+
set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3804+
}
37863805

37873806
ffe_ctl->found_offset = start + block_group->alloc_offset;
37883807
block_group->alloc_offset += num_bytes;
@@ -3800,24 +3819,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
38003819
out:
38013820
if (ret && ffe_ctl->for_treelog)
38023821
fs_info->treelog_bg = 0;
3803-
if (ret && ffe_ctl->for_data_reloc &&
3804-
fs_info->data_reloc_bg == block_group->start) {
3805-
/*
3806-
* Do not allow further allocations from this block group.
3807-
* Compared to increasing the ->ro, setting the
3808-
* ->zoned_data_reloc_ongoing flag still allows nocow
3809-
* writers to come in. See btrfs_inc_nocow_writers().
3810-
*
3811-
* We need to disable an allocation to avoid an allocation of
3812-
* regular (non-relocation data) extent. With mix of relocation
3813-
* extents and regular extents, we can dispatch WRITE commands
3814-
* (for relocation extents) and ZONE APPEND commands (for
3815-
* regular extents) at the same time to the same zone, which
3816-
* easily break the write pointer.
3817-
*/
3818-
set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3822+
if (ret && ffe_ctl->for_data_reloc)
38193823
fs_info->data_reloc_bg = 0;
3820-
}
38213824
spin_unlock(&fs_info->relocation_bg_lock);
38223825
spin_unlock(&fs_info->treelog_bg_lock);
38233826
spin_unlock(&block_group->lock);

fs/btrfs/zoned.c

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2091,6 +2091,10 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
20912091
* and block_group->meta_write_pointer for metadata.
20922092
*/
20932093
if (!fully_written) {
2094+
if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
2095+
spin_unlock(&block_group->lock);
2096+
return -EAGAIN;
2097+
}
20942098
spin_unlock(&block_group->lock);
20952099

20962100
ret = btrfs_inc_block_group_ro(block_group, false);
@@ -2119,7 +2123,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
21192123
return 0;
21202124
}
21212125

2122-
if (block_group->reserved) {
2126+
if (block_group->reserved ||
2127+
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
2128+
&block_group->runtime_flags)) {
21232129
spin_unlock(&block_group->lock);
21242130
btrfs_dec_block_group_ro(block_group);
21252131
return -EAGAIN;
@@ -2362,7 +2368,10 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica
23622368

23632369
/* All relocation extents are written. */
23642370
if (block_group->start + block_group->alloc_offset == logical + length) {
2365-
/* Now, release this block group for further allocations. */
2371+
/*
2372+
* Now, release this block group for further allocations and
2373+
* zone finish.
2374+
*/
23662375
clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
23672376
&block_group->runtime_flags);
23682377
}
@@ -2386,7 +2395,8 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
23862395

23872396
spin_lock(&block_group->lock);
23882397
if (block_group->reserved || block_group->alloc_offset == 0 ||
2389-
(block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
2398+
(block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
2399+
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
23902400
spin_unlock(&block_group->lock);
23912401
continue;
23922402
}

0 commit comments

Comments
 (0)