
Commit ae76d8e

adam900710 authored and kdave committed
btrfs: scrub: fix grouping of read IO
[REGRESSION]
There are several regression reports about the scrub performance with
the v6.4 kernel.

On a PCIe 3.0 device, the old v6.3 kernel can go 3GB/s scrub speed, but
v6.4 can only go 1GB/s, an obvious 66% performance drop.

[CAUSE]
Iostat shows a very different behavior between the v6.3 and v6.4 kernels:

  Device         r/s      rkB/s   rrqm/s  %rrqm r_await rareq-sz aqu-sz  %util
  nvme0n1p3  9731.00 3425544.00 17237.00  63.92    2.18   352.02  21.18 100.00
  nvme0n1p3 15578.00  993616.00     5.00   0.03    0.09    63.78   1.32 100.00

The upper one is v6.3 while the lower one is v6.4.

There are several obvious differences:

- Very few read merges
  This turns out to be a behavior change: we no longer do bio
  plug/unplug.

- Very low aqu-sz
  This is due to the submit-and-wait behavior of flush_scrub_stripes(),
  and the extra extent/csum tree searches.

Both behaviors are not that obvious on SATA SSDs, as SATA SSDs have NCQ
to merge the reads, while SATA SSDs cannot handle a high queue depth
well either.

[FIX]
For now this patch focuses on the read speed fix. Dev-replace speed
needs more work.

For the read part, we go two directions to fix the problems:

- Re-introduce blk plug/unplug to merge read requests
  This is pretty simple, and the behavior is pretty easy to observe.
  This would enlarge the average read request size to 512K.

- Introduce multi-group reads and no longer wait for each group
  Instead of the old behavior, which submits 8 stripes and waits for
  them, here we enlarge the total number of stripes to 16 * 8, which
  is 8M per device, the same limit as the old scrub in-flight bios
  size limit.

  Now every time we fill a group (8 stripes), we submit them and
  continue to the next stripes.

  Only when the full 16 * 8 stripes are all filled do we submit the
  remaining ones (the last group), and wait for all groups to finish.
  Then submit the repair writes and dev-replace writes.

  This should enlarge the queue depth.

This greatly improves the merge rate (thus read block size) and queue
depth:

Before (with regression, and cached extent/csum path):

  Device         r/s      rkB/s   rrqm/s  %rrqm r_await rareq-sz aqu-sz  %util
  nvme0n1p3 20666.00 1318240.00    10.00   0.05    0.08    63.79   1.63 100.00

After (with all patches applied):

  nvme0n1p3  5165.00 2278304.00 30557.00  85.54    0.55   441.10   2.81 100.00

i.e. 1287 to 2224 MB/s.

CC: [email protected] # 6.4+
Signed-off-by: Qu Wenruo <[email protected]>
Reviewed-by: David Sterba <[email protected]>
Signed-off-by: David Sterba <[email protected]>
1 parent 3c771c1 commit ae76d8e
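For illustration only, here is a minimal userspace C sketch of the group-submit bookkeeping described in the commit message. The SCRUB_* constants mirror the patch; submit_group() is a hypothetical stand-in for submit_initial_group_read() and performs no real I/O (no blk plug/unplug, no waiting).

/*
 * Illustration only: userspace sketch of the grouping logic described in
 * the commit message.  The SCRUB_* constants mirror the patch;
 * submit_group() is a stand-in for submit_initial_group_read() and does
 * no real I/O.
 */
#include <stdio.h>

#define BTRFS_STRIPE_LEN        (64 * 1024)     /* 64K per stripe */
#define SCRUB_STRIPES_PER_GROUP 8               /* 8 * 64K = 512K per group */
#define SCRUB_GROUPS_PER_SCTX   16              /* 16 * 512K = 8M per device */
#define SCRUB_TOTAL_STRIPES     (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)

static void submit_group(int first_slot, int nr_stripes)
{
        printf("submit slots [%d, %d): %d KiB\n", first_slot,
               first_slot + nr_stripes,
               nr_stripes * BTRFS_STRIPE_LEN / 1024);
}

int main(void)
{
        int cur_stripe = 0;

        /* Queue 100 stripes; every full group is submitted without waiting. */
        for (int queued = 0; queued < 100; queued++) {
                cur_stripe++;
                if (cur_stripe % SCRUB_STRIPES_PER_GROUP == 0)
                        submit_group(cur_stripe - SCRUB_STRIPES_PER_GROUP,
                                     SCRUB_STRIPES_PER_GROUP);
                /* In the kernel, filling the last slot triggers a full flush. */
                if (cur_stripe == SCRUB_TOTAL_STRIPES)
                        cur_stripe = 0;
        }
        /* Flush the partially filled last group, as flush_scrub_stripes() does. */
        if (cur_stripe % SCRUB_STRIPES_PER_GROUP)
                submit_group(cur_stripe - cur_stripe % SCRUB_STRIPES_PER_GROUP,
                             cur_stripe % SCRUB_STRIPES_PER_GROUP);

        printf("at most %d MiB in flight per device\n",
               SCRUB_TOTAL_STRIPES * BTRFS_STRIPE_LEN / (1024 * 1024));
        return 0;
}

With these constants each submitted group covers 512 KiB, and a fully populated sctx keeps at most 16 * 8 stripes (8 MiB) of reads in flight per device, matching the limits quoted in the commit message.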

1 file changed, +71 −25 lines changed

fs/btrfs/scrub.c

Lines changed: 71 additions & 25 deletions
@@ -43,9 +43,20 @@ struct scrub_ctx;
 /*
  * The following value only influences the performance.
  *
- * This determines the batch size for stripe submitted in one go.
+ * This determines how many stripes would be submitted in one go,
+ * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
  */
-#define SCRUB_STRIPES_PER_SCTX  8       /* That would be 8 64K stripe per-device. */
+#define SCRUB_STRIPES_PER_GROUP         8
+
+/*
+ * How many groups we have for each sctx.
+ *
+ * This would be 8M per device, the same value as the old scrub in-flight bios
+ * size limit.
+ */
+#define SCRUB_GROUPS_PER_SCTX           16
+
+#define SCRUB_TOTAL_STRIPES             (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)
 
 /*
  * The following value times PAGE_SIZE needs to be large enough to match the
@@ -172,7 +183,7 @@ struct scrub_stripe {
 };
 
 struct scrub_ctx {
-        struct scrub_stripe     stripes[SCRUB_STRIPES_PER_SCTX];
+        struct scrub_stripe     stripes[SCRUB_TOTAL_STRIPES];
         struct scrub_stripe     *raid56_data_stripes;
         struct btrfs_fs_info    *fs_info;
         struct btrfs_path       extent_path;
@@ -317,10 +328,10 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
         if (!sctx)
                 return;
 
-        for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
+        for (i = 0; i < SCRUB_TOTAL_STRIPES; i++)
                 release_scrub_stripe(&sctx->stripes[i]);
 
-        kfree(sctx);
+        kvfree(sctx);
 }
 
 static void scrub_put_ctx(struct scrub_ctx *sctx)
@@ -335,7 +346,10 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
         struct scrub_ctx *sctx;
         int             i;
 
-        sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
+        /* Since sctx has inline 128 stripes, it can go beyond 64K easily.  Use
+         * kvzalloc().
+         */
+        sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL);
         if (!sctx)
                 goto nomem;
         refcount_set(&sctx->refs, 1);
@@ -345,7 +359,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
         sctx->extent_path.skip_locking = 1;
         sctx->csum_path.search_commit_root = 1;
         sctx->csum_path.skip_locking = 1;
-        for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
+        for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
                 int ret;
 
                 ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
@@ -1658,6 +1672,28 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
         return false;
 }
 
+static void submit_initial_group_read(struct scrub_ctx *sctx,
+                                      unsigned int first_slot,
+                                      unsigned int nr_stripes)
+{
+        struct blk_plug plug;
+
+        ASSERT(first_slot < SCRUB_TOTAL_STRIPES);
+        ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES);
+
+        scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
+                              btrfs_stripe_nr_to_offset(nr_stripes));
+        blk_start_plug(&plug);
+        for (int i = 0; i < nr_stripes; i++) {
+                struct scrub_stripe *stripe = &sctx->stripes[first_slot + i];
+
+                /* Those stripes should be initialized. */
+                ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
+                scrub_submit_initial_read(sctx, stripe);
+        }
+        blk_finish_plug(&plug);
+}
+
 static int flush_scrub_stripes(struct scrub_ctx *sctx)
 {
         struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -1670,11 +1706,11 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
 
         ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));
 
-        scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
-                              btrfs_stripe_nr_to_offset(nr_stripes));
-        for (int i = 0; i < nr_stripes; i++) {
-                stripe = &sctx->stripes[i];
-                scrub_submit_initial_read(sctx, stripe);
+        /* Submit the stripes which are populated but not submitted. */
+        if (nr_stripes % SCRUB_STRIPES_PER_GROUP) {
+                const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP);
+
+                submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot);
         }
 
         for (int i = 0; i < nr_stripes; i++) {
@@ -1754,29 +1790,40 @@ static void raid56_scrub_wait_endio(struct bio *bio)
 
 static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
                               struct btrfs_device *dev, int mirror_num,
-                              u64 logical, u32 length, u64 physical)
+                              u64 logical, u32 length, u64 physical,
+                              u64 *found_logical_ret)
 {
         struct scrub_stripe *stripe;
         int ret;
 
-        /* No available slot, submit all stripes and wait for them. */
-        if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) {
-                ret = flush_scrub_stripes(sctx);
-                if (ret < 0)
-                        return ret;
-        }
+        /*
+         * There should always be one slot left, as caller filling the last
+         * slot should flush them all.
+         */
+        ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);
 
         stripe = &sctx->stripes[sctx->cur_stripe];
-
-        /* We can queue one stripe using the remaining slot. */
         scrub_reset_stripe(stripe);
         ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
                                            &sctx->csum_path, dev, physical,
                                            mirror_num, logical, length, stripe);
         /* Either >0 as no more extents or <0 for error. */
         if (ret)
                 return ret;
+        if (found_logical_ret)
+                *found_logical_ret = stripe->logical;
         sctx->cur_stripe++;
+
+        /* We filled one group, submit it. */
+        if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
+                const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP;
+
+                submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP);
+        }
+
+        /* Last slot used, flush them all. */
+        if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES)
+                return flush_scrub_stripes(sctx);
         return 0;
 }
 
@@ -1985,6 +2032,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
 
         /* Go through each extent items inside the logical range */
         while (cur_logical < logical_end) {
+                u64 found_logical;
                 u64 cur_physical = physical + cur_logical - logical_start;
 
                 /* Canceled? */
@@ -2009,7 +2057,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
 
                 ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
                                          cur_logical, logical_end - cur_logical,
-                                         cur_physical);
+                                         cur_physical, &found_logical);
                 if (ret > 0) {
                         /* No more extent, just update the accounting */
                         sctx->stat.last_physical = physical + logical_length;
@@ -2019,9 +2067,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
                 if (ret < 0)
                         break;
 
-                ASSERT(sctx->cur_stripe > 0);
-                cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical
-                              + BTRFS_STRIPE_LEN;
+                cur_logical = found_logical + BTRFS_STRIPE_LEN;
 
                 /* Don't hold CPU for too long time */
                 cond_resched();
