Skip to content

Commit fb16787

Browse files
committed
Merge branch 'md-6.12-raid5-opt' into md-6.12
From Artur: The wait_for_overlap wait queue is currently used in two cases, which are not really related: - waiting for actual overlapping bios, which uses R5_Overlap bit, - waiting for events related to reshape. Handling every write request in raid5_make_request() involves adding to and removing from this wait queue, which uses a spinlock. With fast storage and multiple submitting threads the contention on this lock is noticeable. This patch series aims to resolve this by separating the two cases mentioned above and using this wait queue only when reshape is in progress. The results when testing 4k random writes on raid5 with null_blk (8 jobs, qd=64, group_thread_cnt=8): before: 463k IOPS after: 523k IOPS The improvement is not huge with this series alone but it is just one of the bottlenecks. When applied onto some other changes I'm working on, it allowed to go from 845k IOPS to 975k IOPS on the same test. * md-6.12-raid5-opt: md/raid5: rename wait_for_overlap to wait_for_reshape md/raid5: only add to wq if reshape is in progress md/raid5: use wait_on_bit() for R5_Overlap Signed-off-by: Song Liu <[email protected]>
2 parents 7f67fda + 6f039cc commit fb16787

File tree

3 files changed

+52
-51
lines changed

3 files changed

+52
-51
lines changed

drivers/md/raid5-cache.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2798,7 +2798,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
27982798
{
27992799
struct r5l_log *log = READ_ONCE(conf->log);
28002800
int i;
2801-
int do_wakeup = 0;
28022801
sector_t tree_index;
28032802
void __rcu **pslot;
28042803
uintptr_t refcount;
@@ -2815,7 +2814,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
28152814
for (i = sh->disks; i--; ) {
28162815
clear_bit(R5_InJournal, &sh->dev[i].flags);
28172816
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2818-
do_wakeup = 1;
2817+
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
28192818
}
28202819

28212820
/*
@@ -2828,9 +2827,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
28282827
if (atomic_dec_and_test(&conf->pending_full_writes))
28292828
md_wakeup_thread(conf->mddev->thread);
28302829

2831-
if (do_wakeup)
2832-
wake_up(&conf->wait_for_overlap);
2833-
28342830
spin_lock_irq(&log->stripe_in_journal_lock);
28352831
list_del_init(&sh->r5c);
28362832
spin_unlock_irq(&log->stripe_in_journal_lock);

drivers/md/raid5.c

Lines changed: 50 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -2337,7 +2337,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
23372337
for (i = disks; i--; ) {
23382338
struct r5dev *dev = &sh->dev[i];
23392339
if (test_and_clear_bit(R5_Overlap, &dev->flags))
2340-
wake_up(&sh->raid_conf->wait_for_overlap);
2340+
wake_up_bit(&dev->flags, R5_Overlap);
23412341
}
23422342
}
23432343
local_unlock(&conf->percpu->lock);
@@ -3473,7 +3473,7 @@ static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
34733473
* With PPL only writes to consecutive data chunks within a
34743474
* stripe are allowed because for a single stripe_head we can
34753475
* only have one PPL entry at a time, which describes one data
3476-
* range. Not really an overlap, but wait_for_overlap can be
3476+
* range. Not really an overlap, but R5_Overlap can be
34773477
* used to handle this.
34783478
*/
34793479
sector_t sector;
@@ -3652,7 +3652,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
36523652
log_stripe_write_finished(sh);
36533653

36543654
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3655-
wake_up(&conf->wait_for_overlap);
3655+
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
36563656

36573657
while (bi && bi->bi_iter.bi_sector <
36583658
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
@@ -3697,7 +3697,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
36973697
sh->dev[i].toread = NULL;
36983698
spin_unlock_irq(&sh->stripe_lock);
36993699
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3700-
wake_up(&conf->wait_for_overlap);
3700+
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
37013701
if (bi)
37023702
s->to_read--;
37033703
while (bi && bi->bi_iter.bi_sector <
@@ -3736,7 +3736,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
37363736
BUG_ON(sh->batch_head);
37373737
clear_bit(STRIPE_SYNCING, &sh->state);
37383738
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3739-
wake_up(&conf->wait_for_overlap);
3739+
wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
37403740
s->syncing = 0;
37413741
s->replacing = 0;
37423742
/* There is nothing more to do for sync/check/repair.
@@ -4877,7 +4877,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
48774877
{
48784878
struct stripe_head *sh, *next;
48794879
int i;
4880-
int do_wakeup = 0;
48814880

48824881
list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
48834882

@@ -4913,7 +4912,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
49134912
spin_unlock_irq(&sh->stripe_lock);
49144913
for (i = 0; i < sh->disks; i++) {
49154914
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4916-
do_wakeup = 1;
4915+
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
49174916
sh->dev[i].flags = head_sh->dev[i].flags &
49184917
(~((1 << R5_WriteError) | (1 << R5_Overlap)));
49194918
}
@@ -4927,12 +4926,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
49274926
spin_unlock_irq(&head_sh->stripe_lock);
49284927
for (i = 0; i < head_sh->disks; i++)
49294928
if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4930-
do_wakeup = 1;
4929+
wake_up_bit(&head_sh->dev[i].flags, R5_Overlap);
49314930
if (head_sh->state & handle_flags)
49324931
set_bit(STRIPE_HANDLE, &head_sh->state);
4933-
4934-
if (do_wakeup)
4935-
wake_up(&head_sh->raid_conf->wait_for_overlap);
49364932
}
49374933

49384934
static void handle_stripe(struct stripe_head *sh)
@@ -5198,7 +5194,7 @@ static void handle_stripe(struct stripe_head *sh)
51985194
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
51995195
clear_bit(STRIPE_SYNCING, &sh->state);
52005196
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
5201-
wake_up(&conf->wait_for_overlap);
5197+
wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
52025198
}
52035199

52045200
/* If the failed drives are just a ReadError, then we might need
@@ -5261,7 +5257,7 @@ static void handle_stripe(struct stripe_head *sh)
52615257
} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
52625258
clear_bit(STRIPE_EXPAND_READY, &sh->state);
52635259
atomic_dec(&conf->reshape_stripes);
5264-
wake_up(&conf->wait_for_overlap);
5260+
wake_up(&conf->wait_for_reshape);
52655261
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
52665262
}
52675263

@@ -5755,12 +5751,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
57555751
int d;
57565752
again:
57575753
sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0);
5758-
prepare_to_wait(&conf->wait_for_overlap, &w,
5759-
TASK_UNINTERRUPTIBLE);
57605754
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
57615755
if (test_bit(STRIPE_SYNCING, &sh->state)) {
57625756
raid5_release_stripe(sh);
5763-
schedule();
5757+
wait_on_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap,
5758+
TASK_UNINTERRUPTIBLE);
57645759
goto again;
57655760
}
57665761
clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
@@ -5772,12 +5767,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
57725767
set_bit(R5_Overlap, &sh->dev[d].flags);
57735768
spin_unlock_irq(&sh->stripe_lock);
57745769
raid5_release_stripe(sh);
5775-
schedule();
5770+
wait_on_bit(&sh->dev[d].flags, R5_Overlap,
5771+
TASK_UNINTERRUPTIBLE);
57765772
goto again;
57775773
}
57785774
}
57795775
set_bit(STRIPE_DISCARD, &sh->state);
5780-
finish_wait(&conf->wait_for_overlap, &w);
57815776
sh->overwrite_disks = 0;
57825777
for (d = 0; d < conf->raid_disks; d++) {
57835778
if (d == sh->pd_idx || d == sh->qd_idx)
@@ -5854,7 +5849,6 @@ static int add_all_stripe_bios(struct r5conf *conf,
58545849
struct bio *bi, int forwrite, int previous)
58555850
{
58565851
int dd_idx;
5857-
int ret = 1;
58585852

58595853
spin_lock_irq(&sh->stripe_lock);
58605854

@@ -5870,14 +5864,19 @@ static int add_all_stripe_bios(struct r5conf *conf,
58705864

58715865
if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
58725866
set_bit(R5_Overlap, &dev->flags);
5873-
ret = 0;
5874-
continue;
5867+
spin_unlock_irq(&sh->stripe_lock);
5868+
raid5_release_stripe(sh);
5869+
/* release batch_last before wait to avoid risk of deadlock */
5870+
if (ctx->batch_last) {
5871+
raid5_release_stripe(ctx->batch_last);
5872+
ctx->batch_last = NULL;
5873+
}
5874+
md_wakeup_thread(conf->mddev->thread);
5875+
wait_on_bit(&dev->flags, R5_Overlap, TASK_UNINTERRUPTIBLE);
5876+
return 0;
58755877
}
58765878
}
58775879

5878-
if (!ret)
5879-
goto out;
5880-
58815880
for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
58825881
struct r5dev *dev = &sh->dev[dd_idx];
58835882

@@ -5893,9 +5892,8 @@ static int add_all_stripe_bios(struct r5conf *conf,
58935892
RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
58945893
}
58955894

5896-
out:
58975895
spin_unlock_irq(&sh->stripe_lock);
5898-
return ret;
5896+
return 1;
58995897
}
59005898

59015899
enum reshape_loc {
@@ -5991,17 +5989,17 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
59915989
goto out_release;
59925990
}
59935991

5994-
if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5995-
!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
5996-
/*
5997-
* Stripe is busy expanding or add failed due to
5998-
* overlap. Flush everything and wait a while.
5999-
*/
5992+
if (test_bit(STRIPE_EXPANDING, &sh->state)) {
60005993
md_wakeup_thread(mddev->thread);
60015994
ret = STRIPE_SCHEDULE_AND_RETRY;
60025995
goto out_release;
60035996
}
60045997

5998+
if (!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
5999+
ret = STRIPE_RETRY;
6000+
goto out;
6001+
}
6002+
60056003
if (stripe_can_batch(sh)) {
60066004
stripe_add_to_batch_list(conf, sh, ctx->batch_last);
60076005
if (ctx->batch_last)
@@ -6072,6 +6070,7 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
60726070
static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
60736071
{
60746072
DEFINE_WAIT_FUNC(wait, woken_wake_function);
6073+
bool on_wq;
60756074
struct r5conf *conf = mddev->private;
60766075
sector_t logical_sector;
60776076
struct stripe_request_ctx ctx = {};
@@ -6145,11 +6144,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
61456144
* sequential IO pattern. We don't bother with the optimization when
61466145
* reshaping as the performance benefit is not worth the complexity.
61476146
*/
6148-
if (likely(conf->reshape_progress == MaxSector))
6147+
if (likely(conf->reshape_progress == MaxSector)) {
61496148
logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
6149+
on_wq = false;
6150+
} else {
6151+
add_wait_queue(&conf->wait_for_reshape, &wait);
6152+
on_wq = true;
6153+
}
61506154
s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
61516155

6152-
add_wait_queue(&conf->wait_for_overlap, &wait);
61536156
while (1) {
61546157
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
61556158
bi);
@@ -6160,6 +6163,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
61606163
continue;
61616164

61626165
if (res == STRIPE_SCHEDULE_AND_RETRY) {
6166+
WARN_ON_ONCE(!on_wq);
61636167
/*
61646168
* Must release the reference to batch_last before
61656169
* scheduling and waiting for work to be done,
@@ -6184,7 +6188,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
61846188
logical_sector = ctx.first_sector +
61856189
(s << RAID5_STRIPE_SHIFT(conf));
61866190
}
6187-
remove_wait_queue(&conf->wait_for_overlap, &wait);
6191+
if (unlikely(on_wq))
6192+
remove_wait_queue(&conf->wait_for_reshape, &wait);
61886193

61896194
if (ctx.batch_last)
61906195
raid5_release_stripe(ctx.batch_last);
@@ -6337,7 +6342,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
63376342
: (safepos < writepos && readpos > writepos)) ||
63386343
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
63396344
/* Cannot proceed until we've updated the superblock... */
6340-
wait_event(conf->wait_for_overlap,
6345+
wait_event(conf->wait_for_reshape,
63416346
atomic_read(&conf->reshape_stripes)==0
63426347
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
63436348
if (atomic_read(&conf->reshape_stripes) != 0)
@@ -6363,7 +6368,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
63636368
spin_lock_irq(&conf->device_lock);
63646369
conf->reshape_safe = mddev->reshape_position;
63656370
spin_unlock_irq(&conf->device_lock);
6366-
wake_up(&conf->wait_for_overlap);
6371+
wake_up(&conf->wait_for_reshape);
63676372
sysfs_notify_dirent_safe(mddev->sysfs_completed);
63686373
}
63696374

@@ -6446,7 +6451,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
64466451
(sector_nr - mddev->curr_resync_completed) * 2
64476452
>= mddev->resync_max - mddev->curr_resync_completed) {
64486453
/* Cannot proceed until we've updated the superblock... */
6449-
wait_event(conf->wait_for_overlap,
6454+
wait_event(conf->wait_for_reshape,
64506455
atomic_read(&conf->reshape_stripes) == 0
64516456
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
64526457
if (atomic_read(&conf->reshape_stripes) != 0)
@@ -6472,7 +6477,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
64726477
spin_lock_irq(&conf->device_lock);
64736478
conf->reshape_safe = mddev->reshape_position;
64746479
spin_unlock_irq(&conf->device_lock);
6475-
wake_up(&conf->wait_for_overlap);
6480+
wake_up(&conf->wait_for_reshape);
64766481
sysfs_notify_dirent_safe(mddev->sysfs_completed);
64776482
}
64786483
ret:
@@ -6507,7 +6512,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
65076512
}
65086513

65096514
/* Allow raid5_quiesce to complete */
6510-
wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6515+
wait_event(conf->wait_for_reshape, conf->quiesce != 2);
65116516

65126517
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
65136518
return reshape_request(mddev, sector_nr, skipped);
@@ -7493,7 +7498,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
74937498

74947499
init_waitqueue_head(&conf->wait_for_quiescent);
74957500
init_waitqueue_head(&conf->wait_for_stripe);
7496-
init_waitqueue_head(&conf->wait_for_overlap);
7501+
init_waitqueue_head(&conf->wait_for_reshape);
74977502
INIT_LIST_HEAD(&conf->handle_list);
74987503
INIT_LIST_HEAD(&conf->loprio_list);
74997504
INIT_LIST_HEAD(&conf->hold_list);
@@ -8552,7 +8557,7 @@ static void end_reshape(struct r5conf *conf)
85528557
!test_bit(In_sync, &rdev->flags))
85538558
rdev->recovery_offset = MaxSector;
85548559
spin_unlock_irq(&conf->device_lock);
8555-
wake_up(&conf->wait_for_overlap);
8560+
wake_up(&conf->wait_for_reshape);
85568561

85578562
mddev_update_io_opt(conf->mddev,
85588563
conf->raid_disks - conf->max_degraded);
@@ -8616,13 +8621,13 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
86168621
conf->quiesce = 1;
86178622
unlock_all_device_hash_locks_irq(conf);
86188623
/* allow reshape to continue */
8619-
wake_up(&conf->wait_for_overlap);
8624+
wake_up(&conf->wait_for_reshape);
86208625
} else {
86218626
/* re-enable writes */
86228627
lock_all_device_hash_locks_irq(conf);
86238628
conf->quiesce = 0;
86248629
wake_up(&conf->wait_for_quiescent);
8625-
wake_up(&conf->wait_for_overlap);
8630+
wake_up(&conf->wait_for_reshape);
86268631
unlock_all_device_hash_locks_irq(conf);
86278632
}
86288633
log_quiesce(conf, quiesce);
@@ -8941,7 +8946,7 @@ static void raid5_prepare_suspend(struct mddev *mddev)
89418946
{
89428947
struct r5conf *conf = mddev->private;
89438948

8944-
wake_up(&conf->wait_for_overlap);
8949+
wake_up(&conf->wait_for_reshape);
89458950
}
89468951

89478952
static struct md_personality raid6_personality =

drivers/md/raid5.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -668,7 +668,7 @@ struct r5conf {
668668
struct llist_head released_stripes;
669669
wait_queue_head_t wait_for_quiescent;
670670
wait_queue_head_t wait_for_stripe;
671-
wait_queue_head_t wait_for_overlap;
671+
wait_queue_head_t wait_for_reshape;
672672
unsigned long cache_state;
673673
struct shrinker *shrinker;
674674
int pool_size; /* number of disks in stripeheads in pool */

0 commit comments

Comments
 (0)