Commit f5a6604

shroffni authored and axboe committed
block: fix lockdep warning caused by lock dependency in elv_iosched_store
Recent lockdep reports [1] have revealed a potential deadlock caused by a lock dependency between the percpu allocator lock and the elevator lock. This issue can be avoided by ensuring that the allocation and release of scheduler tags (sched_tags) are performed outside the elevator lock. Furthermore, the queue does not need to remain frozen during these operations.

To address this, move all sched_tags allocations and deallocations outside of both the ->elevator_lock and the ->freeze_lock. Since the lifetime of the elevator queue and its associated sched_tags is closely tied, the allocated sched_tags are now stored in the elevator queue structure. Then, during the actual elevator switch (which runs under ->freeze_lock and ->elevator_lock), the pre-allocated sched_tags are assigned to the appropriate q->hctx. Once the elevator switch is complete and the locks are released, the old elevator queue and its associated sched_tags are freed.

This commit specifically addresses the allocation/deallocation of sched_tags during elevator switching. Note that sched_tags may also be allocated in other contexts, such as during nr_hw_queues updates. Supporting that use case will require batch allocation/deallocation, which will be handled in a follow-up patch.

This restructuring ensures that sched_tags memory management occurs entirely outside of the ->elevator_lock and ->freeze_lock context, eliminating the lock dependency problem seen during scheduler updates.

[1] https://lore.kernel.org/all/[email protected]/

Reported-by: Stefan Haberland <[email protected]>
Closes: https://lore.kernel.org/all/[email protected]/
Reviewed-by: Ming Lei <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Reviewed-by: Hannes Reinecke <[email protected]>
Signed-off-by: Nilay Shroff <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jens Axboe <[email protected]>
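The core of the fix is an ordering pattern rather than a new lock: do the sleepable sched_tags allocation before taking ->freeze_lock and ->elevator_lock, do nothing but pointer assignment inside the critical section, and free the old tags only after the locks are dropped. Below is a minimal userspace C sketch of that pattern; switch_tags(), struct tags, and active_tags are hypothetical illustration names, not the kernel helpers this commit adds (those are blk_mq_alloc_sched_tags() and blk_mq_free_sched_tags(), shown in the diff).

/*
 * Minimal sketch of the allocate/swap/free ordering (assumed names,
 * not the kernel API): the allocator is never invoked while the
 * "elevator" lock is held, so the allocator's internal lock can
 * never nest inside it.
 */
#include <pthread.h>
#include <stdlib.h>

struct tags {
	int depth;			/* stand-in for real sched_tags state */
};

static pthread_mutex_t elevator_lock = PTHREAD_MUTEX_INITIALIZER;
static struct tags *active_tags;	/* protected by elevator_lock */

static int switch_tags(int depth)
{
	struct tags *new_tags, *old_tags;

	/* Step 1: allocate outside the lock; may block in the allocator. */
	new_tags = malloc(sizeof(*new_tags));
	if (!new_tags)
		return -1;
	new_tags->depth = depth;

	/* Step 2: the critical section only swaps pointers. */
	pthread_mutex_lock(&elevator_lock);
	old_tags = active_tags;
	active_tags = new_tags;
	pthread_mutex_unlock(&elevator_lock);

	/* Step 3: free the old allocation after the lock is dropped. */
	free(old_tags);
	return 0;
}

Because the allocator never runs under the elevator lock, lockdep no longer sees a percpu-allocator-lock dependency nested inside it; the diff below applies the same ordering in elevator_change() and elv_update_nr_hw_queues() around elevator_switch().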
1 parent 4981158 commit f5a6604

File tree

4 files changed, 136 insertions(+), 81 deletions(-)

block/blk-mq-sched.c

Lines changed: 83 additions & 72 deletions
@@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 
-static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
-					  struct blk_mq_hw_ctx *hctx,
-					  unsigned int hctx_idx)
-{
-	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
-		hctx->sched_tags = q->sched_shared_tags;
-		return 0;
-	}
-
-	hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
-						    q->nr_requests);
-
-	if (!hctx->sched_tags)
-		return -ENOMEM;
-	return 0;
-}
-
-static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
-{
-	blk_mq_free_rq_map(queue->sched_shared_tags);
-	queue->sched_shared_tags = NULL;
-}
-
 /* called in queue's release handler, tagset has gone away */
 static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
 {
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->sched_tags) {
-			if (!blk_mq_is_shared_tags(flags))
-				blk_mq_free_rq_map(hctx->sched_tags);
-			hctx->sched_tags = NULL;
-		}
-	}
+	queue_for_each_hw_ctx(q, hctx, i)
+		hctx->sched_tags = NULL;
 
 	if (blk_mq_is_shared_tags(flags))
-		blk_mq_exit_sched_shared_tags(q);
-}
-
-static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
-{
-	struct blk_mq_tag_set *set = queue->tag_set;
-
-	/*
-	 * Set initial depth at max so that we don't need to reallocate for
-	 * updating nr_requests.
-	 */
-	queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
-						BLK_MQ_NO_HCTX_IDX,
-						MAX_SCHED_RQ);
-	if (!queue->sched_shared_tags)
-		return -ENOMEM;
-
-	blk_mq_tag_update_sched_shared_tags(queue);
-
-	return 0;
+		q->sched_shared_tags = NULL;
 }
 
 void blk_mq_sched_reg_debugfs(struct request_queue *q)
@@ -458,49 +411,109 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
 	mutex_unlock(&q->debugfs_mutex);
 }
 
+void blk_mq_free_sched_tags(struct elevator_tags *et,
+		struct blk_mq_tag_set *set)
+{
+	unsigned long i;
+
+	/* Shared tags are stored at index 0 in @tags. */
+	if (blk_mq_is_shared_tags(set->flags))
+		blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX);
+	else {
+		for (i = 0; i < et->nr_hw_queues; i++)
+			blk_mq_free_map_and_rqs(set, et->tags[i], i);
+	}
+
+	kfree(et);
+}
+
+struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+		unsigned int nr_hw_queues)
+{
+	unsigned int nr_tags;
+	int i;
+	struct elevator_tags *et;
+	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	if (blk_mq_is_shared_tags(set->flags))
+		nr_tags = 1;
+	else
+		nr_tags = nr_hw_queues;
+
+	et = kmalloc(sizeof(struct elevator_tags) +
+			nr_tags * sizeof(struct blk_mq_tags *), gfp);
+	if (!et)
+		return NULL;
+	/*
+	 * Default to double of smaller one between hw queue_depth and
+	 * 128, since we don't split into sync/async like the old code
+	 * did. Additionally, this is a per-hw queue depth.
+	 */
+	et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
+			BLKDEV_DEFAULT_RQ);
+	et->nr_hw_queues = nr_hw_queues;
+
+	if (blk_mq_is_shared_tags(set->flags)) {
+		/* Shared tags are stored at index 0 in @tags. */
+		et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
+					MAX_SCHED_RQ);
+		if (!et->tags[0])
+			goto out;
+	} else {
+		for (i = 0; i < et->nr_hw_queues; i++) {
+			et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
+					et->nr_requests);
+			if (!et->tags[i])
+				goto out_unwind;
+		}
+	}
+
+	return et;
+out_unwind:
+	while (--i >= 0)
+		blk_mq_free_map_and_rqs(set, et->tags[i], i);
+out:
+	kfree(et);
+	return NULL;
+}
+
 /* caller must have a reference to @e, will grab another one if successful */
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+		struct elevator_tags *et)
 {
 	unsigned int flags = q->tag_set->flags;
 	struct blk_mq_hw_ctx *hctx;
 	struct elevator_queue *eq;
 	unsigned long i;
 	int ret;
 
-	/*
-	 * Default to double of smaller one between hw queue_depth and 128,
-	 * since we don't split into sync/async like the old code did.
-	 * Additionally, this is a per-hw queue depth.
-	 */
-	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
-				   BLKDEV_DEFAULT_RQ);
-
-	eq = elevator_alloc(q, e);
+	eq = elevator_alloc(q, e, et);
 	if (!eq)
 		return -ENOMEM;
 
+	q->nr_requests = et->nr_requests;
+
 	if (blk_mq_is_shared_tags(flags)) {
-		ret = blk_mq_init_sched_shared_tags(q);
-		if (ret)
-			goto err_put_elevator;
+		/* Shared tags are stored at index 0 in @et->tags. */
+		q->sched_shared_tags = et->tags[0];
+		blk_mq_tag_update_sched_shared_tags(q);
 	}
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
-		if (ret)
-			goto err_free_map_and_rqs;
+		if (blk_mq_is_shared_tags(flags))
+			hctx->sched_tags = q->sched_shared_tags;
+		else
+			hctx->sched_tags = et->tags[i];
 	}
 
 	ret = e->ops.init_sched(q, eq);
 	if (ret)
-		goto err_free_map_and_rqs;
+		goto out;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (e->ops.init_hctx) {
 			ret = e->ops.init_hctx(hctx, i);
 			if (ret) {
-				eq = q->elevator;
-				blk_mq_sched_free_rqs(q);
 				blk_mq_exit_sched(q, eq);
 				kobject_put(&eq->kobj);
 				return ret;
@@ -509,10 +522,8 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 	}
 	return 0;
 
-err_free_map_and_rqs:
-	blk_mq_sched_free_rqs(q);
+out:
 	blk_mq_sched_tags_teardown(q, flags);
-err_put_elevator:
 	kobject_put(&eq->kobj);
 	q->elevator = NULL;
 	return ret;

block/blk-mq-sched.h

Lines changed: 7 additions & 1 deletion
@@ -18,10 +18,16 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+		struct elevator_tags *et);
 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
 void blk_mq_sched_free_rqs(struct request_queue *q);
 
+struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+		unsigned int nr_hw_queues);
+void blk_mq_free_sched_tags(struct elevator_tags *et,
+		struct blk_mq_tag_set *set);
+
 static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
 	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))

block/elevator.c

Lines changed: 34 additions & 6 deletions
@@ -54,6 +54,8 @@ struct elv_change_ctx {
 	struct elevator_queue *old;
 	/* for registering new elevator */
 	struct elevator_queue *new;
+	/* holds sched tags data */
+	struct elevator_tags *et;
 };
 
 static DEFINE_SPINLOCK(elv_list_lock);
@@ -132,7 +134,7 @@ static struct elevator_type *elevator_find_get(const char *name)
 static const struct kobj_type elv_ktype;
 
 struct elevator_queue *elevator_alloc(struct request_queue *q,
-				  struct elevator_type *e)
+		struct elevator_type *e, struct elevator_tags *et)
 {
 	struct elevator_queue *eq;
 
@@ -145,6 +147,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
 	kobject_init(&eq->kobj, &elv_ktype);
 	mutex_init(&eq->sysfs_lock);
 	hash_init(eq->hash);
+	eq->et = et;
 
 	return eq;
 }
@@ -165,7 +168,6 @@ static void elevator_exit(struct request_queue *q)
 	lockdep_assert_held(&q->elevator_lock);
 
 	ioc_clear_queue(q);
-	blk_mq_sched_free_rqs(q);
 
 	mutex_lock(&e->sysfs_lock);
 	blk_mq_exit_sched(q, e);
@@ -591,7 +593,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
 	}
 
 	if (new_e) {
-		ret = blk_mq_init_sched(q, new_e);
+		ret = blk_mq_init_sched(q, new_e, ctx->et);
 		if (ret)
 			goto out_unfreeze;
 		ctx->new = q->elevator;
@@ -626,8 +628,10 @@ static void elv_exit_and_release(struct request_queue *q)
 	elevator_exit(q);
 	mutex_unlock(&q->elevator_lock);
 	blk_mq_unfreeze_queue(q, memflags);
-	if (e)
+	if (e) {
+		blk_mq_free_sched_tags(e->et, q->tag_set);
 		kobject_put(&e->kobj);
+	}
 }
 
 static int elevator_change_done(struct request_queue *q,
@@ -640,6 +644,7 @@ static int elevator_change_done(struct request_queue *q,
 				&ctx->old->flags);
 
 		elv_unregister_queue(q, ctx->old);
+		blk_mq_free_sched_tags(ctx->old->et, q->tag_set);
 		kobject_put(&ctx->old->kobj);
 		if (enable_wbt)
 			wbt_enable_default(q->disk);
@@ -658,9 +663,16 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
 static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
 {
 	unsigned int memflags;
+	struct blk_mq_tag_set *set = q->tag_set;
 	int ret = 0;
 
-	lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
+	lockdep_assert_held(&set->update_nr_hwq_lock);
+
+	if (strncmp(ctx->name, "none", 4)) {
+		ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
+		if (!ctx->et)
+			return -ENOMEM;
+	}
 
 	memflags = blk_mq_freeze_queue(q);
 	/*
@@ -680,6 +692,11 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
 	blk_mq_unfreeze_queue(q, memflags);
 	if (!ret)
 		ret = elevator_change_done(q, ctx);
+	/*
+	 * Free sched tags if it's allocated but we couldn't switch elevator.
+	 */
+	if (ctx->et && !ctx->new)
+		blk_mq_free_sched_tags(ctx->et, set);
 
 	return ret;
 }
@@ -690,22 +707,33 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
  */
 void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
 {
+	struct blk_mq_tag_set *set = q->tag_set;
 	struct elv_change_ctx ctx = {};
 	int ret = -ENODEV;
 
 	WARN_ON_ONCE(q->mq_freeze_depth == 0);
 
 	if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
 		ctx.name = e->elevator_name;
-
+		ctx.et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
+		if (!ctx.et) {
+			WARN_ON_ONCE(1);
+			goto unfreeze;
+		}
 		mutex_lock(&q->elevator_lock);
 		/* force to reattach elevator after nr_hw_queue is updated */
 		ret = elevator_switch(q, &ctx);
 		mutex_unlock(&q->elevator_lock);
 	}
+unfreeze:
 	blk_mq_unfreeze_queue_nomemrestore(q);
 	if (!ret)
 		WARN_ON_ONCE(elevator_change_done(q, &ctx));
+	/*
+	 * Free sched tags if it's allocated but we couldn't switch elevator.
+	 */
+	if (ctx.et && !ctx.new)
+		blk_mq_free_sched_tags(ctx.et, set);
 }
 
 /*

block/elevator.h

Lines changed: 12 additions & 2 deletions
@@ -23,6 +23,15 @@ enum elv_merge {
 struct blk_mq_alloc_data;
 struct blk_mq_hw_ctx;
 
+struct elevator_tags {
+	/* num. of hardware queues for which tags are allocated */
+	unsigned int nr_hw_queues;
+	/* depth used while allocating tags */
+	unsigned int nr_requests;
+	/* shared tag is stored at index 0 */
+	struct blk_mq_tags *tags[];
+};
+
 struct elevator_mq_ops {
 	int (*init_sched)(struct request_queue *, struct elevator_queue *);
 	void (*exit_sched)(struct elevator_queue *);
@@ -113,6 +122,7 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
 struct elevator_queue
 {
 	struct elevator_type *type;
+	struct elevator_tags *et;
 	void *elevator_data;
 	struct kobject kobj;
 	struct mutex sysfs_lock;
@@ -152,8 +162,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *page);
 ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
 
 extern bool elv_bio_merge_ok(struct request *, struct bio *);
-extern struct elevator_queue *elevator_alloc(struct request_queue *,
-		struct elevator_type *);
+struct elevator_queue *elevator_alloc(struct request_queue *,
+		struct elevator_type *, struct elevator_tags *);
 
 /*
  * Helper functions.
