Commit 5d726c4

YuKuai-huawei authored and axboe committed
blk-cgroup: fix possible deadlock while configuring policy
The following deadlock can easily be triggered, as reported by lockdep:

WARNING: possible circular locking dependency detected
6.17.0-rc3-00124-ga12c2658ced0 #1665 Not tainted
------------------------------------------------------
check/1334 is trying to acquire lock:
ff1100011d9d0678 (&q->sysfs_lock){+.+.}-{4:4}, at: blk_unregister_queue+0x53/0x180

but task is already holding lock:
ff1100011d9d00e0 (&q->q_usage_counter(queue)#3){++++}-{0:0}, at: del_gendisk+0xba/0x110

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (&q->q_usage_counter(queue)#3){++++}-{0:0}:
       blk_queue_enter+0x40b/0x470
       blkg_conf_prep+0x7b/0x3c0
       tg_set_limit+0x10a/0x3e0
       cgroup_file_write+0xc6/0x420
       kernfs_fop_write_iter+0x189/0x280
       vfs_write+0x256/0x490
       ksys_write+0x83/0x190
       __x64_sys_write+0x21/0x30
       x64_sys_call+0x4608/0x4630
       do_syscall_64+0xdb/0x6b0
       entry_SYSCALL_64_after_hwframe+0x76/0x7e

-> #1 (&q->rq_qos_mutex){+.+.}-{4:4}:
       __mutex_lock+0xd8/0xf50
       mutex_lock_nested+0x2b/0x40
       wbt_init+0x17e/0x280
       wbt_enable_default+0xe9/0x140
       blk_register_queue+0x1da/0x2e0
       __add_disk+0x38c/0x5d0
       add_disk_fwnode+0x89/0x250
       device_add_disk+0x18/0x30
       virtblk_probe+0x13a3/0x1800
       virtio_dev_probe+0x389/0x610
       really_probe+0x136/0x620
       __driver_probe_device+0xb3/0x230
       driver_probe_device+0x2f/0xe0
       __driver_attach+0x158/0x250
       bus_for_each_dev+0xa9/0x130
       driver_attach+0x26/0x40
       bus_add_driver+0x178/0x3d0
       driver_register+0x7d/0x1c0
       __register_virtio_driver+0x2c/0x60
       virtio_blk_init+0x6f/0xe0
       do_one_initcall+0x94/0x540
       kernel_init_freeable+0x56a/0x7b0
       kernel_init+0x2b/0x270
       ret_from_fork+0x268/0x4c0
       ret_from_fork_asm+0x1a/0x30

-> #0 (&q->sysfs_lock){+.+.}-{4:4}:
       __lock_acquire+0x1835/0x2940
       lock_acquire+0xf9/0x450
       __mutex_lock+0xd8/0xf50
       mutex_lock_nested+0x2b/0x40
       blk_unregister_queue+0x53/0x180
       __del_gendisk+0x226/0x690
       del_gendisk+0xba/0x110
       sd_remove+0x49/0xb0 [sd_mod]
       device_remove+0x87/0xb0
       device_release_driver_internal+0x11e/0x230
       device_release_driver+0x1a/0x30
       bus_remove_device+0x14d/0x220
       device_del+0x1e1/0x5a0
       __scsi_remove_device+0x1ff/0x2f0
       scsi_remove_device+0x37/0x60
       sdev_store_delete+0x77/0x100
       dev_attr_store+0x1f/0x40
       sysfs_kf_write+0x65/0x90
       kernfs_fop_write_iter+0x189/0x280
       vfs_write+0x256/0x490
       ksys_write+0x83/0x190
       __x64_sys_write+0x21/0x30
       x64_sys_call+0x4608/0x4630
       do_syscall_64+0xdb/0x6b0
       entry_SYSCALL_64_after_hwframe+0x76/0x7e

other info that might help us debug this:

Chain exists of:
  &q->sysfs_lock --> &q->rq_qos_mutex --> &q->q_usage_counter(queue)#3

Possible unsafe locking scenario:

      CPU0                    CPU1
      ----                    ----
 lock(&q->q_usage_counter(queue)#3);
                              lock(&q->rq_qos_mutex);
                              lock(&q->q_usage_counter(queue)#3);
 lock(&q->sysfs_lock);

The root cause is that q_usage_counter is grabbed with rq_qos_mutex held
in blkg_conf_prep(), while the queue must be frozen before rq_qos_mutex
is taken in other contexts. The blk_queue_enter() call in
blkg_conf_prep() only protects against policy deactivation, which is
already serialized by blkcg_mutex, so convert blk_queue_enter() to
blkcg_mutex to break the cycle. In addition, since blkcg_mutex is held
after the queue is frozen during policy deactivation, convert
blkg_alloc() to use GFP_NOIO so the allocation cannot recurse into the
frozen queue via memory reclaim.

Signed-off-by: Yu Kuai <[email protected]>
Reviewed-by: Ming Lei <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
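To make the cycle concrete, here is a minimal userspace sketch (an editorial illustration, not kernel code) that models the three lock orderings from the report with pthread mutexes. The function names register_path/config_path/remove_path are hypothetical, and usage_counter stands in for the queue freeze protection taken by blk_queue_enter():

/* Illustration of the lock cycle flagged by lockdep above. pthread
 * mutexes stand in for q->sysfs_lock, q->rq_qos_mutex and the queue
 * freeze protection (q->q_usage_counter / blk_queue_enter()). */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t sysfs_lock    = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_qos_mutex  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t usage_counter = PTHREAD_MUTEX_INITIALIZER;

/* blk_register_queue() path: sysfs_lock -> rq_qos_mutex (via wbt_init) */
static void register_path(void)
{
	pthread_mutex_lock(&sysfs_lock);
	pthread_mutex_lock(&rq_qos_mutex);
	pthread_mutex_unlock(&rq_qos_mutex);
	pthread_mutex_unlock(&sysfs_lock);
}

/* tg_set_limit() path: rq_qos_mutex -> usage_counter; this is the edge
 * the commit removes by switching blkg_conf_prep() to blkcg_mutex */
static void config_path(void)
{
	pthread_mutex_lock(&rq_qos_mutex);
	pthread_mutex_lock(&usage_counter);	/* old blk_queue_enter() */
	pthread_mutex_unlock(&usage_counter);
	pthread_mutex_unlock(&rq_qos_mutex);
}

/* del_gendisk() path: usage_counter -> sysfs_lock (queue frozen first) */
static void remove_path(void)
{
	pthread_mutex_lock(&usage_counter);
	pthread_mutex_lock(&sysfs_lock);
	pthread_mutex_unlock(&sysfs_lock);
	pthread_mutex_unlock(&usage_counter);
}

int main(void)
{
	/* Run sequentially, as here, the orderings are harmless; run the
	 * three paths on separate threads and the cycle sysfs_lock ->
	 * rq_qos_mutex -> usage_counter -> sysfs_lock can deadlock. */
	register_path();
	config_path();
	remove_path();
	printf("three orderings form a cycle; removing any edge breaks it\n");
	return 0;
}

The fix below removes the rq_qos_mutex -> q_usage_counter edge, which is exactly one edge of this cycle.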
1 parent 670bfe6 commit 5d726c4

File tree

1 file changed: +8 -15 lines

block/blk-cgroup.c

Lines changed: 8 additions & 15 deletions
@@ -877,14 +877,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	disk = ctx->bdev->bd_disk;
 	q = disk->queue;
 
-	/*
-	 * blkcg_deactivate_policy() requires queue to be frozen, we can grab
-	 * q_usage_counter to prevent concurrent with blkcg_deactivate_policy().
-	 */
-	ret = blk_queue_enter(q, 0);
-	if (ret)
-		goto fail;
-
+	/* Prevent concurrent with blkcg_deactivate_policy() */
+	mutex_lock(&q->blkcg_mutex);
 	spin_lock_irq(&q->queue_lock);
 
 	if (!blkcg_policy_enabled(q, pol)) {
@@ -914,16 +908,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	/* Drop locks to do new blkg allocation with GFP_KERNEL. */
 	spin_unlock_irq(&q->queue_lock);
 
-	new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
+	new_blkg = blkg_alloc(pos, disk, GFP_NOIO);
 	if (unlikely(!new_blkg)) {
 		ret = -ENOMEM;
-		goto fail_exit_queue;
+		goto fail_exit;
 	}
 
 	if (radix_tree_preload(GFP_KERNEL)) {
 		blkg_free(new_blkg);
 		ret = -ENOMEM;
-		goto fail_exit_queue;
+		goto fail_exit;
 	}
 
 	spin_lock_irq(&q->queue_lock);
@@ -951,17 +945,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		goto success;
 	}
 success:
-	blk_queue_exit(q);
+	mutex_unlock(&q->blkcg_mutex);
 	ctx->blkg = blkg;
 	return 0;
 
 fail_preloaded:
 	radix_tree_preload_end();
 fail_unlock:
 	spin_unlock_irq(&q->queue_lock);
-fail_exit_queue:
-	blk_queue_exit(q);
-fail:
+fail_exit:
+	mutex_unlock(&q->blkcg_mutex);
 	/*
 	 * If queue was bypassing, we should retry. Do so after a
 	 * short msleep(). It isn't strictly necessary but queue
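On the GFP_NOIO change: once a task holds blkcg_mutex, policy deactivation may already have frozen the queue, so a GFP_KERNEL allocation that enters direct reclaim could issue I/O that waits on the frozen queue and deadlock. For comparison, a hypothetical alternative (not part of this commit, shown as a non-compilable kernel fragment) would be the scoped memalloc_noio_save()/memalloc_noio_restore() API, which lets callees keep passing GFP_KERNEL:

	/* Hypothetical alternative, not from this commit: scoped NOIO */
	unsigned int noio_flags;

	mutex_lock(&q->blkcg_mutex);
	noio_flags = memalloc_noio_save();	/* reclaim will not issue I/O */
	new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
	memalloc_noio_restore(noio_flags);

Either form enforces the same invariant: no allocation performed under blkcg_mutex may depend on forward progress of the queue being configured.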
