Skip to content

Commit ad39c08

Browse files
YuKuai-huawei and liu-song-6
authored and committed
md: Don't register sync_thread for reshape directly
Currently, if a reshape is interrupted, reassembling the array will register sync_thread directly from pers->run(). In this case 'MD_RECOVERY_RUNNING' is set directly; however, there is no guarantee that md_do_sync() will be executed, hence stop_sync_thread() will hang because 'MD_RECOVERY_RUNNING' can't be cleared.

The previous patch makes sure that md_do_sync() will set MD_RECOVERY_DONE; however, the following hang can still be triggered occasionally by the dm-raid test shell/lvconvert-raid-reshape.sh:

[root@fedora ~]# cat /proc/1982/stack
[<0>] stop_sync_thread+0x1ab/0x270 [md_mod]
[<0>] md_frozen_sync_thread+0x5c/0xa0 [md_mod]
[<0>] raid_presuspend+0x1e/0x70 [dm_raid]
[<0>] dm_table_presuspend_targets+0x40/0xb0 [dm_mod]
[<0>] __dm_destroy+0x2a5/0x310 [dm_mod]
[<0>] dm_destroy+0x16/0x30 [dm_mod]
[<0>] dev_remove+0x165/0x290 [dm_mod]
[<0>] ctl_ioctl+0x4bb/0x7b0 [dm_mod]
[<0>] dm_ctl_ioctl+0x11/0x20 [dm_mod]
[<0>] vfs_ioctl+0x21/0x60
[<0>] __x64_sys_ioctl+0xb9/0xe0
[<0>] do_syscall_64+0xc6/0x230
[<0>] entry_SYSCALL_64_after_hwframe+0x6c/0x74

Meanwhile mddev->recovery is:
MD_RECOVERY_RUNNING | MD_RECOVERY_INTR | MD_RECOVERY_RESHAPE | MD_RECOVERY_FROZEN

Fix this problem by removing the code that registers sync_thread directly from raid10 and raid5, and let md_check_recovery() register sync_thread instead.

Fixes: f670557 ("[PATCH] md: Checkpoint and allow restart of raid5 reshape")
Fixes: f52f5c7 ("md: fix stopping sync thread")
Cc: [email protected] # v6.7+
Signed-off-by: Yu Kuai <[email protected]>
Signed-off-by: Song Liu <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent 82ec0ae commit ad39c08

File tree

3 files changed

+8
-42
lines changed

3 files changed

+8
-42
lines changed

drivers/md/md.c

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9376,6 +9376,7 @@ static void md_start_sync(struct work_struct *ws)
93769376
struct mddev *mddev = container_of(ws, struct mddev, sync_work);
93779377
int spares = 0;
93789378
bool suspend = false;
9379+
char *name;
93799380

93809381
if (md_spares_need_change(mddev))
93819382
suspend = true;
@@ -9408,8 +9409,10 @@ static void md_start_sync(struct work_struct *ws)
94089409
if (spares)
94099410
md_bitmap_write_all(mddev->bitmap);
94109411

9412+
name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
9413+
"reshape" : "resync";
94119414
rcu_assign_pointer(mddev->sync_thread,
9412-
md_register_thread(md_do_sync, mddev, "resync"));
9415+
md_register_thread(md_do_sync, mddev, name));
94139416
if (!mddev->sync_thread) {
94149417
pr_warn("%s: could not start resync thread...\n",
94159418
mdname(mddev));

drivers/md/raid10.c

Lines changed: 2 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -4175,11 +4175,7 @@ static int raid10_run(struct mddev *mddev)
41754175
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
41764176
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
41774177
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4178-
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4179-
rcu_assign_pointer(mddev->sync_thread,
4180-
md_register_thread(md_do_sync, mddev, "reshape"));
4181-
if (!mddev->sync_thread)
4182-
goto out_free_conf;
4178+
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
41834179
}
41844180

41854181
return 0;
@@ -4573,16 +4569,8 @@ static int raid10_start_reshape(struct mddev *mddev)
45734569
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
45744570
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
45754571
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4576-
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4577-
4578-
rcu_assign_pointer(mddev->sync_thread,
4579-
md_register_thread(md_do_sync, mddev, "reshape"));
4580-
if (!mddev->sync_thread) {
4581-
ret = -EAGAIN;
4582-
goto abort;
4583-
}
4572+
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
45844573
conf->reshape_checkpoint = jiffies;
4585-
md_wakeup_thread(mddev->sync_thread);
45864574
md_new_event();
45874575
return 0;
45884576

drivers/md/raid5.c

Lines changed: 2 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -7936,11 +7936,7 @@ static int raid5_run(struct mddev *mddev)
79367936
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
79377937
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
79387938
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7939-
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7940-
rcu_assign_pointer(mddev->sync_thread,
7941-
md_register_thread(md_do_sync, mddev, "reshape"));
7942-
if (!mddev->sync_thread)
7943-
goto abort;
7939+
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
79447940
}
79457941

79467942
/* Ok, everything is just fine now */
@@ -8506,29 +8502,8 @@ static int raid5_start_reshape(struct mddev *mddev)
85068502
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
85078503
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
85088504
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8509-
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8510-
rcu_assign_pointer(mddev->sync_thread,
8511-
md_register_thread(md_do_sync, mddev, "reshape"));
8512-
if (!mddev->sync_thread) {
8513-
mddev->recovery = 0;
8514-
spin_lock_irq(&conf->device_lock);
8515-
write_seqcount_begin(&conf->gen_lock);
8516-
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
8517-
mddev->new_chunk_sectors =
8518-
conf->chunk_sectors = conf->prev_chunk_sectors;
8519-
mddev->new_layout = conf->algorithm = conf->prev_algo;
8520-
rdev_for_each(rdev, mddev)
8521-
rdev->new_data_offset = rdev->data_offset;
8522-
smp_wmb();
8523-
conf->generation --;
8524-
conf->reshape_progress = MaxSector;
8525-
mddev->reshape_position = MaxSector;
8526-
write_seqcount_end(&conf->gen_lock);
8527-
spin_unlock_irq(&conf->device_lock);
8528-
return -EAGAIN;
8529-
}
8505+
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
85308506
conf->reshape_checkpoint = jiffies;
8531-
md_wakeup_thread(mddev->sync_thread);
85328507
md_new_event();
85338508
return 0;
85348509
}

0 commit comments

Comments
 (0)