
Commit 009836b

Author: Peter Zijlstra
sched/core: Fix migrate_swap() vs. hotplug
On Mon, Jun 02, 2025 at 03:22:13PM +0800, Kuyo Chang wrote:

> So, the potential race scenario is:
>
>	CPU0					CPU1
>	// doing migrate_swap(cpu0/cpu1)
>	stop_two_cpus()
>						...
>						// doing _cpu_down()
>						sched_cpu_deactivate()
>						set_cpu_active(cpu, false);
>						balance_push_set(cpu, true);
>	cpu_stop_queue_two_works
>	    __cpu_stop_queue_work(stopper1,...);
>	    __cpu_stop_queue_work(stopper2,..);
>	    stop_cpus_in_progress -> true
>	    preempt_enable();
>						...
>						1st balance_push
>						stop_one_cpu_nowait
>						cpu_stop_queue_work
>						__cpu_stop_queue_work
>						list_add_tail -> 1st add push_work
>						wake_up_q(&wakeq); -> "wakeq is empty.
>						This implies that the stopper is at wakeq@migrate_swap."
>	preempt_disable
>	wake_up_q(&wakeq);
>	    wake_up_process // wakeup migrate/0
>	        try_to_wake_up
>	            ttwu_queue
>	                ttwu_queue_cond -> meet below case
>	                if (cpu == smp_processor_id())
>	                    return false;
>	                ttwu_do_activate
>	                // migrate/0 wakeup done
>	    wake_up_process // wakeup migrate/1
>	        try_to_wake_up
>	            ttwu_queue
>	                ttwu_queue_cond
>	                    ttwu_queue_wakelist
>	                        __ttwu_queue_wakelist
>	                            __smp_call_single_queue
>	preempt_enable();
>						...
>						2nd balance_push
>						stop_one_cpu_nowait
>						cpu_stop_queue_work
>						__cpu_stop_queue_work
>						list_add_tail -> 2nd add push_work, so the double list add is detected
>						...
>						...
>						cpu1 gets the IPI, does sched_ttwu_pending, wakes up migrate/1

So this balance_push() is part of schedule(), and schedule() is supposed to
switch to the stopper task, but because of this race condition, the stopper
task is stuck in WAKING state and not actually visible to be picked.

Therefore CPU1 can do another schedule() and end up doing another
balance_push() even though the last one hasn't been done yet.

This is a confluence of fail, where both wake_q and ttwu_wakelist can cause
crucial wakeups to be delayed, resulting in the malfunction of balance_push().

Since there is only a single stopper thread to be woken, the wake_q doesn't
really add anything here, and can be removed in favour of direct wakeups of
the stopper thread.

Then add a clause to ttwu_queue_cond() to ensure the stopper threads are
never queued / delayed.

Of all 3 moving parts, the last addition was the balance_push() machinery,
so pick that as the point the bug was introduced.

Fixes: 2558aac ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug")
Reported-by: Kuyo Chang <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Tested-by: Kuyo Chang <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent 3ebb1b6 commit 009836b
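The "wakeq is empty" step in the scenario hinges on wake_q's one-pending-queue-per-task rule: wake_q_add() skips a task whose wake_q node is already claimed by another queue, so the later wake_up_q() wakes nobody and the wakeup is left to whoever claimed the task first. Below is a minimal user-space model of that rule (toy names, simplified non-atomic bookkeeping; not kernel code), showing how balance_push()'s local wakeq stays empty when the migrate_swap() path has already claimed the stopper.

/* toy_wake_q.c - stand-alone model of wake_q's "one pending queue per task"
 * rule; illustrative only, not kernel code.  Build with: cc -o toy_wake_q toy_wake_q.c
 */
#include <stdio.h>
#include <stddef.h>

#define WAKE_Q_TAIL ((struct task *)1)	/* sentinel: "queued, end of list" */

struct task {
	const char *name;
	struct task *wake_q_next;	/* NULL means "not on any wake_q" */
};

struct wake_q {
	struct task *first;
	struct task **lastp;
};

static void wake_q_init(struct wake_q *q)
{
	q->first = WAKE_Q_TAIL;
	q->lastp = &q->first;
}

/* Mirrors the kernel rule: a task can sit on at most one wake_q at a time.
 * If its node is already claimed, the second add is silently skipped. */
static int wake_q_add(struct wake_q *q, struct task *t)
{
	if (t->wake_q_next)		/* already on some wake_q: skip */
		return 0;
	t->wake_q_next = WAKE_Q_TAIL;	/* claim the node */
	*q->lastp = t;
	q->lastp = &t->wake_q_next;
	return 1;
}

static void wake_up_q(struct wake_q *q, const char *who)
{
	struct task *t = q->first;

	while (t != WAKE_Q_TAIL) {
		struct task *next = t->wake_q_next;

		t->wake_q_next = NULL;	/* release the claim ... */
		printf("%s wakes %s\n", who, t->name);	/* ... then wake it */
		t = next;
	}
	wake_q_init(q);
}

int main(void)
{
	struct task stopper = { .name = "migrate/1" };
	struct wake_q swap_q, push_q;

	wake_q_init(&swap_q);
	wake_q_init(&push_q);

	/* The migrate_swap() path queues the stopper first ... */
	wake_q_add(&swap_q, &stopper);

	/* ... so balance_push()'s add finds it already claimed and does nothing,
	 * and flushing push_q wakes nobody: the crucial wakeup is deferred. */
	if (!wake_q_add(&push_q, &stopper))
		printf("balance_push: wake_q_add skipped, push_q stays empty\n");
	wake_up_q(&push_q, "balance_push");

	/* The wakeup only happens when the other path finally flushes. */
	wake_up_q(&swap_q, "migrate_swap");
	return 0;
}

This is why the patch switches to direct wake_up_process() calls: with only a single stopper thread per CPU to wake, the wake_q batching buys nothing and its claim/skip behaviour can delay a wakeup that balance_push() depends on.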

2 files changed, 15 insertions(+), 10 deletions(-)


kernel/sched/core.c

Lines changed: 5 additions & 0 deletions
@@ -3943,6 +3943,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 	if (!scx_allow_ttwu_queue(p))
 		return false;
 
+#ifdef CONFIG_SMP
+	if (p->sched_class == &stop_sched_class)
+		return false;
+#endif
+
 	/*
 	 * Do not complicate things with the async wake_list while the CPU is
 	 * in hotplug state.
kernel/stop_machine.c

Lines changed: 10 additions & 10 deletions
@@ -82,31 +82,29 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
 }
 
 static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
-					struct cpu_stop_work *work,
-					struct wake_q_head *wakeq)
+					struct cpu_stop_work *work)
 {
 	list_add_tail(&work->list, &stopper->works);
-	wake_q_add(wakeq, stopper->thread);
 }
 
 /* queue @work to @stopper. if offline, @work is completed immediately */
 static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
 {
 	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-	DEFINE_WAKE_Q(wakeq);
 	unsigned long flags;
 	bool enabled;
 
 	preempt_disable();
 	raw_spin_lock_irqsave(&stopper->lock, flags);
 	enabled = stopper->enabled;
 	if (enabled)
-		__cpu_stop_queue_work(stopper, work, &wakeq);
+		__cpu_stop_queue_work(stopper, work);
 	else if (work->done)
 		cpu_stop_signal_done(work->done);
 	raw_spin_unlock_irqrestore(&stopper->lock, flags);
 
-	wake_up_q(&wakeq);
+	if (enabled)
+		wake_up_process(stopper->thread);
 	preempt_enable();
 
 	return enabled;
@@ -264,7 +262,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
 {
 	struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
 	struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
-	DEFINE_WAKE_Q(wakeq);
 	int err;
 
 retry:
@@ -300,8 +297,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
 	}
 
 	err = 0;
-	__cpu_stop_queue_work(stopper1, work1, &wakeq);
-	__cpu_stop_queue_work(stopper2, work2, &wakeq);
+	__cpu_stop_queue_work(stopper1, work1);
+	__cpu_stop_queue_work(stopper2, work2);
 
 unlock:
 	raw_spin_unlock(&stopper2->lock);
@@ -316,7 +313,10 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
 		goto retry;
 	}
 
-	wake_up_q(&wakeq);
+	if (!err) {
+		wake_up_process(stopper1->thread);
+		wake_up_process(stopper2->thread);
+	}
 	preempt_enable();
 
 	return err;

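For completeness, here is a small stand-alone illustration (toy code, not the kernel's list or stopper implementation) of the failure mode the scenario ends in: because the first push_work was never consumed, the 2nd balance_push() re-adds a node that is still linked, which is exactly what a double-list-add debug check trips over. The patch avoids ever reaching this state by making sure the stopper wakeups are not delayed.

/* toy_double_add.c - stand-alone illustration of the "double list add"
 * failure mode; toy code only.  Build with: cc -o toy_double_add toy_double_add.c
 */
#include <stdio.h>
#include <stddef.h>

struct list_node {
	struct list_node *prev, *next;
};

static void list_init(struct list_node *head)
{
	head->prev = head->next = head;
}

/* Crude stand-in for a debug-list check: a node that is still linked into
 * some list must not be added again before it has been removed. */
static void list_add_tail_checked(struct list_node *node, struct list_node *head)
{
	if (node->prev || node->next) {
		printf("BUG: double list add detected\n");
		return;
	}
	node->prev = head->prev;
	node->next = head;
	head->prev->next = node;
	head->prev = node;
}

int main(void)
{
	struct list_node works;				/* stand-in for stopper->works */
	struct list_node push_work = { NULL, NULL };	/* stand-in for the per-CPU push_work */

	list_init(&works);

	/* 1st balance_push() queues push_work ... */
	list_add_tail_checked(&push_work, &works);

	/* ... but the stopper never ran (its wakeup was lost), so push_work is
	 * still on the list when a 2nd balance_push() tries to queue it again. */
	list_add_tail_checked(&push_work, &works);
	return 0;
}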