
Commit 58877d3

Author: Peter Zijlstra
sched: Better document ttwu()
Dave hit the problem fixed by commit: b6e13e8 ("sched/core: Fix ttwu() race")
and failed to understand much of the code involved. Per his request a few
comments to (hopefully) clarify things.

Requested-by: Dave Chinner <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent 015dc08 commit 58877d3

File tree: 3 files changed, +179 -31 lines

include/linux/sched.h

Lines changed: 6 additions & 6 deletions
@@ -154,24 +154,24 @@ struct task_group;
  *
  * for (;;) {
  *	set_current_state(TASK_UNINTERRUPTIBLE);
- *	if (!need_sleep)
- *		break;
+ *	if (CONDITION)
+ *		break;
  *
  *	schedule();
  * }
  * __set_current_state(TASK_RUNNING);
  *
  * If the caller does not need such serialisation (because, for instance, the
- * condition test and condition change and wakeup are under the same lock) then
+ * CONDITION test and condition change and wakeup are under the same lock) then
  * use __set_current_state().
  *
  * The above is typically ordered against the wakeup, which does:
  *
- *   need_sleep = false;
+ *   CONDITION = 1;
  *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
  *
- * where wake_up_state() executes a full memory barrier before accessing the
- * task state.
+ * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
+ * accessing p->state.
  *
  * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
  * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
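
To make the CONDITION pattern above concrete, here is a minimal, hypothetical
sketch of both sides as a caller might write them (editorial illustration, not
part of this commit; my_cond and my_waiter are made-up names):

	#include <linux/sched.h>

	static int my_cond;			/* the CONDITION from the comment */
	static struct task_struct *my_waiter;	/* task that blocks on my_cond */

	static void wait_for_cond(void)
	{
		my_waiter = current;
		for (;;) {
			/* store TASK_UNINTERRUPTIBLE, then a full barrier */
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (READ_ONCE(my_cond))		/* re-check after the state store */
				break;
			schedule();			/* sleep until woken */
		}
		__set_current_state(TASK_RUNNING);
	}

	static void signal_cond(void)
	{
		WRITE_ONCE(my_cond, 1);			/* CONDITION = 1 */
		/* full barrier before try_to_wake_up() reads p->state */
		wake_up_state(my_waiter, TASK_UNINTERRUPTIBLE);
	}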

kernel/sched/core.c

Lines changed: 163 additions & 25 deletions
@@ -79,6 +79,100 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
+
+/*
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ *   p->pi_lock
+ *     rq->lock
+ *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ *  rq1->lock
+ *    rq2->lock  where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most elegible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeup, specifically wakeups that involve migration, are horribly
+ * complicated to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ *  - sched_setaffinity()/
+ *    set_cpus_allowed_ptr():	p->cpus_ptr, p->nr_cpus_allowed
+ *  - set_user_nice():		p->se.load, p->*prio
+ *  - __sched_setscheduler():	p->sched_class, p->policy, p->*prio,
+ *				p->se.load, p->rt_priority,
+ *				p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ *  - sched_setnuma():		p->numa_preferred_nid
+ *  - sched_move_task()/
+ *    cpu_cgroup_fork():	p->sched_task_group
+ *  - uclamp_update_active()	p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ *   is changed locklessly using set_current_state(), __set_current_state() or
+ *   set_special_state(), see their respective comments, or by
+ *   try_to_wake_up(). This latter uses p->pi_lock to serialize against
+ *   concurrent self.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ *   is set by activate_task() and cleared by deactivate_task(), under
+ *   rq->lock. Non-zero indicates the task is runnable, the special
+ *   ON_RQ_MIGRATING state is used for migration without holding both
+ *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ *   is set by prepare_task() and cleared by finish_task() such that it will be
+ *   set before p is scheduled-in and cleared after p is scheduled-out, both
+ *   under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ * [ The astute reader will observe that it is possible for two tasks on one
+ *   CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ *  - Don't call set_task_cpu() on a blocked task:
+ *
+ *    We don't care what CPU we're not running on, this simplifies hotplug,
+ *    the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ *  - for try_to_wake_up(), called under p->pi_lock:
+ *
+ *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ *  - for migration called under rq->lock:
+ *    [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ *    o move_queued_task()
+ *    o detach_task()
+ *
+ *  - for migration called under double_rq_lock():
+ *
+ *    o __migrate_swap_task()
+ *    o push_rt_task() / pull_rt_task()
+ *    o push_dl_task() / pull_dl_task()
+ *    o dl_task_offline_migration()
+ *
+ */
+
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
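
As a reading aid (not part of the patch), here is a simplified sketch of the
documented lock order for "special state": take p->pi_lock first, then the
task's rq->lock, roughly what task_rq_lock() does. The retry loop that handles
a concurrent migration between the two acquisitions is omitted:

	/* Simplified illustration only; see the real task_rq_lock(). */
	static struct rq *example_task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	{
		struct rq *rq;

		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);	/* 1) outer: p->pi_lock */
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);			/* 2) inner: rq->lock */
		/* the "Special state" fields listed above are now stable */
		return rq;
	}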
@@ -1543,17 +1637,15 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 {
 	lockdep_assert_held(&rq->lock);
 
-	WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
-	dequeue_task(rq, p, DEQUEUE_NOCLOCK);
+	deactivate_task(rq, p, DEQUEUE_NOCLOCK);
 	set_task_cpu(p, new_cpu);
 	rq_unlock(rq, rf);
 
 	rq = cpu_rq(new_cpu);
 
 	rq_lock(rq, rf);
 	BUG_ON(task_cpu(p) != new_cpu);
-	enqueue_task(rq, p, 0);
-	p->on_rq = TASK_ON_RQ_QUEUED;
+	activate_task(rq, p, 0);
 	check_preempt_curr(rq, p, 0);
 
 	return rq;
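
For context (editorial paraphrase, not part of this diff): the helpers used
above keep p->on_rq in sync with the enqueue/dequeue themselves, which is why
the open-coded stores could be dropped. Roughly:

	/* Paraphrase of activate_task()/deactivate_task(); see kernel/sched/core.c
	 * for the real bodies. */
	static void activate_task_sketch(struct rq *rq, struct task_struct *p, int flags)
	{
		enqueue_task(rq, p, flags);
		p->on_rq = TASK_ON_RQ_QUEUED;		/* runnable again */
	}

	static void deactivate_task_sketch(struct rq *rq, struct task_struct *p, int flags)
	{
		/* migration keeps on_rq non-zero so task_rq_lock() can detect it */
		p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
		dequeue_task(rq, p, flags);
	}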
@@ -2318,12 +2410,31 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 }
 
 /*
- * Called in case the task @p isn't fully descheduled from its runqueue,
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
- * since all we need to do is flip p->state to TASK_RUNNING, since
- * the task is still ->on_rq.
+ * Consider @p being inside a wait loop:
+ *
+ *   for (;;) {
+ *      set_current_state(TASK_UNINTERRUPTIBLE);
+ *
+ *      if (CONDITION)
+ *         break;
+ *
+ *      schedule();
+ *   }
+ *   __set_current_state(TASK_RUNNING);
+ *
+ * between set_current_state() and schedule(). In this case @p is still
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
+ * an atomic manner.
+ *
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
+ * then schedule() must still happen and p->state can be changed to
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
+ * need to do a full wakeup with enqueue.
+ *
+ * Returns: %true when the wakeup is done,
+ *          %false otherwise.
  */
-static int ttwu_remote(struct task_struct *p, int wake_flags)
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
 {
 	struct rq_flags rf;
 	struct rq *rq;
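
The hunk shows only the rename and the new kerneldoc; as an editorial sketch of
the fast path it documents (a hedged paraphrase, not a verbatim copy of the
function body):

	static int ttwu_runnable_sketch(struct task_struct *p, int wake_flags)
	{
		struct rq_flags rf;
		struct rq *rq = __task_rq_lock(p, &rf);	/* serializes against schedule() */
		int ret = 0;

		if (task_on_rq_queued(p)) {
			/* won the race: @p never left the runqueue */
			update_rq_clock(rq);
			ttwu_do_wakeup(rq, p, wake_flags, &rf);	/* p->state = TASK_RUNNING */
			ret = 1;
		}
		__task_rq_unlock(rq, &rf);

		return ret;	/* 0: lost the race, caller does the full wakeup */
	}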
@@ -2464,17 +2575,23 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 
 	return false;
 }
+
+#else /* !CONFIG_SMP */
+
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+	return false;
+}
+
 #endif /* CONFIG_SMP */
 
 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
-#if defined(CONFIG_SMP)
 	if (ttwu_queue_wakelist(p, cpu, wake_flags))
 		return;
-#endif
 
 	rq_lock(rq, &rf);
 	update_rq_clock(rq);
@@ -2530,8 +2647,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * migration. However the means are completely different as there is no lock
  * chain to provide order. Instead we do:
  *
- *   1) smp_store_release(X->on_cpu, 0)
- *   2) smp_cond_load_acquire(!X->on_cpu)
+ *   1) smp_store_release(X->on_cpu, 0)	-- finish_task()
+ *   2) smp_cond_load_acquire(!X->on_cpu)	-- try_to_wake_up()
  *
  * Example:
  *
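
The two primitives named in the annotation pair as follows; this is a hedged
illustration that mirrors finish_task() and try_to_wake_up(), not a verbatim
copy of either:

	/* old CPU, after the context switch away from prev: */
	static void finish_task_sketch(struct task_struct *prev)
	{
		/* 1) publish "prev is off this CPU"; all prior accesses to prev
		 *    are ordered before this store */
		smp_store_release(&prev->on_cpu, 0);
	}

	/* waking CPU, before migrating p elsewhere: */
	static void ttwu_wait_off_cpu_sketch(struct task_struct *p)
	{
		/* 2) spin until on_cpu == 0; the acquire pairs with 1) so the
		 *    waker observes everything the old CPU did with p */
		smp_cond_load_acquire(&p->on_cpu, !VAL);
	}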
@@ -2571,15 +2688,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * @state: the mask of task states that can be woken
  * @wake_flags: wake modifier flags (WF_*)
  *
- * If (@state & @p->state) @p->state = TASK_RUNNING.
+ * Conceptually does:
+ *
+ *   If (@state & @p->state) @p->state = TASK_RUNNING.
  *
  * If the task was not queued/runnable, also place it back on a runqueue.
  *
- * Atomic against schedule() which would dequeue a task, also see
- * set_current_state().
+ * This function is atomic against schedule() which would dequeue the task.
+ *
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
+ *
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
  *
- * This function executes a full memory barrier before accessing the task
- * state; see set_current_state().
+ * Relies on p->pi_lock stabilizing:
+ *  - p->sched_class
+ *  - p->cpus_ptr
+ *  - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
+ *  - ttwu_queue()       -- new rq, for enqueue of the task;
+ *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
+ *
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
  *
  * Return: %true if @p->state changes (an actual wakeup was done),
  *	   %false otherwise.
@@ -2595,7 +2730,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	/*
 	 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
 	 * == smp_processor_id()'. Together this means we can special
-	 * case the whole 'p->on_rq && ttwu_remote()' case below
+	 * case the whole 'p->on_rq && ttwu_runnable()' case below
 	 * without taking any locks.
 	 *
 	 * In particular:
@@ -2616,8 +2751,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	/*
 	 * If we are going to wake up a thread waiting for CONDITION we
 	 * need to ensure that CONDITION=1 done by the caller can not be
-	 * reordered with p->state check below. This pairs with mb() in
-	 * set_current_state() the waiting thread does.
+	 * reordered with p->state check below. This pairs with smp_store_mb()
+	 * in set_current_state() that the waiting thread does.
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	smp_mb__after_spinlock();
@@ -2652,7 +2787,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
 	 */
 	smp_rmb();
-	if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
+	if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
 		goto unlock;
 
 	if (p->in_iowait) {
@@ -3222,17 +3357,20 @@ static inline void prepare_task(struct task_struct *next)
 	/*
 	 * Claim the task as running, we do this before switching to it
 	 * such that any running task will have this set.
+	 *
+	 * See the ttwu() WF_ON_CPU case and its ordering comment.
 	 */
-	next->on_cpu = 1;
+	WRITE_ONCE(next->on_cpu, 1);
 #endif
 }
 
 static inline void finish_task(struct task_struct *prev)
 {
 #ifdef CONFIG_SMP
 	/*
-	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
-	 * We must ensure this doesn't happen until the switch is completely
+	 * This must be the very last reference to @prev from this CPU. After
+	 * p->on_cpu is cleared, the task can be moved to a different CPU. We
+	 * must ensure this doesn't happen until the switch is completely
 	 * finished.
 	 *
 	 * In particular, the load of prev->state in finish_task_switch() must

kernel/sched/sched.h

Lines changed: 10 additions & 0 deletions
@@ -1203,6 +1203,16 @@ struct rq_flags {
 #endif
 };
 
+/*
+ * Lockdep annotation that avoids accidental unlocks; it's like a
+ * sticky/continuous lockdep_assert_held().
+ *
+ * This avoids code that has access to 'struct rq *rq' (basically everything in
+ * the scheduler) from accidentally unlocking the rq if they do not also have a
+ * copy of the (on-stack) 'struct rq_flags rf'.
+ *
+ * Also see Documentation/locking/lockdep-design.rst.
+ */
 static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
 {
 	rf->cookie = lockdep_pin_lock(&rq->lock);
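
For illustration only (a simplified mirror of the rq_lock()/rq_unlock()
wrappers in kernel/sched/sched.h): the pin cookie stored in rf is what makes an
unlock without the matching rq_flags trip lockdep, which is the "sticky
lockdep_assert_held()" behaviour described above:

	static inline void rq_lock_sketch(struct rq *rq, struct rq_flags *rf)
	{
		raw_spin_lock(&rq->lock);
		rq_pin_lock(rq, rf);		/* stash the lockdep pin cookie in rf */
	}

	static inline void rq_unlock_sketch(struct rq *rq, struct rq_flags *rf)
	{
		rq_unpin_lock(rq, rf);		/* needs the matching rf/cookie */
		raw_spin_unlock(&rq->lock);
	}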
