@@ -79,6 +79,100 @@ __read_mostly int scheduler_running;
 */
 int sysctl_sched_rt_runtime = 950000;
 
+
+/*
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ *   p->pi_lock
+ *     rq->lock
+ *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ *  rq1->lock
+ *    rq2->lock  where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most eligible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeup, specifically wakeups that involve migration, are horribly
+ * complicated to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ *  - sched_setaffinity()/
+ *    set_cpus_allowed_ptr():   p->cpus_ptr, p->nr_cpus_allowed
+ *  - set_user_nice():          p->se.load, p->*prio
+ *  - __sched_setscheduler():   p->sched_class, p->policy, p->*prio,
+ *                              p->se.load, p->rt_priority,
+ *                              p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ *  - sched_setnuma():          p->numa_preferred_nid
+ *  - sched_move_task()/
+ *    cpu_cgroup_fork():        p->sched_task_group
+ *  - uclamp_update_active()    p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ *   is changed locklessly using set_current_state(), __set_current_state() or
+ *   set_special_state(), see their respective comments, or by
+ *   try_to_wake_up(). This latter uses p->pi_lock to serialize against
+ *   concurrent self.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ *   is set by activate_task() and cleared by deactivate_task(), under
+ *   rq->lock. Non-zero indicates the task is runnable, the special
+ *   ON_RQ_MIGRATING state is used for migration without holding both
+ *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ *   is set by prepare_task() and cleared by finish_task() such that it will be
+ *   set before p is scheduled-in and cleared after p is scheduled-out, both
+ *   under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ * [ The astute reader will observe that it is possible for two tasks on one
+ *   CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ *  - Don't call set_task_cpu() on a blocked task:
+ *
+ *    We don't care what CPU we're not running on, this simplifies hotplug,
+ *    the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ *  - for try_to_wake_up(), called under p->pi_lock:
+ *
+ *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ *  - for migration called under rq->lock:
+ *    [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ *    o move_queued_task()
+ *    o detach_task()
+ *
+ *  - for migration called under double_rq_lock():
+ *
+ *    o __migrate_swap_task()
+ *    o push_rt_task() / pull_rt_task()
+ *    o push_dl_task() / pull_dl_task()
+ *    o dl_task_offline_migration()
+ *
+ */
+
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
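
The "Special state" rules above lean on task_rq_lock() taking p->pi_lock first and then the rq->lock of whatever runqueue the task currently appears on. A minimal sketch of that pattern, assuming the task_rq() and task_on_rq_migrating() helpers from kernel/sched/sched.h; the real task_rq_lock() carries lockdep annotations and an ordering comment, so treat this as illustrative rather than the exact implementation:

static struct rq *task_rq_lock_sketch(struct task_struct *p, struct rq_flags *rf)
{
	struct rq *rq;

	for (;;) {
		/* Lock order: p->pi_lock, then rq->lock. */
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);

		/*
		 * The pair is only useful if this is still the task's rq and
		 * the task is not mid-migration (p->on_rq == TASK_ON_RQ_MIGRATING);
		 * otherwise drop both locks and retry.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
			return rq;

		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		while (task_on_rq_migrating(p))
			cpu_relax();
	}
}
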
@@ -1543,17 +1637,15 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 {
 	lockdep_assert_held(&rq->lock);
 
-	WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
-	dequeue_task(rq, p, DEQUEUE_NOCLOCK);
+	deactivate_task(rq, p, DEQUEUE_NOCLOCK);
 	set_task_cpu(p, new_cpu);
 	rq_unlock(rq, rf);
 
 	rq = cpu_rq(new_cpu);
 
 	rq_lock(rq, rf);
 	BUG_ON(task_cpu(p) != new_cpu);
-	enqueue_task(rq, p, 0);
-	p->on_rq = TASK_ON_RQ_QUEUED;
+	activate_task(rq, p, 0);
 	check_preempt_curr(rq, p, 0);
 
 	return rq;
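
The open-coded p->on_rq stores removed here are presumably absorbed by activate_task()/deactivate_task(). A sketch of what those helpers are assumed to do in mainline around this point (plain stores shown; some versions use WRITE_ONCE() here):

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	enqueue_task(rq, p, flags);

	p->on_rq = TASK_ON_RQ_QUEUED;
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	/* Migration keeps on_rq non-zero so task_rq_lock() can detect it. */
	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;

	dequeue_task(rq, p, flags);
}
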
@@ -2318,12 +2410,31 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 }
 
 /*
- * Called in case the task @p isn't fully descheduled from its runqueue,
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
- * since all we need to do is flip p->state to TASK_RUNNING, since
- * the task is still ->on_rq.
+ * Consider @p being inside a wait loop:
+ *
+ *   for (;;) {
+ *      set_current_state(TASK_UNINTERRUPTIBLE);
+ *
+ *      if (CONDITION)
+ *         break;
+ *
+ *      schedule();
+ *   }
+ *   __set_current_state(TASK_RUNNING);
+ *
+ * between set_current_state() and schedule(). In this case @p is still
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
+ * an atomic manner.
+ *
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
+ * then schedule() must still happen and p->state can be changed to
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
+ * need to do a full wakeup with enqueue.
+ *
+ * Returns: %true when the wakeup is done,
+ *          %false otherwise.
  */
-static int ttwu_remote(struct task_struct *p, int wake_flags)
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
 {
 	struct rq_flags rf;
 	struct rq *rq;
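
To make the race described in the new comment concrete, the waker side of such a wait loop conventionally looks like the sketch below; CONDITION and waker_side_sketch() are placeholders for illustration, not identifiers from this patch:

static int CONDITION;			/* placeholder wait condition */

static void waker_side_sketch(struct task_struct *p)
{
	/*
	 * Publish the condition, then wake @p. wake_up_process() ends up in
	 * try_to_wake_up(p, TASK_NORMAL, 0), which issues a full barrier
	 * before reading p->state. If @p is still between set_current_state()
	 * and schedule(), ttwu_runnable() only has to flip p->state back to
	 * TASK_RUNNING; otherwise a full wakeup with enqueue follows.
	 */
	WRITE_ONCE(CONDITION, 1);
	wake_up_process(p);
}
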
@@ -2464,17 +2575,23 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 
 	return false;
 }
+
+#else /* !CONFIG_SMP */
+
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+	return false;
+}
+
 #endif /* CONFIG_SMP */
 
 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
-#if defined(CONFIG_SMP)
 	if (ttwu_queue_wakelist(p, cpu, wake_flags))
 		return;
-#endif
 
 	rq_lock(rq, &rf);
 	update_rq_clock(rq);
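
The new !CONFIG_SMP stub is what lets the #ifdef pair disappear from ttwu_queue(). The same idiom in miniature, with hypothetical names rather than code from this patch:

#ifdef CONFIG_SMP
static bool try_remote_queue(int cpu)
{
	return cpu != raw_smp_processor_id();	/* stand-in for the real policy */
}
#else
static inline bool try_remote_queue(int cpu)
{
	return false;				/* UP: never queue remotely */
}
#endif

static void queue_somewhere(int cpu)
{
	if (try_remote_queue(cpu))
		return;
	/* ... fall back to queueing locally ... */
}
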
@@ -2530,8 +2647,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * migration. However the means are completely different as there is no lock
  * chain to provide order. Instead we do:
  *
- *   1) smp_store_release(X->on_cpu, 0)
- *   2) smp_cond_load_acquire(!X->on_cpu)
+ *   1) smp_store_release(X->on_cpu, 0)   -- finish_task()
+ *   2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
  *
  * Example:
  *
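
Spelled out, the pairing those two annotated lines refer to is roughly the following two sides, shown in isolation with the surrounding finish_task() and try_to_wake_up() code omitted:

	/* finish_task(), on the CPU scheduling @p out: */
	smp_store_release(&p->on_cpu, 0);		/* RELEASE: prior updates to @p visible first */

	/* try_to_wake_up(), on the waking CPU: */
	smp_cond_load_acquire(&p->on_cpu, !VAL);	/* ACQUIRE: spin until ->on_cpu == 0 */
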
@@ -2571,15 +2688,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * @state: the mask of task states that can be woken
  * @wake_flags: wake modifier flags (WF_*)
  *
- * If (@state & @p->state) @p->state = TASK_RUNNING.
+ * Conceptually does:
+ *
+ *   If (@state & @p->state) @p->state = TASK_RUNNING.
  *
  * If the task was not queued/runnable, also place it back on a runqueue.
  *
- * Atomic against schedule() which would dequeue a task, also see
- * set_current_state().
+ * This function is atomic against schedule() which would dequeue the task.
+ *
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
+ *
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
  *
- * This function executes a full memory barrier before accessing the task
- * state; see set_current_state().
+ * Relies on p->pi_lock stabilizing:
+ *  - p->sched_class
+ *  - p->cpus_ptr
+ *  - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
+ *  - ttwu_queue()       -- new rq, for enqueue of the task;
+ *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
+ *
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
  *
  * Return: %true if @p->state changes (an actual wakeup was done),
  *         %false otherwise.
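
For orientation, the common wakeup entry points reduce to thin wrappers around try_to_wake_up(); reproduced here from memory of mainline of this era, so details may differ slightly:

int wake_up_process(struct task_struct *p)
{
	return try_to_wake_up(p, TASK_NORMAL, 0);	/* TASK_(UN)INTERRUPTIBLE sleepers */
}

int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);		/* caller-chosen @state mask */
}
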
@@ -2595,7 +2730,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	/*
 	 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
 	 * == smp_processor_id()'. Together this means we can special
-	 * case the whole 'p->on_rq && ttwu_remote()' case below
+	 * case the whole 'p->on_rq && ttwu_runnable()' case below
 	 * without taking any locks.
 	 *
 	 * In particular:
@@ -2616,8 +2751,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	/*
 	 * If we are going to wake up a thread waiting for CONDITION we
 	 * need to ensure that CONDITION=1 done by the caller can not be
-	 * reordered with p->state check below. This pairs with mb() in
-	 * set_current_state() the waiting thread does.
+	 * reordered with p->state check below. This pairs with smp_store_mb()
+	 * in set_current_state() that the waiting thread does.
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	smp_mb__after_spinlock();
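
The barrier pairing this comment documents, written out as the usual two-sided sketch; CONDITION is a placeholder, and set_current_state() is shown as its non-debug expansion of this era:

/*
 *   Waiter                                     Waker (try_to_wake_up())
 *
 *   set_current_state(TASK_UNINTERRUPTIBLE);   WRITE_ONCE(CONDITION, 1);
 *     // smp_store_mb(): STORE + full mb       raw_spin_lock_irqsave(&p->pi_lock, flags);
 *   if (READ_ONCE(CONDITION))                  smp_mb__after_spinlock();   // full mb
 *       break;                                 if (!(READ_ONCE(p->state) & state))
 *   schedule();                                    goto unlock;
 *
 * The two full barriers guarantee that either the waiter observes
 * CONDITION == 1 or the waker observes the waiter's p->state update,
 * so the wakeup cannot be lost.
 */
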
@@ -2652,7 +2787,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
 	 */
 	smp_rmb();
-	if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
+	if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
 		goto unlock;
 
 	if (p->in_iowait) {
@@ -3222,17 +3357,20 @@ static inline void prepare_task(struct task_struct *next)
 	/*
 	 * Claim the task as running, we do this before switching to it
 	 * such that any running task will have this set.
+	 *
+	 * See the ttwu() WF_ON_CPU case and its ordering comment.
 	 */
-	next->on_cpu = 1;
+	WRITE_ONCE(next->on_cpu, 1);
 #endif
 }
 
 static inline void finish_task(struct task_struct *prev)
 {
 #ifdef CONFIG_SMP
 	/*
-	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
-	 * We must ensure this doesn't happen until the switch is completely
+	 * This must be the very last reference to @prev from this CPU. After
+	 * p->on_cpu is cleared, the task can be moved to a different CPU. We
+	 * must ensure this doesn't happen until the switch is completely
 	 * finished.
 	 *
 	 * In particular, the load of prev->state in finish_task_switch() must