Skip to content

Commit dbfb089

Browse files
author
Peter Zijlstra
committed
sched: Fix loadavg accounting race
The recent commit: c6e7bd7 ("sched/core: Optimize ttwu() spinning on p->on_cpu") moved these lines in ttwu(): p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; up before: smp_cond_load_acquire(&p->on_cpu, !VAL); into the 'p->on_rq == 0' block, with the thinking that once we hit schedule() the current task cannot change its ->state anymore. And while this is true, it is both incorrect and flawed. It is incorrect in that we need at least an ACQUIRE on 'p->on_rq == 0' to avoid weak hardware from re-ordering things for us. This can fairly easily be achieved by relying on the control-dependency already in place. The second problem, which makes the flaw in the original argument, is that while schedule() will not change prev->state, it will read it a number of times (arguably too many times since it's marked volatile). The previous condition 'p->on_cpu == 0' was sufficient because that indicates schedule() has completed, and will no longer read prev->state. So now the trick is to make the same hold true for the (much) earlier 'prev->on_rq == 0' case. Furthermore, in order to make the ordering stick, the 'prev->on_rq = 0' assignment needs to be a RELEASE, but adding additional ordering to schedule() is an unwelcome proposition at the best of times, doubly so for mere accounting. Luckily we can push the prev->state load up before rq->lock, with the only caveat that we then have to re-read the state after. However, we know that if it changed, we no longer have to worry about the blocking path. This gives us the required ordering: if we block, we did the prev->state load before an (effective) smp_mb() and the p->on_rq store need not change. With this we end up with the effective ordering: LOAD p->state LOAD-ACQUIRE p->on_rq == 0 MB STORE p->on_rq, 0 STORE p->state, TASK_WAKING which ensures the TASK_WAKING store happens after the prev->state load, and all is well again. 
Fixes: c6e7bd7 ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Dave Jones <[email protected]> Reported-by: Paul Gortmaker <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Tested-by: Dave Jones <[email protected]> Tested-by: Paul Gortmaker <[email protected]> Link: https://lkml.kernel.org/r/[email protected]
1 parent dcb7fd8 commit dbfb089

File tree

2 files changed

+51
-20
lines changed

2 files changed

+51
-20
lines changed

include/linux/sched.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,10 +114,6 @@ struct task_group;
114114

115115
#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
116116

117-
#define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
118-
(task->flags & PF_FROZEN) == 0 && \
119-
(task->state & TASK_NOLOAD) == 0)
120-
121117
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
122118

123119
/*

kernel/sched/core.c

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1311,9 +1311,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
13111311

13121312
void activate_task(struct rq *rq, struct task_struct *p, int flags)
13131313
{
1314-
if (task_contributes_to_load(p))
1315-
rq->nr_uninterruptible--;
1316-
13171314
enqueue_task(rq, p, flags);
13181315

13191316
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -1323,9 +1320,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
13231320
{
13241321
p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
13251322

1326-
if (task_contributes_to_load(p))
1327-
rq->nr_uninterruptible++;
1328-
13291323
dequeue_task(rq, p, flags);
13301324
}
13311325

@@ -2236,10 +2230,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
22362230

22372231
lockdep_assert_held(&rq->lock);
22382232

2239-
#ifdef CONFIG_SMP
22402233
if (p->sched_contributes_to_load)
22412234
rq->nr_uninterruptible--;
22422235

2236+
#ifdef CONFIG_SMP
22432237
if (wake_flags & WF_MIGRATED)
22442238
en_flags |= ENQUEUE_MIGRATED;
22452239
#endif
@@ -2583,7 +2577,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
25832577
* A similar smb_rmb() lives in try_invoke_on_locked_down_task().
25842578
*/
25852579
smp_rmb();
2586-
if (p->on_rq && ttwu_remote(p, wake_flags))
2580+
if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
25872581
goto unlock;
25882582

25892583
if (p->in_iowait) {
@@ -2592,9 +2586,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
25922586
}
25932587

25942588
#ifdef CONFIG_SMP
2595-
p->sched_contributes_to_load = !!task_contributes_to_load(p);
2596-
p->state = TASK_WAKING;
2597-
25982589
/*
25992590
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
26002591
* possible to, falsely, observe p->on_cpu == 0.
@@ -2613,8 +2604,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
26132604
*
26142605
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
26152606
* __schedule(). See the comment for smp_mb__after_spinlock().
2607+
*
2608+
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
2609+
* schedule()'s deactivate_task() has 'happened' and p will no longer
2610+
* care about it's own p->state. See the comment in __schedule().
26162611
*/
2617-
smp_rmb();
2612+
smp_acquire__after_ctrl_dep();
2613+
2614+
/*
2615+
* We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
2616+
* == 0), which means we need to do an enqueue, change p->state to
2617+
* TASK_WAKING such that we can unlock p->pi_lock before doing the
2618+
* enqueue, such as ttwu_queue_wakelist().
2619+
*/
2620+
p->state = TASK_WAKING;
26182621

26192622
/*
26202623
* If the owning (remote) CPU is still in the middle of schedule() with
@@ -4097,6 +4100,7 @@ static void __sched notrace __schedule(bool preempt)
40974100
{
40984101
struct task_struct *prev, *next;
40994102
unsigned long *switch_count;
4103+
unsigned long prev_state;
41004104
struct rq_flags rf;
41014105
struct rq *rq;
41024106
int cpu;
@@ -4113,12 +4117,22 @@ static void __sched notrace __schedule(bool preempt)
41134117
local_irq_disable();
41144118
rcu_note_context_switch(preempt);
41154119

4120+
/* See deactivate_task() below. */
4121+
prev_state = prev->state;
4122+
41164123
/*
41174124
* Make sure that signal_pending_state()->signal_pending() below
41184125
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4119-
* done by the caller to avoid the race with signal_wake_up().
4126+
* done by the caller to avoid the race with signal_wake_up():
4127+
*
4128+
* __set_current_state(@state) signal_wake_up()
4129+
* schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
4130+
* wake_up_state(p, state)
4131+
* LOCK rq->lock LOCK p->pi_state
4132+
* smp_mb__after_spinlock() smp_mb__after_spinlock()
4133+
* if (signal_pending_state()) if (p->state & @state)
41204134
*
4121-
* The membarrier system call requires a full memory barrier
4135+
* Also, the membarrier system call requires a full memory barrier
41224136
* after coming from user-space, before storing to rq->curr.
41234137
*/
41244138
rq_lock(rq, &rf);
@@ -4129,10 +4143,31 @@ static void __sched notrace __schedule(bool preempt)
41294143
update_rq_clock(rq);
41304144

41314145
switch_count = &prev->nivcsw;
4132-
if (!preempt && prev->state) {
4133-
if (signal_pending_state(prev->state, prev)) {
4146+
/*
4147+
* We must re-load prev->state in case ttwu_remote() changed it
4148+
* before we acquired rq->lock.
4149+
*/
4150+
if (!preempt && prev_state && prev_state == prev->state) {
4151+
if (signal_pending_state(prev_state, prev)) {
41344152
prev->state = TASK_RUNNING;
41354153
} else {
4154+
prev->sched_contributes_to_load =
4155+
(prev_state & TASK_UNINTERRUPTIBLE) &&
4156+
!(prev_state & TASK_NOLOAD) &&
4157+
!(prev->flags & PF_FROZEN);
4158+
4159+
if (prev->sched_contributes_to_load)
4160+
rq->nr_uninterruptible++;
4161+
4162+
/*
4163+
* __schedule() ttwu()
4164+
* prev_state = prev->state; if (READ_ONCE(p->on_rq) && ...)
4165+
* LOCK rq->lock goto out;
4166+
* smp_mb__after_spinlock(); smp_acquire__after_ctrl_dep();
4167+
* p->on_rq = 0; p->state = TASK_WAKING;
4168+
*
4169+
* After this, schedule() must not care about p->state any more.
4170+
*/
41364171
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
41374172

41384173
if (prev->in_iowait) {

0 commit comments

Comments
 (0)