Skip to content

Commit c650812

Browse files
hnaz authored and Ingo Molnar committed
sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug
Since sched_delayed tasks remain queued even after blocking, the load balancer can migrate them between runqueues while PSI considers them to be asleep. As a result, it misreads the migration requeue followed by a wakeup as a double queue: psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4 First, call psi_enqueue() after p->sched_class->enqueue_task(). A wakeup will clear p->se.sched_delayed while a migration will not, so psi can use that flag to tell them apart. Then teach psi to migrate any "sleep" state when delayed-dequeue tasks are being migrated. Delayed-dequeue tasks can be revived by ttwu_runnable(), which will call down with a new ENQUEUE_DELAYED. Instead of further complicating the wakeup conditional in enqueue_task(), identify migration contexts instead and default to wakeup handling for all other cases. It's not just the warning in dmesg, the task state corruption causes a permanent CPU pressure indication, which messes with workload/machine health monitoring. Debugged-by-and-original-fix-by: K Prateek Nayak <[email protected]> Fixes: 152e11f ("sched/fair: Implement delayed dequeue") Closes: https://lore.kernel.org/lkml/[email protected]/ Closes: https://lore.kernel.org/all/[email protected]/ Signed-off-by: Johannes Weiner <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Signed-off-by: Ingo Molnar <[email protected]> Tested-by: K Prateek Nayak <[email protected]> Link: https://lkml.kernel.org/r/[email protected]
1 parent f5aaff7 commit c650812

File tree

2 files changed

+39
-21
lines changed

2 files changed

+39
-21
lines changed

kernel/sched/core.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2012,18 +2012,18 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
20122012
if (!(flags & ENQUEUE_NOCLOCK))
20132013
update_rq_clock(rq);
20142014

2015-
if (!(flags & ENQUEUE_RESTORE)) {
2016-
sched_info_enqueue(rq, p);
2017-
psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
2018-
}
2019-
20202015
p->sched_class->enqueue_task(rq, p, flags);
20212016
/*
20222017
* Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
20232018
* ->sched_delayed.
20242019
*/
20252020
uclamp_rq_inc(rq, p);
20262021

2022+
if (!(flags & ENQUEUE_RESTORE)) {
2023+
sched_info_enqueue(rq, p);
2024+
psi_enqueue(p, flags & ENQUEUE_MIGRATED);
2025+
}
2026+
20272027
if (sched_core_enabled(rq))
20282028
sched_core_enqueue(rq, p);
20292029
}
@@ -2041,7 +2041,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
20412041

20422042
if (!(flags & DEQUEUE_SAVE)) {
20432043
sched_info_dequeue(rq, p);
2044-
psi_dequeue(p, flags & DEQUEUE_SLEEP);
2044+
psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
20452045
}
20462046

20472047
/*

kernel/sched/stats.h

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -119,45 +119,63 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
119119
/*
120120
* PSI tracks state that persists across sleeps, such as iowaits and
121121
* memory stalls. As a result, it has to distinguish between sleeps,
122-
* where a task's runnable state changes, and requeues, where a task
123-
* and its state are being moved between CPUs and runqueues.
122+
* where a task's runnable state changes, and migrations, where a task
123+
* and its runnable state are being moved between CPUs and runqueues.
124+
*
125+
* A notable case is a task whose dequeue is delayed. PSI considers
126+
* those sleeping, but because they are still on the runqueue they can
127+
* go through migration requeues. In this case, *sleeping* states need
128+
* to be transferred.
124129
*/
125-
static inline void psi_enqueue(struct task_struct *p, bool wakeup)
130+
static inline void psi_enqueue(struct task_struct *p, bool migrate)
126131
{
127-
int clear = 0, set = TSK_RUNNING;
132+
int clear = 0, set = 0;
128133

129134
if (static_branch_likely(&psi_disabled))
130135
return;
131136

132-
if (p->in_memstall)
133-
set |= TSK_MEMSTALL_RUNNING;
134-
135-
if (!wakeup) {
137+
if (p->se.sched_delayed) {
138+
/* CPU migration of "sleeping" task */
139+
SCHED_WARN_ON(!migrate);
136140
if (p->in_memstall)
137141
set |= TSK_MEMSTALL;
142+
if (p->in_iowait)
143+
set |= TSK_IOWAIT;
144+
} else if (migrate) {
145+
/* CPU migration of runnable task */
146+
set = TSK_RUNNING;
147+
if (p->in_memstall)
148+
set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
138149
} else {
150+
/* Wakeup of new or sleeping task */
139151
if (p->in_iowait)
140152
clear |= TSK_IOWAIT;
153+
set = TSK_RUNNING;
154+
if (p->in_memstall)
155+
set |= TSK_MEMSTALL_RUNNING;
141156
}
142157

143158
psi_task_change(p, clear, set);
144159
}
145160

146-
static inline void psi_dequeue(struct task_struct *p, bool sleep)
161+
static inline void psi_dequeue(struct task_struct *p, bool migrate)
147162
{
148163
if (static_branch_likely(&psi_disabled))
149164
return;
150165

166+
/*
167+
* When migrating a task to another CPU, clear all psi
168+
* state. The enqueue callback above will work it out.
169+
*/
170+
if (migrate)
171+
psi_task_change(p, p->psi_flags, 0);
172+
151173
/*
152174
* A voluntary sleep is a dequeue followed by a task switch. To
153175
* avoid walking all ancestors twice, psi_task_switch() handles
154176
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
155177
* Do nothing here.
156178
*/
157-
if (sleep)
158-
return;
159-
160-
psi_task_change(p, p->psi_flags, 0);
161179
}
162180

163181
static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -190,8 +208,8 @@ static inline void psi_sched_switch(struct task_struct *prev,
190208
}
191209

192210
#else /* CONFIG_PSI */
193-
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
194-
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
211+
static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
212+
static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
195213
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
196214
static inline void psi_sched_switch(struct task_struct *prev,
197215
struct task_struct *next,

0 commit comments

Comments
 (0)