Commit b05e75d

hnaz authored and Peter Zijlstra committed
psi: Fix cpu.pressure for cpu.max and competing cgroups
For simplicity, cpu pressure is defined as having more than one runnable
task on a given CPU. This works at the system level, but it has
limitations in a cgrouped reality: when cpu.max is in use, it doesn't
capture the time in which a task is not executing on the CPU due to
throttling. Likewise, it doesn't capture the time in which a competing
cgroup is occupying the CPU - meaning it only reflects cgroup-internal
competitive pressure, not outside pressure.

Enable tracking of currently executing tasks, and then change the
definition of cpu pressure in a cgroup from NR_RUNNING > 1 to
NR_RUNNING > NR_ONCPU, which captures the effects of cpu.max as well as
competition from outside the cgroup.

After this patch, a cgroup running `stress -c 1` with a cpu.max setting
of 5000 10000 shows ~50% continuous CPU pressure.

Signed-off-by: Johannes Weiner <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
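A minimal way to observe the effect described above, assuming a cgroup2
hierarchy mounted at /sys/fs/cgroup and a test group named "test" (both
assumptions, not part of this commit): write "5000 10000" into the
group's cpu.max, run `stress -c 1` inside it, and poll the group's
cpu.pressure file. A small userspace sketch for the polling side:

	/*
	 * Sketch only: dump a cgroup's cpu.pressure. The path is an
	 * assumption for a cgroup2 mount at /sys/fs/cgroup.
	 */
	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/sys/fs/cgroup/test/cpu.pressure", "r");

		if (!f) {
			perror("cpu.pressure");
			return 1;
		}
		/* e.g. "some avg10=49.53 avg60=... avg300=... total=..." */
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}

With the patch applied, the "some" avg10 figure should hover around 50
for the 5000/10000 cpu.max setting described above.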

4 files changed, 46 insertions(+), 6 deletions(-)

include/linux/psi_types.h: 9 additions, 1 deletion

@@ -14,13 +14,21 @@ enum psi_task_count {
 	NR_IOWAIT,
 	NR_MEMSTALL,
 	NR_RUNNING,
-	NR_PSI_TASK_COUNTS = 3,
+	/*
+	 * This can't have values other than 0 or 1 and could be
+	 * implemented as a bit flag. But for now we still have room
+	 * in the first cacheline of psi_group_cpu, and this way we
+	 * don't have to special case any state tracking for it.
+	 */
+	NR_ONCPU,
+	NR_PSI_TASK_COUNTS = 4,
 };
 
 /* Task state bitmasks */
 #define TSK_IOWAIT	(1 << NR_IOWAIT)
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
+#define TSK_ONCPU	(1 << NR_ONCPU)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
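The TSK_* bits index the tasks[] counters one-to-one. A standalone
sketch of how a state change fans out into the per-CPU counters,
mirroring the increment loop in psi_group_change() (the enum mirrors
the header above; counts and flags are illustrative):

	#include <stdio.h>

	enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_ONCPU,
	       NR_PSI_TASK_COUNTS };
	#define TSK_RUNNING	(1 << NR_RUNNING)
	#define TSK_ONCPU	(1 << NR_ONCPU)

	int main(void)
	{
		unsigned int tasks[NR_PSI_TASK_COUNTS] = { 0 };
		int set = TSK_RUNNING | TSK_ONCPU; /* task starts executing */
		int t;

		/* each set bit t bumps the matching counter tasks[t] */
		for (t = 0; set; set &= ~(1 << t), t++)
			if (set & (1 << t))
				tasks[t]++;

		/* one task executing alone: RUNNING == ONCPU, no pressure */
		printf("some = %d\n", tasks[NR_RUNNING] > tasks[NR_ONCPU]);
		return 0;
	}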

kernel/sched/core.c: 2 additions, 0 deletions

@@ -4091,6 +4091,8 @@ static void __sched notrace __schedule(bool preempt)
 	 */
 	++*switch_count;
 
+	psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+
 	trace_sched_switch(preempt, prev, next);
 
 	/* Also unlocks the rq: */
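A task that blocks is dequeued from the runqueue earlier in
__schedule(), before the context switch, so !task_on_rq_queued(prev) is
how the switch path tells a voluntary sleep from a preemption (a
schematic reading of the two paths, per the comments in the stats.h
hunk below):

	/*
	 * prev blocked:    dequeued before the switch -> sleep == true,
	 *                  TSK_ONCPU already cleared in psi_dequeue()
	 * prev preempted:  still queued               -> sleep == false,
	 *                  psi_sched_switch() must clear TSK_ONCPU itself
	 */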

kernel/sched/psi.c: 7 additions, 5 deletions

@@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 	case PSI_MEM_FULL:
 		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
 	case PSI_CPU_SOME:
-		return tasks[NR_RUNNING] > 1;
+		return tasks[NR_RUNNING] > tasks[NR_ONCPU];
 	case PSI_NONIDLE:
 		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
 			tasks[NR_RUNNING];
@@ -695,10 +695,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
 		if (!(m & (1 << t)))
 			continue;
 		if (groupc->tasks[t] == 0 && !psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
 					cpu, t, groupc->tasks[0],
 					groupc->tasks[1], groupc->tasks[2],
-					clear, set);
+					groupc->tasks[3], clear, set);
 			psi_bug = 1;
 		}
 		groupc->tasks[t]--;
@@ -916,9 +916,11 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
 	rq = task_rq_lock(task, &rf);
 
-	if (task_on_rq_queued(task))
+	if (task_on_rq_queued(task)) {
 		task_flags = TSK_RUNNING;
-	else if (task->in_iowait)
+		if (task_current(rq, task))
+			task_flags |= TSK_ONCPU;
+	} else if (task->in_iowait)
 		task_flags = TSK_IOWAIT;
 
 	if (task->flags & PF_MEMSTALL)
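Working the new PSI_CPU_SOME condition through the cases from the
commit message, with hypothetical per-CPU counts for one cgroup:

	/*
	 * throttled by cpu.max, or preempted by another cgroup:
	 *   tasks[NR_RUNNING] = 1, tasks[NR_ONCPU] = 0
	 *   old: 1 > 1 -> no pressure    new: 1 > 0 -> pressure
	 *
	 * competition inside the cgroup:
	 *   tasks[NR_RUNNING] = 2, tasks[NR_ONCPU] = 1
	 *   old: 2 > 1 -> pressure       new: 2 > 1 -> pressure
	 */

The old and new definitions agree on cgroup-internal competition; only
the throttled and externally-preempted cases change, which is exactly
what the commit message calls out.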

kernel/sched/stats.h: 28 additions, 0 deletions

@@ -93,6 +93,14 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 		if (p->flags & PF_MEMSTALL)
 			clear |= TSK_MEMSTALL;
 	} else {
+		/*
+		 * When a task sleeps, schedule() dequeues it before
+		 * switching to the next one. Merge the clearing of
+		 * TSK_RUNNING and TSK_ONCPU to save an unnecessary
+		 * psi_task_change() call in psi_sched_switch().
+		 */
+		clear |= TSK_ONCPU;
+
 		if (p->in_iowait)
 			set |= TSK_IOWAIT;
 	}
@@ -126,6 +134,23 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
 	}
 }
 
+static inline void psi_sched_switch(struct task_struct *prev,
+				    struct task_struct *next,
+				    bool sleep)
+{
+	if (static_branch_likely(&psi_disabled))
+		return;
+
+	/*
+	 * Clear the TSK_ONCPU state if the task was preempted. If
+	 * it's a voluntary sleep, dequeue will have taken care of it.
+	 */
+	if (!sleep)
+		psi_task_change(prev, TSK_ONCPU, 0);
+
+	psi_task_change(next, 0, TSK_ONCPU);
+}
+
 static inline void psi_task_tick(struct rq *rq)
 {
 	if (static_branch_likely(&psi_disabled))
@@ -138,6 +163,9 @@ static inline void psi_task_tick(struct rq *rq)
 static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
 static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
 static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_sched_switch(struct task_struct *prev,
+				    struct task_struct *next,
+				    bool sleep) {}
 static inline void psi_task_tick(struct rq *rq) {}
 #endif /* CONFIG_PSI */
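Putting the pieces together, the TSK_ONCPU bookkeeping across a context
switch from task A to task B looks like this (a schematic trace, not
actual kernel output):

	/*
	 * A sleeps voluntarily:
	 *   psi_dequeue(A, sleep=true)    clears TSK_RUNNING|TSK_ONCPU
	 *   psi_sched_switch(A, B, true)  sets   TSK_ONCPU on B only
	 *
	 * A is preempted:
	 *   psi_sched_switch(A, B, false) clears TSK_ONCPU on A,
	 *                                 sets   TSK_ONCPU on B
	 */

Merging the TSK_ONCPU clear into psi_dequeue() on the sleep path saves
one psi_task_change() call per voluntary context switch, as the comment
in psi_dequeue() notes.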
