Skip to content

Commit 52b1364

Browse files
Chengming Zhou authored and Peter Zijlstra committed
sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure
PSI already tracks workload pressure stall information for CPU, memory and IO. Apart from these, IRQ/SOFTIRQ time can have a noticeable impact on the productivity of some workloads, such as web service workloads. When CONFIG_IRQ_TIME_ACCOUNTING is enabled, we can get the IRQ/SOFTIRQ delta time from update_rq_clock_task(), where we record that delta to the cgroups of the CPU's current task as PSI_IRQ_FULL state. Note that we don't use PSI_IRQ_SOME: IRQ/SOFTIRQ always happens in the context of the current task on the CPU, so nothing productive can run during that time even if it were runnable; hence we only use PSI_IRQ_FULL. Signed-off-by: Chengming Zhou <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Acked-by: Johannes Weiner <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent 71dbdde commit 52b1364

File tree

6 files changed

+116
-4
lines changed

6 files changed

+116
-4
lines changed

Documentation/admin-guide/cgroup-v2.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -976,6 +976,12 @@ All cgroup core files are prefixed with "cgroup."
976976
killing cgroups is a process directed operation, i.e. it affects
977977
the whole thread-group.
978978

979+
irq.pressure
980+
A read-write nested-keyed file.
981+
982+
Shows pressure stall information for IRQ/SOFTIRQ. See
983+
:ref:`Documentation/accounting/psi.rst <psi>` for details.
984+
979985
Controllers
980986
===========
981987

include/linux/psi_types.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,10 @@ enum psi_res {
4242
PSI_IO,
4343
PSI_MEM,
4444
PSI_CPU,
45-
NR_PSI_RESOURCES = 3,
45+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
46+
PSI_IRQ,
47+
#endif
48+
NR_PSI_RESOURCES,
4649
};
4750

4851
/*
@@ -58,9 +61,12 @@ enum psi_states {
5861
PSI_MEM_FULL,
5962
PSI_CPU_SOME,
6063
PSI_CPU_FULL,
64+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
65+
PSI_IRQ_FULL,
66+
#endif
6167
/* Only per-CPU, to weigh the CPU in the global average: */
6268
PSI_NONIDLE,
63-
NR_PSI_STATES = 7,
69+
NR_PSI_STATES,
6470
};
6571

6672
/* Use one bit in the state mask to track TSK_ONCPU */

kernel/cgroup/cgroup.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3763,6 +3763,23 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
37633763
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
37643764
}
37653765

3766+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3767+
/*
 * seq_show handler wired to the cgroup "irq.pressure" file: prints the
 * PSI_IRQ pressure of the cgroup that @seq belongs to through psi_show()
 * and returns psi_show()'s result. @v is not used here.
 */
static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
3768+
{
3769+
struct cgroup *cgrp = seq_css(seq)->cgroup;
3770+
/*
 * The cgroup whose inode number is 1 reports the system-wide psi_system
 * group instead of its own cgrp->psi.
 * NOTE(review): presumably inode 1 is the root cgroup - confirm.
 */
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3771+
3772+
return psi_show(seq, psi, PSI_IRQ);
3773+
}
3774+
3775+
/*
 * write handler wired to the cgroup "irq.pressure" file: forwards the
 * written buffer to cgroup_pressure_write() with the resource fixed to
 * PSI_IRQ and returns its result. @off is not used here.
 */
static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
3776+
char *buf, size_t nbytes,
3777+
loff_t off)
3778+
{
3779+
return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
3780+
}
3781+
#endif
3782+
37663783
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
37673784
poll_table *pt)
37683785
{
@@ -5179,6 +5196,16 @@ static struct cftype cgroup_base_files[] = {
51795196
.poll = cgroup_pressure_poll,
51805197
.release = cgroup_pressure_release,
51815198
},
5199+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
5200+
{
5201+
.name = "irq.pressure",
5202+
.flags = CFTYPE_PRESSURE,
5203+
.seq_show = cgroup_irq_pressure_show,
5204+
.write = cgroup_irq_pressure_write,
5205+
.poll = cgroup_pressure_poll,
5206+
.release = cgroup_pressure_release,
5207+
},
5208+
#endif
51825209
#endif /* CONFIG_PSI */
51835210
{ } /* terminate */
51845211
};

kernel/sched/core.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,6 +708,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
708708

709709
rq->prev_irq_time += irq_delta;
710710
delta -= irq_delta;
711+
psi_account_irqtime(rq->curr, irq_delta);
711712
#endif
712713
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
713714
if (static_key_false((&paravirt_steal_rq_enabled))) {

kernel/sched/psi.c

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -904,6 +904,36 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
904904
}
905905
}
906906

907+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
908+
/**
 * psi_account_irqtime - charge IRQ/SOFTIRQ time to a task's PSI groups
 * @task: task whose PSI groups are charged; update_rq_clock_task() passes
 *        rq->curr
 * @delta: amount of IRQ/SOFTIRQ time to add; update_rq_clock_task() passes
 *         its irq_delta (NOTE(review): presumably nanoseconds - confirm)
 *
 * Adds @delta to the PSI_IRQ_FULL time of every group returned by
 * iterate_groups() for @task, using the per-CPU group data of the CPU
 * reported by task_cpu(@task). Only built with CONFIG_IRQ_TIME_ACCOUNTING.
 */
void psi_account_irqtime(struct task_struct *task, u32 delta)
909+
{
910+
int cpu = task_cpu(task);
911+
void *iter = NULL;
912+
struct psi_group *group;
913+
struct psi_group_cpu *groupc;
914+
u64 now;
915+
916+
/*
 * Nothing is charged for a task with pid 0.
 * NOTE(review): presumably this is the idle task - confirm.
 */
if (!task->pid)
917+
return;
918+
919+
/* One timestamp is taken up front and reused for every group below. */
now = cpu_clock(cpu);
920+
921+
while ((group = iterate_groups(task, &iter))) {
922+
groupc = per_cpu_ptr(group->pcpu, cpu);
923+
924+
/* The times[] update is done inside a groupc->seq write section. */
write_seqcount_begin(&groupc->seq);
925+
926+
/*
 * NOTE(review): record_times() is defined outside this view; it is
 * presumably called first so the other state times are brought up to
 * @now before the IRQ delta is added - confirm.
 */
record_times(groupc, now);
927+
groupc->times[PSI_IRQ_FULL] += delta;
928+
929+
write_seqcount_end(&groupc->seq);
930+
931+
/*
 * Schedule poll work only when the PSI_IRQ_FULL bit is set in the
 * group's poll_states mask.
 */
if (group->poll_states & (1 << PSI_IRQ_FULL))
932+
psi_schedule_poll_work(group, 1);
933+
}
934+
}
935+
#endif
936+
907937
/**
908938
* psi_memstall_enter - mark the beginning of a memory stall section
909939
* @flags: flags to handle nested sections
@@ -1065,6 +1095,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
10651095

10661096
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
10671097
{
1098+
bool only_full = false;
10681099
int full;
10691100
u64 now;
10701101

@@ -1079,7 +1110,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
10791110
group->avg_next_update = update_averages(group, now);
10801111
mutex_unlock(&group->avgs_lock);
10811112

1082-
for (full = 0; full < 2; full++) {
1113+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1114+
only_full = res == PSI_IRQ;
1115+
#endif
1116+
1117+
for (full = 0; full < 2 - only_full; full++) {
10831118
unsigned long avg[3] = { 0, };
10841119
u64 total = 0;
10851120
int w;
@@ -1093,7 +1128,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
10931128
}
10941129

10951130
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
1096-
full ? "full" : "some",
1131+
full || only_full ? "full" : "some",
10971132
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
10981133
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
10991134
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1121,6 +1156,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
11211156
else
11221157
return ERR_PTR(-EINVAL);
11231158

1159+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1160+
if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
1161+
return ERR_PTR(-EINVAL);
1162+
#endif
1163+
11241164
if (state >= PSI_NONIDLE)
11251165
return ERR_PTR(-EINVAL);
11261166

@@ -1405,13 +1445,43 @@ static const struct proc_ops psi_cpu_proc_ops = {
14051445
.proc_release = psi_fop_release,
14061446
};
14071447

1448+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1449+
/*
 * seq_file show callback for the procfs IRQ pressure file: prints the
 * PSI_IRQ pressure of the system-wide psi_system group via psi_show().
 * @v is not used here.
 */
static int psi_irq_show(struct seq_file *m, void *v)
1450+
{
1451+
return psi_show(m, &psi_system, PSI_IRQ);
1452+
}
1453+
1454+
/*
 * proc_open callback for the procfs IRQ pressure file: opens @file through
 * psi_open() with psi_irq_show as the show routine. @inode is not used here.
 */
static int psi_irq_open(struct inode *inode, struct file *file)
1455+
{
1456+
return psi_open(file, psi_irq_show);
1457+
}
1458+
1459+
/*
 * proc_write callback for the procfs IRQ pressure file: forwards the user
 * buffer to psi_write() with the resource fixed to PSI_IRQ and returns its
 * result. @ppos is not used here.
 */
static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
1460+
size_t nbytes, loff_t *ppos)
1461+
{
1462+
return psi_write(file, user_buf, nbytes, PSI_IRQ);
1463+
}
1464+
1465+
/*
 * procfs operations for the IRQ pressure file that psi_proc_init() creates
 * as "pressure/irq". Open and write use the IRQ-specific wrappers above;
 * read and lseek use the generic seq_file helpers, and poll/release use the
 * psi_fop_* helpers.
 */
static const struct proc_ops psi_irq_proc_ops = {
1466+
.proc_open = psi_irq_open,
1467+
.proc_read = seq_read,
1468+
.proc_lseek = seq_lseek,
1469+
.proc_write = psi_irq_write,
1470+
.proc_poll = psi_fop_poll,
1471+
.proc_release = psi_fop_release,
1472+
};
1473+
#endif
1474+
14081475
/*
 * Create the procfs "pressure" directory and its per-resource files when
 * psi_enable is set. Always returns 0; the results of proc_mkdir() and
 * proc_create() are not checked here.
 */
static int __init psi_proc_init(void)
14091476
{
14101477
if (psi_enable) {
14111478
proc_mkdir("pressure", NULL);
14121479
proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
14131480
proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
14141481
proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
1482+
/* The IRQ pressure file is only created with IRQ time accounting built in. */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1483+
proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
1484+
#endif
14151485
}
14161486
return 0;
14171487
}

kernel/sched/stats.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ __schedstats_from_se(struct sched_entity *se)
110110
void psi_task_change(struct task_struct *task, int clear, int set);
111111
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
112112
bool sleep);
113+
void psi_account_irqtime(struct task_struct *task, u32 delta);
113114

114115
/*
115116
* PSI tracks state that persists across sleeps, such as iowaits and
@@ -205,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
205206
static inline void psi_sched_switch(struct task_struct *prev,
206207
struct task_struct *next,
207208
bool sleep) {}
209+
static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
208210
#endif /* CONFIG_PSI */
209211

210212
#ifdef CONFIG_SCHED_INFO

0 commit comments

Comments
 (0)