Skip to content

Commit 34f26a1

Browse files
Chengming Zhou authored and Peter Zijlstra committed
sched/psi: Per-cgroup PSI accounting disable/re-enable interface
PSI accounts stalls for each cgroup separately and aggregates them at each level of the hierarchy. This may cause non-negligible overhead for some workloads that run deep in the hierarchy. Commit 3958e2d ("cgroup: make per-cgroup pressure stall tracking configurable") makes PSI skip per-cgroup stall accounting and account only system-wide, which avoids this per-level overhead. But for our use case, we also want PSI stats accounted for leaf cgroups, so that userspace can make adjustments on those cgroups in addition to system-wide adjustments. So this patch introduces a per-cgroup PSI accounting disable/re-enable interface, "cgroup.pressure", which is a read-write single value file whose allowed values are "0" and "1". The default is "1", so per-cgroup PSI stats are enabled by default. Implementation details: It should be relatively straightforward to disable and re-enable state aggregation, time tracking and averaging on a per-cgroup level, if we can live with losing history from while it was disabled. I.e. the avgs will restart from 0, and total= will have gaps. But it is hard and complex to stop/restart groupc->tasks[] updates, so that is not implemented in this patch. We therefore always update groupc->tasks[] and the PSI_ONCPU bit in psi_group_change(), even when the cgroup's PSI stats are disabled. Suggested-by: Johannes Weiner <[email protected]> Suggested-by: Tejun Heo <[email protected]> Signed-off-by: Chengming Zhou <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Acked-by: Johannes Weiner <[email protected]> Link: https://lkml.kernel.org/r/[email protected]
1 parent dc86aba commit 34f26a1

File tree

6 files changed

+152
-13
lines changed

6 files changed

+152
-13
lines changed

Documentation/admin-guide/cgroup-v2.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -976,6 +976,23 @@ All cgroup core files are prefixed with "cgroup."
976976
killing cgroups is a process directed operation, i.e. it affects
977977
the whole thread-group.
978978

979+
cgroup.pressure
980+
A read-write single value file whose allowed values are "0" and "1".
981+
The default is "1".
982+
983+
Writing "0" to the file will disable the cgroup PSI accounting.
984+
Writing "1" to the file will re-enable the cgroup PSI accounting.
985+
986+
This control attribute is not hierarchical, so disabling or enabling PSI
987+
accounting in a cgroup does not affect PSI accounting in descendants
988+
and doesn't need to pass enablement via ancestors from root.
989+
990+
The reason this control attribute exists is that PSI accounts stalls for
991+
each cgroup separately and aggregates it at each level of the hierarchy.
992+
This may cause non-negligible overhead for some workloads that run at a
993+
deep level of the hierarchy, in which case this control attribute can
994+
be used to disable PSI accounting in the non-leaf cgroups.
995+
979996
irq.pressure
980997
A read-write nested-keyed file.
981998

include/linux/cgroup-defs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,9 @@ struct cgroup {
428428
struct cgroup_file procs_file; /* handle for "cgroup.procs" */
429429
struct cgroup_file events_file; /* handle for "cgroup.events" */
430430

431+
/* handles for "{cpu,memory,io,irq}.pressure" */
432+
struct cgroup_file psi_files[NR_PSI_RESOURCES];
433+
431434
/*
432435
* The bitmask of subsystems enabled on the child cgroups.
433436
* ->subtree_control is the one configured through

include/linux/psi.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
3939
int psi_cgroup_alloc(struct cgroup *cgrp);
4040
void psi_cgroup_free(struct cgroup *cgrp);
4141
void cgroup_move_task(struct task_struct *p, struct css_set *to);
42+
void psi_cgroup_restart(struct psi_group *group);
4243
#endif
4344

4445
#else /* CONFIG_PSI */
@@ -60,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
6061
{
6162
rcu_assign_pointer(p->cgroups, to);
6263
}
64+
static inline void psi_cgroup_restart(struct psi_group *group) {}
6365
#endif
6466

6567
#endif /* CONFIG_PSI */

include/linux/psi_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ struct psi_trigger {
152152

153153
struct psi_group {
154154
struct psi_group *parent;
155+
bool enabled;
155156

156157
/* Protects data used by the aggregator */
157158
struct mutex avgs_lock;
@@ -194,6 +195,8 @@ struct psi_group {
194195

195196
#else /* CONFIG_PSI */
196197

198+
#define NR_PSI_RESOURCES 0
199+
197200
struct psi_group { };
198201

199202
#endif /* CONFIG_PSI */

kernel/cgroup/cgroup.c

Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3708,8 +3708,8 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
37083708
return psi_show(seq, psi, PSI_CPU);
37093709
}
37103710

3711-
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
3712-
size_t nbytes, enum psi_res res)
3711+
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
3712+
size_t nbytes, enum psi_res res)
37133713
{
37143714
struct cgroup_file_ctx *ctx = of->priv;
37153715
struct psi_trigger *new;
@@ -3746,21 +3746,21 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
37463746
char *buf, size_t nbytes,
37473747
loff_t off)
37483748
{
3749-
return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
3749+
return pressure_write(of, buf, nbytes, PSI_IO);
37503750
}
37513751

37523752
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
37533753
char *buf, size_t nbytes,
37543754
loff_t off)
37553755
{
3756-
return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
3756+
return pressure_write(of, buf, nbytes, PSI_MEM);
37573757
}
37583758

37593759
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
37603760
char *buf, size_t nbytes,
37613761
loff_t off)
37623762
{
3763-
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
3763+
return pressure_write(of, buf, nbytes, PSI_CPU);
37643764
}
37653765

37663766
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3776,10 +3776,58 @@ static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
37763776
char *buf, size_t nbytes,
37773777
loff_t off)
37783778
{
3779-
return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
3779+
return pressure_write(of, buf, nbytes, PSI_IRQ);
37803780
}
37813781
#endif
37823782

3783+
/*
 * seq_file show handler for the "cgroup.pressure" file.
 *
 * Looks up the psi_group of the cgroup the file belongs to and prints its
 * boolean ->enabled flag as a decimal number ("0" or "1") followed by a
 * newline. Always returns 0.
 */
static int cgroup_pressure_show(struct seq_file *seq, void *v)
3784+
{
3785+
struct cgroup *cgrp = seq_css(seq)->cgroup;
3786+
struct psi_group *psi = cgroup_psi(cgrp);
3787+
3788+
seq_printf(seq, "%d\n", psi->enabled);
3789+
3790+
return 0;
3791+
}
3792+
3793+
/*
 * Write handler for the "cgroup.pressure" file.
 *
 * Parses @buf (after strstrip()) as an integer that must be 0 or 1. If the
 * value differs from the cgroup's current psi->enabled, the per-resource
 * pressure files in cgrp->psi_files[] are shown or hidden accordingly, the
 * new value is stored in psi->enabled and, when enabling, psi_cgroup_restart()
 * is called. Writing the value that is already set changes nothing.
 *
 * Returns @nbytes on success, the kstrtoint() error code if parsing fails,
 * -ERANGE if the value is not 0 or 1, or -ENOENT if cgroup_kn_lock_live()
 * returns NULL.
 */
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
3794+
char *buf, size_t nbytes,
3795+
loff_t off)
3796+
{
3797+
ssize_t ret;
3798+
int enable;
3799+
struct cgroup *cgrp;
3800+
struct psi_group *psi;
3801+
3802+
ret = kstrtoint(strstrip(buf), 0, &enable);
3803+
if (ret)
3804+
return ret;
3805+
3806+
if (enable < 0 || enable > 1)
3807+
return -ERANGE;
3808+
3809+
cgrp = cgroup_kn_lock_live(of->kn, false);
3810+
if (!cgrp)
3811+
return -ENOENT;
3812+
3813+
psi = cgroup_psi(cgrp);
3814+
if (psi->enabled != enable) {
3815+
int i;
3816+
3817+
/* show or hide {cpu,memory,io,irq}.pressure files */
3818+
for (i = 0; i < NR_PSI_RESOURCES; i++)
3819+
cgroup_file_show(&cgrp->psi_files[i], enable);
3820+
3821+
psi->enabled = enable;
3822+
if (enable)
3823+
psi_cgroup_restart(psi);
3824+
}
3825+
3826+
cgroup_kn_unlock(of->kn);
3827+
3828+
return nbytes;
3829+
}
3830+
37833831
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
37843832
poll_table *pt)
37853833
{
@@ -5175,6 +5223,7 @@ static struct cftype cgroup_base_files[] = {
51755223
{
51765224
.name = "io.pressure",
51775225
.flags = CFTYPE_PRESSURE,
5226+
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
51785227
.seq_show = cgroup_io_pressure_show,
51795228
.write = cgroup_io_pressure_write,
51805229
.poll = cgroup_pressure_poll,
@@ -5183,6 +5232,7 @@ static struct cftype cgroup_base_files[] = {
51835232
{
51845233
.name = "memory.pressure",
51855234
.flags = CFTYPE_PRESSURE,
5235+
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
51865236
.seq_show = cgroup_memory_pressure_show,
51875237
.write = cgroup_memory_pressure_write,
51885238
.poll = cgroup_pressure_poll,
@@ -5191,6 +5241,7 @@ static struct cftype cgroup_base_files[] = {
51915241
{
51925242
.name = "cpu.pressure",
51935243
.flags = CFTYPE_PRESSURE,
5244+
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
51945245
.seq_show = cgroup_cpu_pressure_show,
51955246
.write = cgroup_cpu_pressure_write,
51965247
.poll = cgroup_pressure_poll,
@@ -5200,12 +5251,19 @@ static struct cftype cgroup_base_files[] = {
52005251
{
52015252
.name = "irq.pressure",
52025253
.flags = CFTYPE_PRESSURE,
5254+
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
52035255
.seq_show = cgroup_irq_pressure_show,
52045256
.write = cgroup_irq_pressure_write,
52055257
.poll = cgroup_pressure_poll,
52065258
.release = cgroup_pressure_release,
52075259
},
52085260
#endif
5261+
{
5262+
.name = "cgroup.pressure",
5263+
.flags = CFTYPE_PRESSURE,
5264+
.seq_show = cgroup_pressure_show,
5265+
.write = cgroup_pressure_write,
5266+
},
52095267
#endif /* CONFIG_PSI */
52105268
{ } /* terminate */
52115269
};

kernel/sched/psi.c

Lines changed: 63 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
181181
{
182182
int cpu;
183183

184+
group->enabled = true;
184185
for_each_possible_cpu(cpu)
185186
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
186187
group->avg_last_update = sched_clock();
@@ -696,17 +697,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
696697
groupc = per_cpu_ptr(group->pcpu, cpu);
697698

698699
/*
699-
* First we assess the aggregate resource states this CPU's
700-
* tasks have been in since the last change, and account any
701-
* SOME and FULL time these may have resulted in.
702-
*
703-
* Then we update the task counts according to the state
700+
* First we update the task counts according to the state
704701
* change requested through the @clear and @set bits.
702+
*
703+
* Then if the cgroup PSI stats accounting enabled, we
704+
* assess the aggregate resource states this CPU's tasks
705+
* have been in since the last change, and account any
706+
* SOME and FULL time these may have resulted in.
705707
*/
706708
write_seqcount_begin(&groupc->seq);
707709

708-
record_times(groupc, now);
709-
710710
/*
711711
* Start with TSK_ONCPU, which doesn't have a corresponding
712712
* task count - it's just a boolean flag directly encoded in
@@ -745,6 +745,23 @@ static void psi_group_change(struct psi_group *group, int cpu,
745745
if (set & (1 << t))
746746
groupc->tasks[t]++;
747747

748+
if (!group->enabled) {
749+
/*
750+
* On the first group change after disabling PSI, conclude
751+
* the current state and flush its time. This is unlikely
752+
* to matter to the user, but aggregation (get_recent_times)
753+
* may have already incorporated the live state into times_prev;
754+
* avoid a delta sample underflow when PSI is later re-enabled.
755+
*/
756+
if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
757+
record_times(groupc, now);
758+
759+
groupc->state_mask = state_mask;
760+
761+
write_seqcount_end(&groupc->seq);
762+
return;
763+
}
764+
748765
for (s = 0; s < NR_PSI_STATES; s++) {
749766
if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
750767
state_mask |= (1 << s);
@@ -761,6 +778,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
761778
if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
762779
state_mask |= (1 << PSI_MEM_FULL);
763780

781+
record_times(groupc, now);
782+
764783
groupc->state_mask = state_mask;
765784

766785
write_seqcount_end(&groupc->seq);
@@ -907,6 +926,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
907926

908927
group = task_psi_group(task);
909928
do {
929+
if (!group->enabled)
930+
continue;
931+
910932
groupc = per_cpu_ptr(group->pcpu, cpu);
911933

912934
write_seqcount_begin(&groupc->seq);
@@ -1080,6 +1102,40 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
10801102

10811103
task_rq_unlock(rq, task, &rf);
10821104
}
1105+
1106+
/*
 * psi_cgroup_restart - refresh the per-CPU state of @group
 * @group: the psi_group to refresh
 *
 * Returns immediately if @group->enabled is false. Otherwise, for every
 * possible CPU, takes that CPU's runqueue lock and calls psi_group_change()
 * with empty clear and set masks and the CPU's current cpu_clock() value.
 */
void psi_cgroup_restart(struct psi_group *group)
1107+
{
1108+
int cpu;
1109+
1110+
/*
1111+
* After we disable psi_group->enabled, we don't actually
1112+
* stop percpu tasks accounting in each psi_group_cpu,
1113+
* instead only stop test_state() loop, record_times()
1114+
* and averaging worker, see psi_group_change() for details.
1115+
*
1116+
* When disable cgroup PSI, this function has nothing to sync
1117+
* since cgroup pressure files are hidden and percpu psi_group_cpu
1118+
* would see !psi_group->enabled and only do task accounting.
1119+
*
1120+
* When re-enable cgroup PSI, this function use psi_group_change()
1121+
* to get correct state mask from test_state() loop on tasks[],
1122+
* and restart groupc->state_start from now, use .clear = .set = 0
1123+
* here since no task status really changed.
1124+
*/
1125+
if (!group->enabled)
1126+
return;
1127+
1128+
for_each_possible_cpu(cpu) {
1129+
struct rq *rq = cpu_rq(cpu);
1130+
struct rq_flags rf;
1131+
u64 now;
1132+
1133+
rq_lock_irq(rq, &rf);
1134+
now = cpu_clock(cpu);
1135+
psi_group_change(group, cpu, 0, 0, now, true);
1136+
rq_unlock_irq(rq, &rf);
1137+
}
1138+
}
10831139
#endif /* CONFIG_CGROUPS */
10841140

10851141
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)

0 commit comments

Comments
 (0)