Skip to content

Commit 3958e2d

Browse files
surenbaghdasaryan authored and htejun committed
cgroup: make per-cgroup pressure stall tracking configurable
PSI accounts stalls for each cgroup separately and aggregates them at each level of the hierarchy. This causes additional overhead, with psi_avgs_work being called for each cgroup in the hierarchy. psi_avgs_work has been highly optimized; however, on systems with a large number of cgroups the overhead becomes noticeable. Systems which use PSI only at the system level could avoid this overhead if PSI can be configured to skip per-cgroup stall accounting. Add a "cgroup_disable=pressure" kernel command-line option to allow requesting system-wide-only pressure stall accounting. When set, it keeps system-wide accounting under /proc/pressure/ but skips accounting for individual cgroups and does not expose PSI nodes in the cgroup hierarchy. Signed-off-by: Suren Baghdasaryan <[email protected]> Acked-by: Peter Zijlstra (Intel) <[email protected]> Acked-by: Johannes Weiner <[email protected]> Signed-off-by: Tejun Heo <[email protected]>
1 parent 2ca11b0 commit 3958e2d

File tree

5 files changed

+80
-15
lines changed

5 files changed

+80
-15
lines changed

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -497,16 +497,21 @@
497497
ccw_timeout_log [S390]
498498
See Documentation/s390/common_io.rst for details.
499499

500-
cgroup_disable= [KNL] Disable a particular controller
501-
Format: {name of the controller(s) to disable}
500+
cgroup_disable= [KNL] Disable a particular controller or optional feature
501+
Format: {name of the controller(s) or feature(s) to disable}
502502
The effects of cgroup_disable=foo are:
503503
- foo isn't auto-mounted if you mount all cgroups in
504504
a single hierarchy
505505
- foo isn't visible as an individually mountable
506506
subsystem
507+
- if foo is an optional feature then the feature is
508+
disabled and corresponding cgroup files are not
509+
created
507510
{Currently only the "memory" controller deals with this and
508511
cuts the overhead; others just disable the usage. So
509512
only cgroup_disable=memory is actually worthwhile}
513+
Specifying "pressure" disables the per-cgroup pressure
514+
stall information accounting feature.
510515

511516
cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1
512517
Format: { { controller | "all" | "named" }

include/linux/cgroup-defs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ enum {
113113
CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
114114
CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */
115115
CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */
116+
CFTYPE_PRESSURE = (1 << 6), /* only if pressure feature is enabled */
116117

117118
/* internal flags, do not use outside cgroup core proper */
118119
__CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */

include/linux/cgroup.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,8 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
676676
return &cgrp->psi;
677677
}
678678

679+
bool cgroup_psi_enabled(void);
680+
679681
static inline void cgroup_init_kthreadd(void)
680682
{
681683
/*
@@ -735,6 +737,11 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
735737
return NULL;
736738
}
737739

740+
static inline bool cgroup_psi_enabled(void)
741+
{
742+
return false;
743+
}
744+
738745
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
739746
struct cgroup *ancestor)
740747
{

kernel/cgroup/cgroup.c

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,22 @@ struct cgroup_namespace init_cgroup_ns = {
209209
static struct file_system_type cgroup2_fs_type;
210210
static struct cftype cgroup_base_files[];
211211

212+
/* cgroup optional features */
213+
enum cgroup_opt_features {
214+
#ifdef CONFIG_PSI
215+
OPT_FEATURE_PRESSURE,
216+
#endif
217+
OPT_FEATURE_COUNT
218+
};
219+
220+
static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
221+
#ifdef CONFIG_PSI
222+
"pressure",
223+
#endif
224+
};
225+
226+
static u16 cgroup_feature_disable_mask __read_mostly;
227+
212228
static int cgroup_apply_control(struct cgroup *cgrp);
213229
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
214230
static void css_task_iter_skip(struct css_task_iter *it,
@@ -3631,6 +3647,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
36313647
{
36323648
psi_trigger_replace(&of->priv, NULL);
36333649
}
3650+
3651+
bool cgroup_psi_enabled(void)
3652+
{
3653+
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
3654+
}
3655+
3656+
#else /* CONFIG_PSI */
3657+
bool cgroup_psi_enabled(void)
3658+
{
3659+
return false;
3660+
}
3661+
36343662
#endif /* CONFIG_PSI */
36353663

36363664
static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3955,6 +3983,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
39553983
restart:
39563984
for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
39573985
/* does cft->flags tell us to skip this file on @cgrp? */
3986+
if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
3987+
continue;
39583988
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
39593989
continue;
39603990
if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -4032,6 +4062,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
40324062

40334063
WARN_ON(cft->ss || cft->kf_ops);
40344064

4065+
if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
4066+
continue;
4067+
40354068
if (cft->seq_start)
40364069
kf_ops = &cgroup_kf_ops;
40374070
else
@@ -4945,20 +4978,23 @@ static struct cftype cgroup_base_files[] = {
49454978
#ifdef CONFIG_PSI
49464979
{
49474980
.name = "io.pressure",
4981+
.flags = CFTYPE_PRESSURE,
49484982
.seq_show = cgroup_io_pressure_show,
49494983
.write = cgroup_io_pressure_write,
49504984
.poll = cgroup_pressure_poll,
49514985
.release = cgroup_pressure_release,
49524986
},
49534987
{
49544988
.name = "memory.pressure",
4989+
.flags = CFTYPE_PRESSURE,
49554990
.seq_show = cgroup_memory_pressure_show,
49564991
.write = cgroup_memory_pressure_write,
49574992
.poll = cgroup_pressure_poll,
49584993
.release = cgroup_pressure_release,
49594994
},
49604995
{
49614996
.name = "cpu.pressure",
4997+
.flags = CFTYPE_PRESSURE,
49624998
.seq_show = cgroup_cpu_pressure_show,
49634999
.write = cgroup_cpu_pressure_write,
49645000
.poll = cgroup_pressure_poll,
@@ -6313,6 +6349,15 @@ static int __init cgroup_disable(char *str)
63136349
pr_info("Disabling %s control group subsystem\n",
63146350
ss->name);
63156351
}
6352+
6353+
for (i = 0; i < OPT_FEATURE_COUNT; i++) {
6354+
if (strcmp(token, cgroup_opt_feature_names[i]))
6355+
continue;
6356+
cgroup_feature_disable_mask |= 1 << i;
6357+
pr_info("Disabling %s control group feature\n",
6358+
cgroup_opt_feature_names[i]);
6359+
break;
6360+
}
63166361
}
63176362
return 1;
63186363
}
@@ -6611,6 +6656,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
66116656
if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
66126657
continue;
66136658

6659+
if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
6660+
continue;
6661+
66146662
if (prefix)
66156663
ret += snprintf(buf + ret, size - ret, "%s.", prefix);
66166664

kernel/sched/psi.c

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@
148148
static int psi_bug __read_mostly;
149149

150150
DEFINE_STATIC_KEY_FALSE(psi_disabled);
151+
DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
151152

152153
#ifdef CONFIG_PSI_DEFAULT_DISABLED
153154
static bool psi_enable;
@@ -211,6 +212,9 @@ void __init psi_init(void)
211212
return;
212213
}
213214

215+
if (!cgroup_psi_enabled())
216+
static_branch_disable(&psi_cgroups_enabled);
217+
214218
psi_period = jiffies_to_nsecs(PSI_FREQ);
215219
group_init(&psi_system);
216220
}
@@ -744,23 +748,23 @@ static void psi_group_change(struct psi_group *group, int cpu,
744748

745749
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
746750
{
751+
if (*iter == &psi_system)
752+
return NULL;
753+
747754
#ifdef CONFIG_CGROUPS
748-
struct cgroup *cgroup = NULL;
755+
if (static_branch_likely(&psi_cgroups_enabled)) {
756+
struct cgroup *cgroup = NULL;
749757

750-
if (!*iter)
751-
cgroup = task->cgroups->dfl_cgrp;
752-
else if (*iter == &psi_system)
753-
return NULL;
754-
else
755-
cgroup = cgroup_parent(*iter);
758+
if (!*iter)
759+
cgroup = task->cgroups->dfl_cgrp;
760+
else
761+
cgroup = cgroup_parent(*iter);
756762

757-
if (cgroup && cgroup_parent(cgroup)) {
758-
*iter = cgroup;
759-
return cgroup_psi(cgroup);
763+
if (cgroup && cgroup_parent(cgroup)) {
764+
*iter = cgroup;
765+
return cgroup_psi(cgroup);
766+
}
760767
}
761-
#else
762-
if (*iter)
763-
return NULL;
764768
#endif
765769
*iter = &psi_system;
766770
return &psi_system;

0 commit comments

Comments
 (0)