Skip to content

Commit ddceadc

Browse files
committed
sched_ext: Add support for cgroup bandwidth control interface
From 077814f57f8acce13f91dc34bbd2b7e4911fbf25 Mon Sep 17 00:00:00 2001
From: Tejun Heo <[email protected]>
Date: Fri, 13 Jun 2025 15:06:47 -1000

- Add CONFIG_GROUP_SCHED_BANDWIDTH which is selected by both CONFIG_CFS_BANDWIDTH and EXT_GROUP_SCHED.
- Put bandwidth control interface files for both cgroup v1 and v2 under CONFIG_GROUP_SCHED_BANDWIDTH.
- Update tg_bandwidth() to fetch configuration parameters from fair if CONFIG_CFS_BANDWIDTH, SCX otherwise.
- Update tg_set_bandwidth() to update the parameters for both fair and SCX.
- Add bandwidth control parameters to struct scx_cgroup_init_args.
- Add sched_ext_ops.cgroup_set_bandwidth() which is invoked on bandwidth control parameter updates.
- Update scx_qmap and maximal selftest to test the new feature.

Signed-off-by: Tejun Heo <[email protected]>
1 parent 6e6558a commit ddceadc

File tree

8 files changed

+127
-10
lines changed

8 files changed

+127
-10
lines changed

include/linux/sched/ext.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,9 @@ struct scx_task_group {
219219
#ifdef CONFIG_EXT_GROUP_SCHED
220220
u32 flags; /* SCX_TG_* */
221221
u32 weight;
222+
u64 bw_period_us;
223+
u64 bw_quota_us;
224+
u64 bw_burst_us;
222225
#endif
223226
};
224227

init/Kconfig

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,6 +1065,9 @@ if CGROUP_SCHED
10651065
config GROUP_SCHED_WEIGHT
10661066
def_bool n
10671067

1068+
config GROUP_SCHED_BANDWIDTH
1069+
def_bool n
1070+
10681071
config FAIR_GROUP_SCHED
10691072
bool "Group scheduling for SCHED_OTHER"
10701073
depends on CGROUP_SCHED
@@ -1074,6 +1077,7 @@ config FAIR_GROUP_SCHED
10741077
config CFS_BANDWIDTH
10751078
bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
10761079
depends on FAIR_GROUP_SCHED
1080+
select GROUP_SCHED_BANDWIDTH
10771081
default n
10781082
help
10791083
This option allows users to define CPU bandwidth rates (limits) for
@@ -1108,6 +1112,7 @@ config EXT_GROUP_SCHED
11081112
bool
11091113
depends on SCHED_CLASS_EXT && CGROUP_SCHED
11101114
select GROUP_SCHED_WEIGHT
1115+
select GROUP_SCHED_BANDWIDTH
11111116
default y
11121117

11131118
endif #CGROUP_SCHED

kernel/sched/core.c

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9545,7 +9545,9 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
95459545

95469546
return 0;
95479547
}
9548+
#endif /* CONFIG_CFS_BANDWIDTH */
95489549

9550+
#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
95499551
const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
95509552
static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
95519553
/* More than 203 days if BW_SHIFT equals 20. */
@@ -9554,12 +9556,21 @@ static const u64 max_bw_runtime_us = MAX_BW;
95549556
static void tg_bandwidth(struct task_group *tg,
95559557
u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
95569558
{
9559+
#ifdef CONFIG_CFS_BANDWIDTH
95579560
if (period_us_p)
95589561
*period_us_p = tg_get_cfs_period(tg);
95599562
if (quota_us_p)
95609563
*quota_us_p = tg_get_cfs_quota(tg);
95619564
if (burst_us_p)
95629565
*burst_us_p = tg_get_cfs_burst(tg);
9566+
#else /* !CONFIG_CFS_BANDWIDTH */
9567+
if (period_us_p)
9568+
*period_us_p = tg->scx.bw_period_us;
9569+
if (quota_us_p)
9570+
*quota_us_p = tg->scx.bw_quota_us;
9571+
if (burst_us_p)
9572+
*burst_us_p = tg->scx.bw_burst_us;
9573+
#endif /* CONFIG_CFS_BANDWIDTH */
95639574
}
95649575

95659576
static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
@@ -9575,6 +9586,7 @@ static int tg_set_bandwidth(struct task_group *tg,
95759586
u64 period_us, u64 quota_us, u64 burst_us)
95769587
{
95779588
const u64 max_usec = U64_MAX / NSEC_PER_USEC;
9589+
int ret = 0;
95789590

95799591
if (tg == &root_task_group)
95809592
return -EINVAL;
@@ -9612,7 +9624,12 @@ static int tg_set_bandwidth(struct task_group *tg,
96129624
burst_us + quota_us > max_bw_runtime_us))
96139625
return -EINVAL;
96149626

9615-
return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
9627+
#ifdef CONFIG_CFS_BANDWIDTH
9628+
ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
9629+
#endif /* CONFIG_CFS_BANDWIDTH */
9630+
if (!ret)
9631+
scx_group_set_bandwidth(tg, period_us, quota_us, burst_us);
9632+
return ret;
96169633
}
96179634

96189635
static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
@@ -9665,7 +9682,7 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
96659682
tg_bandwidth(tg, &period_us, &quota_us, NULL);
96669683
return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
96679684
}
9668-
#endif /* CONFIG_CFS_BANDWIDTH */
9685+
#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
96699686

96709687
#ifdef CONFIG_RT_GROUP_SCHED
96719688
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -9725,7 +9742,7 @@ static struct cftype cpu_legacy_files[] = {
97259742
.write_s64 = cpu_idle_write_s64,
97269743
},
97279744
#endif
9728-
#ifdef CONFIG_CFS_BANDWIDTH
9745+
#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
97299746
{
97309747
.name = "cfs_period_us",
97319748
.read_u64 = cpu_period_read_u64,
@@ -9741,6 +9758,8 @@ static struct cftype cpu_legacy_files[] = {
97419758
.read_u64 = cpu_burst_read_u64,
97429759
.write_u64 = cpu_burst_write_u64,
97439760
},
9761+
#endif
9762+
#ifdef CONFIG_CFS_BANDWIDTH
97449763
{
97459764
.name = "stat",
97469765
.seq_show = cpu_cfs_stat_show,
@@ -9954,7 +9973,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
99549973
return 0;
99559974
}
99569975

9957-
#ifdef CONFIG_CFS_BANDWIDTH
9976+
#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
99589977
static int cpu_max_show(struct seq_file *sf, void *v)
99599978
{
99609979
struct task_group *tg = css_tg(seq_css(sf));
@@ -10001,7 +10020,7 @@ static struct cftype cpu_files[] = {
1000110020
.write_s64 = cpu_idle_write_s64,
1000210021
},
1000310022
#endif
10004-
#ifdef CONFIG_CFS_BANDWIDTH
10023+
#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
1000510024
{
1000610025
.name = "max",
1000710026
.flags = CFTYPE_NOT_ON_ROOT,

kernel/sched/ext.c

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,11 @@ struct scx_exit_task_args {
203203
struct scx_cgroup_init_args {
204204
/* the weight of the cgroup [1..10000] */
205205
u32 weight;
206+
207+
/* bandwidth control parameters from cpu.max and cpu.max.burst */
208+
u64 bw_period_us;
209+
u64 bw_quota_us;
210+
u64 bw_burst_us;
206211
};
207212

208213
enum scx_cpu_preempt_reason {
@@ -664,9 +669,31 @@ struct sched_ext_ops {
664669
* @cgrp: cgroup whose weight is being updated
665670
* @weight: new weight [1..10000]
666671
*
667-
* Update @tg's weight to @weight.
672+
* Update @cgrp's weight to @weight.
668673
*/
669674
void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
675+
676+
/**
677+
* @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
678+
* @cgrp: cgroup whose bandwidth is being updated
679+
* @period_us: bandwidth control period
680+
* @quota_us: bandwidth control quota
681+
* @burst_us: bandwidth control burst
682+
*
683+
* Update @cgrp's bandwidth control parameters. This is from the cpu.max
684+
* cgroup interface.
685+
*
686+
* @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
687+
* to. For example, if @period_us is 1_000_000 and @quota_us is
688+
* 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
689+
* interpreted in the same fashion and specifies how much @cgrp can
690+
* burst temporarily. The specific control mechanism and thus the
691+
interpretation of @period_us and burstiness is up to the BPF
692+
* scheduler.
693+
*/
694+
void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
695+
u64 period_us, u64 quota_us, u64 burst_us);
696+
670697
#endif /* CONFIG_EXT_GROUP_SCHED */
671698

672699
/*
@@ -4059,6 +4086,8 @@ static bool scx_cgroup_enabled;
40594086
void scx_tg_init(struct task_group *tg)
40604087
{
40614088
tg->scx.weight = CGROUP_WEIGHT_DFL;
4089+
tg->scx.bw_period_us = default_bw_period_us();
4090+
tg->scx.bw_quota_us = RUNTIME_INF;
40624091
}
40634092

40644093
int scx_tg_online(struct task_group *tg)
@@ -4073,7 +4102,10 @@ int scx_tg_online(struct task_group *tg)
40734102
if (scx_cgroup_enabled) {
40744103
if (SCX_HAS_OP(sch, cgroup_init)) {
40754104
struct scx_cgroup_init_args args =
4076-
{ .weight = tg->scx.weight };
4105+
{ .weight = tg->scx.weight,
4106+
.bw_period_us = tg->scx.bw_period_us,
4107+
.bw_quota_us = tg->scx.bw_quota_us,
4108+
.bw_burst_us = tg->scx.bw_burst_us };
40774109

40784110
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
40794111
NULL, tg->css.cgroup, &args);
@@ -4225,6 +4257,27 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
42254257
/* TODO: Implement ops->cgroup_set_idle() */
42264258
}
42274259

4260+
void scx_group_set_bandwidth(struct task_group *tg,
4261+
u64 period_us, u64 quota_us, u64 burst_us)
4262+
{
4263+
struct scx_sched *sch = scx_root;
4264+
4265+
percpu_down_read(&scx_cgroup_rwsem);
4266+
4267+
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
4268+
(tg->scx.bw_period_us != period_us ||
4269+
tg->scx.bw_quota_us != quota_us ||
4270+
tg->scx.bw_burst_us != burst_us))
4271+
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
4272+
tg_cgrp(tg), period_us, quota_us, burst_us);
4273+
4274+
tg->scx.bw_period_us = period_us;
4275+
tg->scx.bw_quota_us = quota_us;
4276+
tg->scx.bw_burst_us = burst_us;
4277+
4278+
percpu_up_read(&scx_cgroup_rwsem);
4279+
}
4280+
42284281
static void scx_cgroup_lock(void)
42294282
{
42304283
percpu_down_write(&scx_cgroup_rwsem);
@@ -4400,7 +4453,12 @@ static int scx_cgroup_init(struct scx_sched *sch)
44004453
rcu_read_lock();
44014454
css_for_each_descendant_pre(css, &root_task_group.css) {
44024455
struct task_group *tg = css_tg(css);
4403-
struct scx_cgroup_init_args args = { .weight = tg->scx.weight };
4456+
struct scx_cgroup_init_args args = {
4457+
.weight = tg->scx.weight,
4458+
.bw_period_us = tg->scx.bw_period_us,
4459+
.bw_quota_us = tg->scx.bw_quota_us,
4460+
.bw_burst_us = tg->scx.bw_burst_us,
4461+
};
44044462

44054463
if ((tg->scx.flags &
44064464
(SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
@@ -5902,6 +5960,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup
59025960
static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
59035961
static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
59045962
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
5963+
static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
59055964
#endif
59065965
static void sched_ext_ops__cpu_online(s32 cpu) {}
59075966
static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -5939,6 +5998,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
59395998
.cgroup_move = sched_ext_ops__cgroup_move,
59405999
.cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
59416000
.cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
6001+
.cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
59426002
#endif
59436003
.cpu_online = sched_ext_ops__cpu_online,
59446004
.cpu_offline = sched_ext_ops__cpu_offline,

kernel/sched/ext.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ void scx_cgroup_finish_attach(void);
104104
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
105105
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
106106
void scx_group_set_idle(struct task_group *tg, bool idle);
107+
void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us);
107108
#else /* CONFIG_EXT_GROUP_SCHED */
108109
static inline void scx_tg_init(struct task_group *tg) {}
109110
static inline int scx_tg_online(struct task_group *tg) { return 0; }
@@ -114,5 +115,6 @@ static inline void scx_cgroup_finish_attach(void) {}
114115
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
115116
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
116117
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
118+
static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {}
117119
#endif /* CONFIG_EXT_GROUP_SCHED */
118120
#endif /* CONFIG_CGROUP_SCHED */

kernel/sched/sched.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se)
402402

403403
extern struct list_head task_groups;
404404

405-
#ifdef CONFIG_CFS_BANDWIDTH
405+
#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
406406
extern const u64 max_bw_quota_period_us;
407407

408408
/*
@@ -413,7 +413,7 @@ static inline u64 default_bw_period_us(void)
413413
{
414414
return 100000ULL;
415415
}
416-
#endif /* CONFIG_CFS_BANDWIDTH */
416+
#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
417417

418418
struct cfs_bandwidth {
419419
#ifdef CONFIG_CFS_BANDWIDTH

tools/sched_ext/scx_qmap.bpf.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -615,6 +615,26 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
615615
taskc->force_local, taskc->core_sched_seq);
616616
}
617617

618+
s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
619+
{
620+
bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
621+
cgrp->kn->id, args->weight, args->bw_period_us,
622+
args->bw_quota_us, args->bw_burst_us);
623+
return 0;
624+
}
625+
626+
void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
627+
{
628+
bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
629+
}
630+
631+
void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
632+
u64 period_us, u64 quota_us, u64 burst_us)
633+
{
634+
bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id,
635+
period_us, quota_us, burst_us);
636+
}
637+
618638
/*
619639
* Print out the online and possible CPU map using bpf_printk() as a
620640
* demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
@@ -840,6 +860,9 @@ SCX_OPS_DEFINE(qmap_ops,
840860
.dump = (void *)qmap_dump,
841861
.dump_cpu = (void *)qmap_dump_cpu,
842862
.dump_task = (void *)qmap_dump_task,
863+
.cgroup_init = (void *)qmap_cgroup_init,
864+
.cgroup_set_weight = (void *)qmap_cgroup_set_weight,
865+
.cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
843866
.cpu_online = (void *)qmap_cpu_online,
844867
.cpu_offline = (void *)qmap_cpu_offline,
845868
.init = (void *)qmap_init,

tools/testing/selftests/sched_ext/maximal.bpf.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,10 @@ void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p,
123123
void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
124124
{}
125125

126+
void BPF_STRUCT_OPS(maximal_cgroup_set_bandwidth, struct cgroup *cgrp,
127+
u64 period_us, u64 quota_us, u64 burst_us)
128+
{}
129+
126130
s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init)
127131
{
128132
return scx_bpf_create_dsq(DSQ_ID, -1);
@@ -160,6 +164,7 @@ struct sched_ext_ops maximal_ops = {
160164
.cgroup_move = (void *) maximal_cgroup_move,
161165
.cgroup_cancel_move = (void *) maximal_cgroup_cancel_move,
162166
.cgroup_set_weight = (void *) maximal_cgroup_set_weight,
167+
.cgroup_set_bandwidth = (void *) maximal_cgroup_set_bandwidth,
163168
.init = (void *) maximal_init,
164169
.exit = (void *) maximal_exit,
165170
.name = "maximal",

0 commit comments

Comments
 (0)