Skip to content

Commit 2d285d5

Browse files
committed
scx_qmap: Implement highpri boosting
Implement a silly boosting mechanism for nice -20 tasks. The only purpose is
demonstrating and testing scx_bpf_dispatch_from_dsq(). The boosting only works
within SHARED_DSQ and makes only minor differences with increased dispatch
batch (-b).

This exercises moving tasks to a user DSQ and all local DSQs from
ops.dispatch() and BPF timerfn.

v2:
- Updated to use scx_bpf_dispatch_from_dsq_set_{slice|vtime}().
- Drop the workaround for the iterated tasks not being trusted by the
  verifier. The issue is fixed from BPF side.

Signed-off-by: Tejun Heo <[email protected]>
Cc: Daniel Hodges <[email protected]>
Cc: David Vernet <[email protected]>
Cc: Changwoo Min <[email protected]>
Cc: Andrea Righi <[email protected]>
Cc: Dan Schatzberg <[email protected]>
1 parent 4c30f5c commit 2d285d5

File tree

2 files changed

+130
-14
lines changed

2 files changed

+130
-14
lines changed

tools/sched_ext/scx_qmap.bpf.c

Lines changed: 120 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
enum consts {
2828
ONE_SEC_IN_NS = 1000000000,
2929
SHARED_DSQ = 0,
30+
HIGHPRI_DSQ = 1,
31+
HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */
3032
};
3133

3234
char _license[] SEC("license") = "GPL";
@@ -36,10 +38,12 @@ const volatile u32 stall_user_nth;
3638
const volatile u32 stall_kernel_nth;
3739
const volatile u32 dsp_inf_loop_after;
3840
const volatile u32 dsp_batch;
41+
const volatile bool highpri_boosting;
3942
const volatile bool print_shared_dsq;
4043
const volatile s32 disallow_tgid;
4144
const volatile bool suppress_dump;
4245

46+
u64 nr_highpri_queued;
4347
u32 test_error_cnt;
4448

4549
UEI_DEFINE(uei);
@@ -95,6 +99,7 @@ static u64 core_sched_tail_seqs[5];
9599
/* Per-task scheduling context */
96100
struct task_ctx {
97101
bool force_local; /* Dispatch directly to local_dsq */
102+
bool highpri;
98103
u64 core_sched_seq;
99104
};
100105

@@ -122,6 +127,7 @@ struct {
122127
/* Statistics */
123128
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
124129
u64 nr_core_sched_execed;
130+
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
125131
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
126132
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
127133

@@ -140,17 +146,25 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
140146
return -1;
141147
}
142148

149+
static struct task_ctx *lookup_task_ctx(struct task_struct *p)
150+
{
151+
struct task_ctx *tctx;
152+
153+
if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
154+
scx_bpf_error("task_ctx lookup failed");
155+
return NULL;
156+
}
157+
return tctx;
158+
}
159+
143160
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
144161
s32 prev_cpu, u64 wake_flags)
145162
{
146163
struct task_ctx *tctx;
147164
s32 cpu;
148165

149-
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
150-
if (!tctx) {
151-
scx_bpf_error("task_ctx lookup failed");
166+
if (!(tctx = lookup_task_ctx(p)))
152167
return -ESRCH;
153-
}
154168

155169
cpu = pick_direct_dispatch_cpu(p, prev_cpu);
156170

@@ -197,11 +211,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
197211
if (test_error_cnt && !--test_error_cnt)
198212
scx_bpf_error("test triggering error");
199213

200-
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
201-
if (!tctx) {
202-
scx_bpf_error("task_ctx lookup failed");
214+
if (!(tctx = lookup_task_ctx(p)))
203215
return;
204-
}
205216

206217
/*
207218
* All enqueued tasks must have their core_sched_seq updated for correct
@@ -255,6 +266,10 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
255266
return;
256267
}
257268

269+
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
270+
tctx->highpri = true;
271+
__sync_fetch_and_add(&nr_highpri_queued, 1);
272+
}
258273
__sync_fetch_and_add(&nr_enqueued, 1);
259274
}
260275

@@ -271,13 +286,80 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
271286

272287
static void update_core_sched_head_seq(struct task_struct *p)
273288
{
274-
struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
275289
int idx = weight_to_idx(p->scx.weight);
290+
struct task_ctx *tctx;
276291

277-
if (tctx)
292+
if ((tctx = lookup_task_ctx(p)))
278293
core_sched_head_seqs[idx] = tctx->core_sched_seq;
279-
else
280-
scx_bpf_error("task_ctx lookup failed");
294+
}
295+
296+
/*
297+
* To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly
298+
* selective priority boosting mechanism by scanning SHARED_DSQ looking for
299+
* highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This
300+
* makes minor difference only when dsp_batch is larger than 1.
301+
*
302+
* scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
303+
* non-rq-lock holding BPF programs. As demonstration, this function is called
304+
* from qmap_dispatch() and monitor_timerfn().
305+
*/
306+
static bool dispatch_highpri(bool from_timer)
307+
{
308+
struct task_struct *p;
309+
s32 this_cpu = bpf_get_smp_processor_id();
310+
311+
/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
312+
bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
313+
static u64 highpri_seq;
314+
struct task_ctx *tctx;
315+
316+
if (!(tctx = lookup_task_ctx(p)))
317+
return false;
318+
319+
if (tctx->highpri) {
320+
/* exercise the set_*() and vtime interface too */
321+
scx_bpf_dispatch_from_dsq_set_slice(
322+
BPF_FOR_EACH_ITER, slice_ns * 2);
323+
scx_bpf_dispatch_from_dsq_set_vtime(
324+
BPF_FOR_EACH_ITER, highpri_seq++);
325+
scx_bpf_dispatch_vtime_from_dsq(
326+
BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
327+
}
328+
}
329+
330+
/*
331+
* Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
332+
* is found.
333+
*/
334+
bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
335+
bool dispatched = false;
336+
s32 cpu;
337+
338+
if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
339+
cpu = this_cpu;
340+
else
341+
cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
342+
343+
if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
344+
SCX_DSQ_LOCAL_ON | cpu,
345+
SCX_ENQ_PREEMPT)) {
346+
if (cpu == this_cpu) {
347+
dispatched = true;
348+
__sync_fetch_and_add(&nr_expedited_local, 1);
349+
} else {
350+
__sync_fetch_and_add(&nr_expedited_remote, 1);
351+
}
352+
if (from_timer)
353+
__sync_fetch_and_add(&nr_expedited_from_timer, 1);
354+
} else {
355+
__sync_fetch_and_add(&nr_expedited_lost, 1);
356+
}
357+
358+
if (dispatched)
359+
return true;
360+
}
361+
362+
return false;
281363
}
282364

283365
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
@@ -289,7 +371,10 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
289371
void *fifo;
290372
s32 i, pid;
291373

292-
if (scx_bpf_consume(SHARED_DSQ))
374+
if (dispatch_highpri(false))
375+
return;
376+
377+
if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ))
293378
return;
294379

295380
if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
@@ -326,20 +411,34 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
326411

327412
/* Dispatch or advance. */
328413
bpf_repeat(BPF_MAX_LOOPS) {
414+
struct task_ctx *tctx;
415+
329416
if (bpf_map_pop_elem(fifo, &pid))
330417
break;
331418

332419
p = bpf_task_from_pid(pid);
333420
if (!p)
334421
continue;
335422

423+
if (!(tctx = lookup_task_ctx(p))) {
424+
bpf_task_release(p);
425+
return;
426+
}
427+
428+
if (tctx->highpri)
429+
__sync_fetch_and_sub(&nr_highpri_queued, 1);
430+
336431
update_core_sched_head_seq(p);
337432
__sync_fetch_and_add(&nr_dispatched, 1);
433+
338434
scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
339435
bpf_task_release(p);
436+
340437
batch--;
341438
cpuc->dsp_cnt--;
342439
if (!batch || !scx_bpf_dispatch_nr_slots()) {
440+
if (dispatch_highpri(false))
441+
return;
343442
scx_bpf_consume(SHARED_DSQ);
344443
return;
345444
}
@@ -664,6 +763,10 @@ static void dump_shared_dsq(void)
664763

665764
static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
666765
{
766+
bpf_rcu_read_lock();
767+
dispatch_highpri(true);
768+
bpf_rcu_read_unlock();
769+
667770
monitor_cpuperf();
668771

669772
if (print_shared_dsq)
@@ -685,6 +788,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
685788
if (ret)
686789
return ret;
687790

791+
ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
792+
if (ret)
793+
return ret;
794+
688795
timer = bpf_map_lookup_elem(&monitor_timer, &key);
689796
if (!timer)
690797
return -ESRCH;

tools/sched_ext/scx_qmap.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ const char help_fmt[] =
2929
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
3030
" -b COUNT Dispatch upto COUNT tasks together\n"
3131
" -P Print out DSQ content to trace_pipe every second, use with -b\n"
32+
" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n"
3233
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
3334
" -D LEN Set scx_exit_info.dump buffer length\n"
3435
" -S Suppress qmap-specific debug dump\n"
@@ -63,7 +64,7 @@ int main(int argc, char **argv)
6364

6465
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
6566

66-
while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) {
67+
while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) {
6768
switch (opt) {
6869
case 's':
6970
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -86,6 +87,9 @@ int main(int argc, char **argv)
8687
case 'P':
8788
skel->rodata->print_shared_dsq = true;
8889
break;
90+
case 'H':
91+
skel->rodata->highpri_boosting = true;
92+
break;
8993
case 'd':
9094
skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
9195
if (skel->rodata->disallow_tgid < 0)
@@ -121,6 +125,11 @@ int main(int argc, char **argv)
121125
skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
122126
skel->bss->nr_core_sched_execed,
123127
skel->bss->nr_ddsp_from_enq);
128+
printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
129+
skel->bss->nr_expedited_local,
130+
skel->bss->nr_expedited_remote,
131+
skel->bss->nr_expedited_from_timer,
132+
skel->bss->nr_expedited_lost);
124133
if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
125134
printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
126135
skel->bss->cpuperf_min,

0 commit comments

Comments (0)