Skip to content
This repository was archived by the owner on Jun 18, 2024. It is now read-only.

Commit 3b7b9da

Browse files
Byte-Lab authored and htejun committed
sched_ext: Add a basic, userland vruntime scheduler
This patch adds a new scx_userland BPF scheduler that implements a fairly unsophisticated sorted-list vruntime scheduler in userland to demonstrate how most scheduling decisions can be delegated to userland. The scheduler doesn't implement load balancing, and treats all tasks as part of a single domain. v2: * Converted to BPF inline iterators. Signed-off-by: David Vernet <dvernet@meta.com> Reviewed-by: Tejun Heo <tj@kernel.org> Signed-off-by: Tejun Heo <tj@kernel.org>
1 parent 4573227 commit 3b7b9da

File tree

5 files changed

+690
-2
lines changed

5 files changed

+690
-2
lines changed

tools/sched_ext/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ scx_qmap
33
scx_central
44
scx_pair
55
scx_flatcg
6+
scx_userland
67
*.skel.h
78
*.subskel.h
89
/tools/

tools/sched_ext/Makefile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \
115115
-Wall -Wno-compare-distinct-pointer-types \
116116
-O2 -mcpu=v3
117117

118-
all: scx_simple scx_qmap scx_central scx_pair scx_flatcg
118+
all: scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland
119119

120120
# sort removes libbpf duplicates when not cross-building
121121
MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \
@@ -186,10 +186,14 @@ scx_flatcg: scx_flatcg.c scx_flatcg.skel.h user_exit_info.h
186186
$(CC) $(CFLAGS) -c $< -o $@.o
187187
$(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS)
188188

189+
scx_userland: scx_userland.c scx_userland.skel.h scx_userland.h user_exit_info.h
190+
$(CC) $(CFLAGS) -c $< -o $@.o
191+
$(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS)
192+
189193
clean:
190194
rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)
191195
rm -f *.o *.bpf.o *.skel.h *.subskel.h
192-
rm -f scx_simple scx_qmap scx_central scx_pair scx_flatcg
196+
rm -f scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland
193197

194198
.PHONY: all clean
195199

tools/sched_ext/scx_userland.bpf.c

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
/*
3+
* A minimal userland scheduler.
4+
*
5+
* In terms of scheduling, this provides two different types of behaviors:
6+
* 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity.
7+
* All such tasks are direct-dispatched from the kernel, and are never
8+
* enqueued in user space.
9+
* 2. A primitive vruntime scheduler that is implemented in user space, for all
10+
* other tasks.
11+
*
12+
* Some parts of this example user space scheduler could be implemented more
13+
* efficiently using more complex and sophisticated data structures. For
14+
* example, rather than using BPF_MAP_TYPE_QUEUE's,
15+
* BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between
16+
* user space and kernel space. Similarly, we use a simple vruntime-sorted list
17+
* in user space, but an rbtree could be used instead.
18+
*
19+
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
20+
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
21+
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
22+
*/
23+
#include <string.h>
24+
#include "scx_common.bpf.h"
25+
#include "scx_userland.h"
26+
27+
char _license[] SEC("license") = "GPL";

/*
 * When false, userland_init() calls scx_bpf_switch_all() so every task is
 * switched to this scheduler; set from user space before load.
 */
const volatile bool switch_partial;

/* PID of the user space scheduler task; validated (> 0) in userland_init(). */
const volatile s32 usersched_pid;

/* !0 for veristat, set during init */
const volatile u32 num_possible_cpus = 64;

/* Stats that are printed by user space. */
u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues;

/* Filled in by userland_exit() so user space can report why we unloaded. */
struct user_exit_info uei;

/*
 * Whether the user space scheduler needs to be scheduled due to a task being
 * enqueued in user space.
 */
static bool usersched_needed;

/*
 * The map containing tasks that are enqueued in user space from the kernel.
 *
 * This map is drained by the user space scheduler.
 */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, USERLAND_MAX_TASKS);
	__type(value, struct scx_userland_enqueued_task);
} enqueued SEC(".maps");

/*
 * The map containing tasks that are dispatched to the kernel from user space.
 *
 * Drained by the kernel in userland_dispatch().
 */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, USERLAND_MAX_TASKS);
	__type(value, s32);
} dispatched SEC(".maps");

/* Per-task scheduling context */
struct task_ctx {
	bool force_local;	/* Dispatch directly to local DSQ */
};

/* Map that contains task-local storage. */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
80+
81+
static bool is_usersched_task(const struct task_struct *p)
82+
{
83+
return p->pid == usersched_pid;
84+
}
85+
86+
static bool keep_in_kernel(const struct task_struct *p)
87+
{
88+
return p->nr_cpus_allowed < num_possible_cpus;
89+
}
90+
91+
static struct task_struct *usersched_task(void)
92+
{
93+
struct task_struct *p;
94+
95+
p = bpf_task_from_pid(usersched_pid);
96+
/*
97+
* Should never happen -- the usersched task should always be managed
98+
* by sched_ext.
99+
*/
100+
if (!p)
101+
scx_bpf_error("Failed to find usersched task %d", usersched_pid);
102+
103+
return p;
104+
}
105+
106+
/*
 * Pick a CPU for a waking task. Only tasks that must stay in the kernel
 * (restricted affinity, see keep_in_kernel()) are steered to an idle CPU
 * here; all other tasks keep @prev_cpu and are routed to user space by
 * userland_enqueue().
 */
s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	if (keep_in_kernel(p)) {
		s32 cpu;
		struct task_ctx *tctx;

		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
		if (!tctx) {
			/* Storage is created in .prep_enable; absence is a bug. */
			scx_bpf_error("Failed to look up task-local storage for %s", p->comm);
			return -ESRCH;
		}

		/*
		 * Prefer @prev_cpu when it's the only allowed CPU or when we
		 * successfully claim it idle. force_local makes
		 * userland_enqueue() dispatch to the local DSQ so the CPU
		 * chosen here is actually used.
		 */
		if (p->nr_cpus_allowed == 1 ||
		    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
			tctx->force_local = true;
			return prev_cpu;
		}

		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0) {
			tctx->force_local = true;
			return cpu;
		}
	}

	return prev_cpu;
}
134+
135+
static void dispatch_user_scheduler(void)
136+
{
137+
struct task_struct *p;
138+
139+
usersched_needed = false;
140+
p = usersched_task();
141+
if (p) {
142+
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
143+
bpf_task_release(p);
144+
}
145+
}
146+
147+
/*
 * Hand @p off to the user space scheduler by pushing a message onto the
 * 'enqueued' queue. Falls back to the global DSQ if the queue is full, so
 * the task is never lost.
 */
static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags)
{
	struct scx_userland_enqueued_task msg;

	/* memset (rather than an initializer) also zeroes struct padding. */
	memset(&msg, 0, sizeof(msg));
	msg.pid = p->pid;
	msg.sum_exec_runtime = p->se.sum_exec_runtime;
	msg.weight = p->scx.weight;

	if (bpf_map_push_elem(&enqueued, &msg, 0)) {
		/*
		 * If we fail to enqueue the task in user space, put it
		 * directly on the global DSQ.
		 */
		__sync_fetch_and_add(&nr_failed_enqueues, 1);
		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
		return;
	}

	__sync_fetch_and_add(&nr_user_enqueues, 1);
	usersched_needed = true;
}
168+
169+
/*
 * Enqueue path. Affinitized tasks are dispatched directly from the kernel
 * (local DSQ if select_cpu marked force_local, otherwise global DSQ); all
 * other tasks are forwarded to the user space scheduler. The user space
 * scheduler task itself is never enqueued in user space -- it is dispatched
 * from dispatch_user_scheduler() instead.
 */
void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags)
{
	if (keep_in_kernel(p)) {
		u64 dsq_id = SCX_DSQ_GLOBAL;
		struct task_ctx *tctx;

		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
		if (!tctx) {
			scx_bpf_error("Failed to lookup task ctx for %s", p->comm);
			return;
		}

		/* force_local is single-use: consume it and clear it. */
		if (tctx->force_local)
			dsq_id = SCX_DSQ_LOCAL;
		tctx->force_local = false;
		scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags);
		__sync_fetch_and_add(&nr_kernel_enqueues, 1);
		return;
	} else if (!is_usersched_task(p)) {
		enqueue_task_in_user_space(p, enq_flags);
	}
}
191+
192+
/*
 * Dispatch path: first run the user space scheduler if work was queued for
 * it, then drain the PIDs that user space decided to run from 'dispatched'
 * onto the global DSQ. bpf_repeat(4096) bounds the loop for the BPF
 * verifier.
 */
void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev)
{
	if (usersched_needed)
		dispatch_user_scheduler();

	bpf_repeat(4096) {
		s32 pid;
		struct task_struct *p;

		/* Queue empty -- nothing more to dispatch. */
		if (bpf_map_pop_elem(&dispatched, &pid))
			break;

		/*
		 * The task could have exited by the time we get around to
		 * dispatching it. Treat this as a normal occurrence, and simply
		 * move onto the next iteration.
		 */
		p = bpf_task_from_pid(pid);
		if (!p)
			continue;

		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
		bpf_task_release(p);
	}
}
217+
218+
/*
 * Create the task-local storage (struct task_ctx) for a task joining the
 * scheduler. Returns 0 on success, -ENOMEM if the storage can't be created.
 */
s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p,
		   struct scx_enable_args *args)
{
	struct task_ctx *tctx;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
				    BPF_LOCAL_STORAGE_GET_F_CREATE);
	return tctx ? 0 : -ENOMEM;
}
227+
228+
s32 BPF_STRUCT_OPS(userland_init)
229+
{
230+
if (num_possible_cpus == 0) {
231+
scx_bpf_error("User scheduler # CPUs uninitialized (%d)",
232+
num_possible_cpus);
233+
return -EINVAL;
234+
}
235+
236+
if (usersched_pid <= 0) {
237+
scx_bpf_error("User scheduler pid uninitialized (%d)",
238+
usersched_pid);
239+
return -EINVAL;
240+
}
241+
242+
if (!switch_partial)
243+
scx_bpf_switch_all();
244+
return 0;
245+
}
246+
247+
/* Record exit reason/info into 'uei' so user space can report why we exited. */
void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei)
{
	uei_record(&uei, ei);
}
251+
252+
/*
 * struct_ops registration for the userland scheduler. timeout_ms bounds how
 * long runnable tasks may go unscheduled before the watchdog aborts the
 * scheduler -- kept short since scheduling depends on a user space process.
 */
SEC(".struct_ops.link")
struct sched_ext_ops userland_ops = {
	.select_cpu = (void *)userland_select_cpu,
	.enqueue = (void *)userland_enqueue,
	.dispatch = (void *)userland_dispatch,
	.prep_enable = (void *)userland_prep_enable,
	.init = (void *)userland_init,
	.exit = (void *)userland_exit,
	.timeout_ms = 3000,
	.name = "userland",
};

0 commit comments

Comments
 (0)