Skip to content

Commit 5748e69

Browse files
rgushchin authored and
Kernel Patches Daemon committed
sched: psi: implement psi trigger handling using bpf
This patch implements a bpf struct ops-based mechanism to create psi triggers, attach them to cgroups or system-wide, and handle psi events in bpf.

The struct ops provides 3 callbacks:
- init(): called once at load, handy for creating psi triggers
- handle_psi_event(): called every time a psi trigger fires
- handle_cgroup_free(): called if a cgroup with an attached trigger is being freed

A single struct ops can create a number of psi triggers, both cgroup-scoped and system-wide.

All 3 struct ops callbacks can be sleepable. handle_psi_event() handlers are executed using a separate workqueue, so they won't affect the latency of other psi triggers.

Signed-off-by: Roman Gushchin <[email protected]>
1 parent 0f8d813 commit 5748e69

File tree

5 files changed

+408
-12
lines changed

5 files changed

+408
-12
lines changed

include/linux/bpf_psi.h

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/* SPDX-License-Identifier: GPL-2.0+ */
2+
3+
#ifndef __BPF_PSI_H
4+
#define __BPF_PSI_H
5+
6+
#include <linux/list.h>
7+
#include <linux/spinlock.h>
8+
#include <linux/srcu.h>
9+
#include <linux/psi_types.h>
10+
11+
struct cgroup;
12+
struct bpf_psi;
13+
struct psi_trigger;
14+
struct psi_trigger_params;
15+
16+
#define BPF_PSI_FULL 0x80000000
17+
18+
struct bpf_psi_ops {
19+
/**
20+
* @init: Initialization callback, suited for creating psi triggers.
21+
* @bpf_psi: bpf_psi pointer, can be passed to bpf_psi_create_trigger().
22+
*
23+
* A non-0 return value means the initialization has been failed.
24+
*/
25+
int (*init)(struct bpf_psi *bpf_psi);
26+
27+
/**
28+
* @handle_psi_event: PSI event callback
29+
* @t: psi_trigger pointer
30+
*/
31+
void (*handle_psi_event)(struct psi_trigger *t);
32+
33+
/**
34+
* @handle_cgroup_free: Cgroup free callback
35+
* @cgroup_id: Id of freed cgroup
36+
*
37+
* Called every time a cgroup with an attached bpf psi trigger is freed.
38+
* No psi events can be raised after handle_cgroup_free().
39+
*/
40+
void (*handle_cgroup_free)(u64 cgroup_id);
41+
42+
/* private */
43+
struct bpf_psi *bpf_psi;
44+
};
45+
46+
struct bpf_psi {
47+
spinlock_t lock;
48+
struct list_head triggers;
49+
struct bpf_psi_ops *ops;
50+
struct srcu_struct srcu;
51+
};
52+
53+
#ifdef CONFIG_BPF_SYSCALL
54+
void bpf_psi_add_trigger(struct psi_trigger *t,
55+
const struct psi_trigger_params *params);
56+
void bpf_psi_remove_trigger(struct psi_trigger *t);
57+
void bpf_psi_handle_event(struct psi_trigger *t);
58+
#ifdef CONFIG_CGROUPS
59+
void bpf_psi_cgroup_free(struct cgroup *cgroup);
60+
#endif
61+
62+
#else /* CONFIG_BPF_SYSCALL */
63+
static inline void bpf_psi_add_trigger(struct psi_trigger *t,
64+
const struct psi_trigger_params *params) {}
65+
static inline void bpf_psi_remove_trigger(struct psi_trigger *t) {}
66+
static inline void bpf_psi_handle_event(struct psi_trigger *t) {}
67+
static inline void bpf_psi_cgroup_free(struct cgroup *cgroup) {}
68+
69+
#endif /* CONFIG_BPF_SYSCALL */
70+
71+
#endif /* __BPF_PSI_H */

include/linux/psi_types.h

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ struct psi_window {
122122
enum psi_trigger_type {
123123
PSI_SYSTEM,
124124
PSI_CGROUP,
125+
PSI_BPF,
125126
};
126127

127128
struct psi_trigger_params {
@@ -143,8 +144,15 @@ struct psi_trigger_params {
143144
/* Privileged triggers are treated differently */
144145
bool privileged;
145146

146-
/* Link to kernfs open file, only for PSI_CGROUP */
147-
struct kernfs_open_file *of;
147+
union {
148+
/* Link to kernfs open file, only for PSI_CGROUP */
149+
struct kernfs_open_file *of;
150+
151+
#ifdef CONFIG_BPF_SYSCALL
152+
/* Link to bpf_psi structure, only for PSI_BPF */
153+
struct bpf_psi *bpf_psi;
154+
#endif
155+
};
148156
};
149157

150158
struct psi_trigger {
@@ -186,6 +194,31 @@ struct psi_trigger {
186194

187195
/* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */
188196
enum psi_aggregators aggregator;
197+
198+
#ifdef CONFIG_BPF_SYSCALL
199+
/* Fields specific to PSI_BPF triggers */
200+
201+
/* Bpf psi structure for events handling */
202+
struct bpf_psi *bpf_psi;
203+
204+
/* List node inside bpf_psi->triggers list */
205+
struct list_head bpf_psi_node;
206+
207+
/* List node inside group->bpf_triggers list */
208+
struct list_head bpf_group_node;
209+
210+
/* Work structure, used to execute event handlers */
211+
struct work_struct bpf_work;
212+
213+
/*
214+
* Whether the trigger is being pinned in memory.
215+
* Protected by group->bpf_triggers_lock.
216+
*/
217+
bool pinned;
218+
219+
/* Cgroup Id */
220+
u64 cgroup_id;
221+
#endif
189222
};
190223

191224
struct psi_group {
@@ -234,6 +267,12 @@ struct psi_group {
234267
u64 rtpoll_total[NR_PSI_STATES - 1];
235268
u64 rtpoll_next_update;
236269
u64 rtpoll_until;
270+
271+
#ifdef CONFIG_BPF_SYSCALL
272+
/* List of triggers owned by bpf and corresponding lock */
273+
spinlock_t bpf_triggers_lock;
274+
struct list_head bpf_triggers;
275+
#endif
237276
};
238277

239278
#else /* CONFIG_PSI */

kernel/sched/bpf_psi.c

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
// SPDX-License-Identifier: GPL-2.0-or-later
2+
/*
3+
* BPF PSI event handlers
4+
*
5+
* Author: Roman Gushchin <[email protected]>
6+
*/
7+
8+
#include <linux/bpf_psi.h>
9+
#include <linux/cgroup-defs.h>
10+
11+
static struct workqueue_struct *bpf_psi_wq;
12+
13+
static struct bpf_psi *bpf_psi_create(struct bpf_psi_ops *ops)
14+
{
15+
struct bpf_psi *bpf_psi;
16+
17+
bpf_psi = kzalloc(sizeof(*bpf_psi), GFP_KERNEL);
18+
if (!bpf_psi)
19+
return NULL;
20+
21+
if (init_srcu_struct(&bpf_psi->srcu)) {
22+
kfree(bpf_psi);
23+
return NULL;
24+
}
25+
26+
spin_lock_init(&bpf_psi->lock);
27+
bpf_psi->ops = ops;
28+
INIT_LIST_HEAD(&bpf_psi->triggers);
29+
ops->bpf_psi = bpf_psi;
30+
31+
return bpf_psi;
32+
}
33+
34+
static void bpf_psi_free(struct bpf_psi *bpf_psi)
35+
{
36+
cleanup_srcu_struct(&bpf_psi->srcu);
37+
kfree(bpf_psi);
38+
}
39+
40+
static void bpf_psi_handle_event_fn(struct work_struct *work)
41+
{
42+
struct psi_trigger *t;
43+
struct bpf_psi *bpf_psi;
44+
int idx;
45+
46+
t = container_of(work, struct psi_trigger, bpf_work);
47+
bpf_psi = READ_ONCE(t->bpf_psi);
48+
49+
if (likely(bpf_psi)) {
50+
idx = srcu_read_lock(&bpf_psi->srcu);
51+
if (bpf_psi->ops->handle_psi_event)
52+
bpf_psi->ops->handle_psi_event(t);
53+
srcu_read_unlock(&bpf_psi->srcu, idx);
54+
}
55+
}
56+
57+
void bpf_psi_add_trigger(struct psi_trigger *t,
58+
const struct psi_trigger_params *params)
59+
{
60+
t->bpf_psi = params->bpf_psi;
61+
t->pinned = false;
62+
INIT_WORK(&t->bpf_work, bpf_psi_handle_event_fn);
63+
64+
spin_lock(&t->bpf_psi->lock);
65+
list_add(&t->bpf_psi_node, &t->bpf_psi->triggers);
66+
spin_unlock(&t->bpf_psi->lock);
67+
68+
spin_lock(&t->group->bpf_triggers_lock);
69+
list_add(&t->bpf_group_node, &t->group->bpf_triggers);
70+
spin_unlock(&t->group->bpf_triggers_lock);
71+
}
72+
73+
void bpf_psi_remove_trigger(struct psi_trigger *t)
74+
{
75+
spin_lock(&t->group->bpf_triggers_lock);
76+
list_del(&t->bpf_group_node);
77+
spin_unlock(&t->group->bpf_triggers_lock);
78+
79+
spin_lock(&t->bpf_psi->lock);
80+
list_del(&t->bpf_psi_node);
81+
spin_unlock(&t->bpf_psi->lock);
82+
}
83+
84+
#ifdef CONFIG_CGROUPS
85+
void bpf_psi_cgroup_free(struct cgroup *cgroup)
86+
{
87+
struct psi_group *group = cgroup->psi;
88+
u64 cgrp_id = cgroup_id(cgroup);
89+
struct psi_trigger *t, *p;
90+
struct bpf_psi *bpf_psi;
91+
LIST_HEAD(to_destroy);
92+
int idx;
93+
94+
spin_lock(&group->bpf_triggers_lock);
95+
list_for_each_entry_safe(t, p, &group->bpf_triggers, bpf_group_node) {
96+
if (!t->pinned) {
97+
t->pinned = true;
98+
list_move(&t->bpf_group_node, &to_destroy);
99+
}
100+
}
101+
spin_unlock(&group->bpf_triggers_lock);
102+
103+
list_for_each_entry_safe(t, p, &to_destroy, bpf_group_node) {
104+
bpf_psi = READ_ONCE(t->bpf_psi);
105+
106+
idx = srcu_read_lock(&bpf_psi->srcu);
107+
if (bpf_psi->ops->handle_cgroup_free)
108+
bpf_psi->ops->handle_cgroup_free(cgrp_id);
109+
srcu_read_unlock(&bpf_psi->srcu, idx);
110+
111+
spin_lock(&bpf_psi->lock);
112+
list_del(&t->bpf_psi_node);
113+
spin_unlock(&bpf_psi->lock);
114+
115+
WRITE_ONCE(t->bpf_psi, NULL);
116+
flush_workqueue(bpf_psi_wq);
117+
synchronize_srcu(&bpf_psi->srcu);
118+
psi_trigger_destroy(t);
119+
}
120+
}
121+
#endif
122+
123+
void bpf_psi_handle_event(struct psi_trigger *t)
124+
{
125+
queue_work(bpf_psi_wq, &t->bpf_work);
126+
}
127+
128+
// bpf struct ops
129+
130+
static int __bpf_psi_init(struct bpf_psi *bpf_psi) { return 0; }
131+
static void __bpf_psi_handle_psi_event(struct psi_trigger *t) {}
132+
static void __bpf_psi_handle_cgroup_free(u64 cgroup_id) {}
133+
134+
static struct bpf_psi_ops __bpf_psi_ops = {
135+
.init = __bpf_psi_init,
136+
.handle_psi_event = __bpf_psi_handle_psi_event,
137+
.handle_cgroup_free = __bpf_psi_handle_cgroup_free,
138+
};
139+
140+
static const struct bpf_func_proto *
141+
bpf_psi_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
142+
{
143+
return tracing_prog_func_proto(func_id, prog);
144+
}
145+
146+
static bool bpf_psi_ops_is_valid_access(int off, int size,
147+
enum bpf_access_type type,
148+
const struct bpf_prog *prog,
149+
struct bpf_insn_access_aux *info)
150+
{
151+
return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
152+
}
153+
154+
static const struct bpf_verifier_ops bpf_psi_verifier_ops = {
155+
.get_func_proto = bpf_psi_func_proto,
156+
.is_valid_access = bpf_psi_ops_is_valid_access,
157+
};
158+
159+
static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link)
160+
{
161+
struct bpf_psi_ops *ops = kdata;
162+
struct bpf_psi *bpf_psi;
163+
164+
bpf_psi = bpf_psi_create(ops);
165+
if (!bpf_psi)
166+
return -ENOMEM;
167+
168+
return ops->init(bpf_psi);
169+
}
170+
171+
static void bpf_psi_ops_unreg(void *kdata, struct bpf_link *link)
172+
{
173+
struct bpf_psi_ops *ops = kdata;
174+
struct bpf_psi *bpf_psi = ops->bpf_psi;
175+
struct psi_trigger *t, *p;
176+
LIST_HEAD(to_destroy);
177+
178+
spin_lock(&bpf_psi->lock);
179+
list_for_each_entry_safe(t, p, &bpf_psi->triggers, bpf_psi_node) {
180+
spin_lock(&t->group->bpf_triggers_lock);
181+
if (!t->pinned) {
182+
t->pinned = true;
183+
list_move(&t->bpf_group_node, &to_destroy);
184+
list_del(&t->bpf_psi_node);
185+
186+
WRITE_ONCE(t->bpf_psi, NULL);
187+
}
188+
spin_unlock(&t->group->bpf_triggers_lock);
189+
}
190+
spin_unlock(&bpf_psi->lock);
191+
192+
flush_workqueue(bpf_psi_wq);
193+
synchronize_srcu(&bpf_psi->srcu);
194+
195+
list_for_each_entry_safe(t, p, &to_destroy, bpf_group_node)
196+
psi_trigger_destroy(t);
197+
198+
bpf_psi_free(bpf_psi);
199+
}
200+
201+
/* struct ops member check: every member is accepted unconditionally. */
static int bpf_psi_ops_check_member(const struct btf_type *t,
				    const struct btf_member *member,
				    const struct bpf_prog *prog)
{
	return 0;
}
207+
208+
/* struct ops member initialization: performs no work and returns 0. */
static int bpf_psi_ops_init_member(const struct btf_type *t,
				   const struct btf_member *member,
				   void *kdata, const void *udata)
{
	return 0;
}
214+
215+
/* struct ops BTF initialization: nothing to set up, always succeeds. */
static int bpf_psi_ops_init(struct btf *btf)
{
	return 0;
}
219+
220+
static struct bpf_struct_ops bpf_psi_bpf_ops = {
221+
.verifier_ops = &bpf_psi_verifier_ops,
222+
.reg = bpf_psi_ops_reg,
223+
.unreg = bpf_psi_ops_unreg,
224+
.check_member = bpf_psi_ops_check_member,
225+
.init_member = bpf_psi_ops_init_member,
226+
.init = bpf_psi_ops_init,
227+
.name = "bpf_psi_ops",
228+
.owner = THIS_MODULE,
229+
.cfi_stubs = &__bpf_psi_ops
230+
};
231+
232+
/*
 * Boot-time setup of the bpf psi infrastructure.
 *
 * Allocates the workqueue used to run handle_psi_event() callbacks and
 * registers the bpf_psi_ops struct ops. If the registration fails, the
 * workqueue is destroyed again.
 *
 * Returns 0 on success, -ENOMEM if the workqueue cannot be allocated, or
 * the error returned by register_bpf_struct_ops().
 */
static int __init bpf_psi_struct_ops_init(void)
{
	int wq_flags = WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI;
	int err;

	bpf_psi_wq = alloc_workqueue("bpf_psi_wq", wq_flags, 0);
	if (!bpf_psi_wq)
		return -ENOMEM;

	err = register_bpf_struct_ops(&bpf_psi_bpf_ops, bpf_psi_ops);
	if (err) {
		/* Terminate the message so it is not merged with the next one */
		pr_warn("error while registering bpf psi struct ops: %d\n", err);
		destroy_workqueue(bpf_psi_wq);
	}

	return err;
}
253+
late_initcall(bpf_psi_struct_ops_init);

0 commit comments

Comments
 (0)