Commit 38aa700

mykyta5 authored and Alexei Starovoitov committed
bpf: task work scheduling kfuncs
Implementation of the new bpf_task_work_schedule kfuncs, which let a BPF
program schedule task_work callbacks for a target task:
  * bpf_task_work_schedule_signal() - schedules with TWA_SIGNAL
  * bpf_task_work_schedule_resume() - schedules with TWA_RESUME

Each map value should embed a struct bpf_task_work, which the kernel side
pairs with struct bpf_task_work_kern, containing a pointer to struct
bpf_task_work_ctx that maintains metadata relevant to the scheduled
callback.

A small state machine and refcounting scheme ensures safe reuse and
teardown. State transitions:

      _______________________________
     |                              |
     v                              |
 [standby] ---> [pending] --> [scheduling] --> [scheduled]
     ^                              |               |
     |                              |_______________|
     |                                      |
     |                                      v
     |                                  [running]
     |______________________________________|

All states may transition into the FREED state:

  [pending] [scheduling] [scheduled] [running] [standby] -> [freed]

The FREED terminal state coordinates with map-value deletion
(bpf_task_work_cancel_and_free()).

Scheduling itself is deferred via irq_work to keep the kfunc callable
from NMI context. Lifetime is guarded with refcount_t + RCU Tasks Trace.

Main components:
  * struct bpf_task_work_ctx - metadata and state management per task work.
  * enum bpf_task_work_state - a state machine to serialize work
    scheduling and execution.
  * bpf_task_work_schedule() - the central helper that initiates scheduling.
  * bpf_task_work_acquire_ctx() - attempts to take ownership of the context
    pointed to by the passed struct bpf_task_work, allocating a new context
    if none exists yet.
  * bpf_task_work_callback() - invoked when the actual task_work runs.
  * bpf_task_work_irq() - an intermediate step (runs in softirq context)
    used to enqueue the task work.
  * bpf_task_work_cancel_and_free() - cleanup for deleted BPF map entries.

Flow of successful task work scheduling:
 1) bpf_task_work_schedule_*() is called from BPF code.
 2) The state transitions from STANDBY to PENDING, marking the context as
    owned by this scheduling attempt.
 3) irq_work_queue() schedules bpf_task_work_irq().
 4) bpf_task_work_irq() transitions the state from PENDING to SCHEDULING.
 5) bpf_task_work_irq() attempts task_work_add(). If successful, the state
    transitions to SCHEDULED.
 6) The task work runs bpf_task_work_callback(), which transitions the
    state to RUNNING.
 7) The BPF callback is executed.
 8) The context is cleaned up, refcounts are released, and the state is set
    back to STANDBY.

Signed-off-by: Mykyta Yatsenko <[email protected]>
Reviewed-by: Andrii Nakryiko <[email protected]>
Reviewed-by: Eduard Zingerman <[email protected]>
Acked-by: Kumar Kartikeya Dwivedi <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
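For illustration only (not part of this commit): a minimal sketch of how a BPF program might use these kfuncs. The map, program, callback and section names are hypothetical, the kfunc extern declaration follows the usual __ksym convention, and aux__prog is assumed to be passed as NULL and filled in by the verifier, as with other __prog-suffixed kfunc arguments.

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only; names and section are hypothetical. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* kfunc added by this commit; aux__prog is assumed to be verifier-patched,
 * so BPF code passes NULL for it.
 */
extern int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
                                         void *map__map,
                                         int (*callback)(struct bpf_map *map, void *key, void *value),
                                         void *aux__prog) __ksym;

struct elem {
        int counter;
        struct bpf_task_work tw;        /* embedded in the map value */
};

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 1);
        __type(key, int);
        __type(value, struct elem);
} tw_map SEC(".maps");

/* Runs later, in the target task's context, via task_work */
static int tw_callback(struct bpf_map *map, void *key, void *value)
{
        struct elem *e = value;

        e->counter++;
        return 0;
}

SEC("tp_btf/sys_enter")
int schedule_task_work(void *ctx)
{
        struct task_struct *task = bpf_get_current_task_btf();
        struct elem *e;
        int key = 0;

        e = bpf_map_lookup_elem(&tw_map, &key);
        if (!e)
                return 0;

        /* defer tw_callback() to run in `task` context with TWA_SIGNAL notification */
        bpf_task_work_schedule_signal(task, &e->tw, &tw_map, tw_callback, NULL);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";

When the map element is deleted, or the last user reference to the map is dropped, bpf_task_work_cancel_and_free() cancels any callback that is still scheduled.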
1 parent 5e8134f commit 38aa700

File tree

1 file changed (+290, −2 lines)


kernel/bpf/helpers.c

Lines changed: 290 additions & 2 deletions
@@ -26,6 +26,8 @@
 #include <linux/bpf_verifier.h>
 #include <linux/uaccess.h>
 #include <linux/verification.h>
+#include <linux/task_work.h>
+#include <linux/irq_work.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -3904,6 +3906,265 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
 
 typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);
 
+enum bpf_task_work_state {
+        /* bpf_task_work is ready to be used */
+        BPF_TW_STANDBY = 0,
+        /* irq work scheduling in progress */
+        BPF_TW_PENDING,
+        /* task work scheduling in progress */
+        BPF_TW_SCHEDULING,
+        /* task work is scheduled successfully */
+        BPF_TW_SCHEDULED,
+        /* callback is running */
+        BPF_TW_RUNNING,
+        /* associated BPF map value is deleted */
+        BPF_TW_FREED,
+};
+
+struct bpf_task_work_ctx {
+        enum bpf_task_work_state state;
+        refcount_t refcnt;
+        struct callback_head work;
+        struct irq_work irq_work;
+        /* bpf_prog that schedules task work */
+        struct bpf_prog *prog;
+        /* task for which callback is scheduled */
+        struct task_struct *task;
+        /* the map and map value associated with this context */
+        struct bpf_map *map;
+        void *map_val;
+        enum task_work_notify_mode mode;
+        bpf_task_work_callback_t callback_fn;
+        struct rcu_head rcu;
+} __aligned(8);
+
+/* Actual type for struct bpf_task_work */
+struct bpf_task_work_kern {
+        struct bpf_task_work_ctx *ctx;
+};
+
+static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
+{
+        if (ctx->prog) {
+                bpf_prog_put(ctx->prog);
+                ctx->prog = NULL;
+        }
+        if (ctx->task) {
+                bpf_task_release(ctx->task);
+                ctx->task = NULL;
+        }
+}
+
+static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
+{
+        return refcount_inc_not_zero(&ctx->refcnt);
+}
+
+static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
+{
+        if (!refcount_dec_and_test(&ctx->refcnt))
+                return;
+
+        bpf_task_work_ctx_reset(ctx);
+
+        /* bpf_mem_free expects migration to be disabled */
+        migrate_disable();
+        bpf_mem_free(&bpf_global_ma, ctx);
+        migrate_enable();
+}
+
+static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
+{
+        /*
+         * Scheduled task_work callback holds ctx ref, so if we successfully
+         * cancelled, we put that ref on callback's behalf. If we couldn't
+         * cancel, callback will inevitably run or has already completed
+         * running, and it would have taken care of its ctx ref itself.
+         */
+        if (task_work_cancel(ctx->task, &ctx->work))
+                bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_callback(struct callback_head *cb)
+{
+        struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
+        enum bpf_task_work_state state;
+        u32 idx;
+        void *key;
+
+        /* Read lock is needed to protect ctx and map key/value access */
+        guard(rcu_tasks_trace)();
+        /*
+         * This callback may start running before bpf_task_work_irq() switched to
+         * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
+         */
+        state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
+        if (state == BPF_TW_SCHEDULED)
+                state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
+        if (state == BPF_TW_FREED) {
+                bpf_task_work_ctx_put(ctx);
+                return;
+        }
+
+        key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
+
+        migrate_disable();
+        ctx->callback_fn(ctx->map, key, ctx->map_val);
+        migrate_enable();
+
+        bpf_task_work_ctx_reset(ctx);
+        (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
+
+        bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_irq(struct irq_work *irq_work)
+{
+        struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+        enum bpf_task_work_state state;
+        int err;
+
+        guard(rcu_tasks_trace)();
+
+        if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
+                bpf_task_work_ctx_put(ctx);
+                return;
+        }
+
+        err = task_work_add(ctx->task, &ctx->work, ctx->mode);
+        if (err) {
+                bpf_task_work_ctx_reset(ctx);
+                /*
+                 * try to switch back to STANDBY for another task_work reuse, but we might have
+                 * gone to FREED already, which is fine as we already cleaned up after ourselves
+                 */
+                (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
+                bpf_task_work_ctx_put(ctx);
+                return;
+        }
+
+        /*
+         * It's technically possible for just scheduled task_work callback to
+         * complete running by now, going SCHEDULING -> RUNNING and then
+         * dropping its ctx refcount. Instead of capturing extra ref just to
+         * protected below ctx->state access, we rely on RCU protection to
+         * perform below SCHEDULING -> SCHEDULED attempt.
+         */
+        state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
+        if (state == BPF_TW_FREED)
+                bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
+                                                         struct bpf_map *map)
+{
+        struct bpf_task_work_kern *twk = (void *)tw;
+        struct bpf_task_work_ctx *ctx, *old_ctx;
+
+        ctx = READ_ONCE(twk->ctx);
+        if (ctx)
+                return ctx;
+
+        ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
+        if (!ctx)
+                return ERR_PTR(-ENOMEM);
+
+        memset(ctx, 0, sizeof(*ctx));
+        refcount_set(&ctx->refcnt, 1); /* map's own ref */
+        ctx->state = BPF_TW_STANDBY;
+
+        old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
+        if (old_ctx) {
+                /*
+                 * tw->ctx is set by concurrent BPF program, release allocated
+                 * memory and try to reuse already set context.
+                 */
+                bpf_mem_free(&bpf_global_ma, ctx);
+                return old_ctx;
+        }
+
+        return ctx; /* Success */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
+                                                           struct bpf_map *map)
+{
+        struct bpf_task_work_ctx *ctx;
+
+        ctx = bpf_task_work_fetch_ctx(tw, map);
+        if (IS_ERR(ctx))
+                return ctx;
+
+        /* try to get ref for task_work callback to hold */
+        if (!bpf_task_work_ctx_tryget(ctx))
+                return ERR_PTR(-EBUSY);
+
+        if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
+                /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
+                bpf_task_work_ctx_put(ctx);
+                return ERR_PTR(-EBUSY);
+        }
+
+        /*
+         * If no process or bpffs is holding a reference to the map, no new callbacks should be
+         * scheduled. This does not address any race or correctness issue, but rather is a policy
+         * choice: dropping user references should stop everything.
+         */
+        if (!atomic64_read(&map->usercnt)) {
+                /* drop ref we just got for task_work callback itself */
+                bpf_task_work_ctx_put(ctx);
+                /* transfer map's ref into cancel_and_free() */
+                bpf_task_work_cancel_and_free(tw);
+                return ERR_PTR(-EBUSY);
+        }
+
+        return ctx;
+}
+
+static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
+                                  struct bpf_map *map, bpf_task_work_callback_t callback_fn,
+                                  struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
+{
+        struct bpf_prog *prog;
+        struct bpf_task_work_ctx *ctx;
+        int err;
+
+        BTF_TYPE_EMIT(struct bpf_task_work);
+
+        prog = bpf_prog_inc_not_zero(aux->prog);
+        if (IS_ERR(prog))
+                return -EBADF;
+        task = bpf_task_acquire(task);
+        if (!task) {
+                err = -EBADF;
+                goto release_prog;
+        }
+
+        ctx = bpf_task_work_acquire_ctx(tw, map);
+        if (IS_ERR(ctx)) {
+                err = PTR_ERR(ctx);
+                goto release_all;
+        }
+
+        ctx->task = task;
+        ctx->callback_fn = callback_fn;
+        ctx->prog = prog;
+        ctx->mode = mode;
+        ctx->map = map;
+        ctx->map_val = (void *)tw - map->record->task_work_off;
+        init_task_work(&ctx->work, bpf_task_work_callback);
+        init_irq_work(&ctx->irq_work, bpf_task_work_irq);
+
+        irq_work_queue(&ctx->irq_work);
+        return 0;
+
+release_all:
+        bpf_task_release(task);
+release_prog:
+        bpf_prog_put(prog);
+        return err;
+}
+
 /**
  * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
  * @task: Task struct for which callback should be scheduled
@@ -3918,7 +4179,7 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b
                                               void *map__map, bpf_task_work_callback_t callback,
                                               void *aux__prog)
 {
-        return 0;
+        return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
 }
 
 /**
@@ -3935,13 +4196,38 @@ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct b
                                               void *map__map, bpf_task_work_callback_t callback,
                                               void *aux__prog)
 {
-        return 0;
+        return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
 }
 
 __bpf_kfunc_end_defs();
 
+static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
+{
+        struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+
+        bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
+        bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
+}
+
 void bpf_task_work_cancel_and_free(void *val)
 {
+        struct bpf_task_work_kern *twk = val;
+        struct bpf_task_work_ctx *ctx;
+        enum bpf_task_work_state state;
+
+        ctx = xchg(&twk->ctx, NULL);
+        if (!ctx)
+                return;
+
+        state = xchg(&ctx->state, BPF_TW_FREED);
+        if (state == BPF_TW_SCHEDULED) {
+                /* run in irq_work to avoid locks in NMI */
+                init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
+                irq_work_queue(&ctx->irq_work);
+                return;
+        }
+
+        bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
 }
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -4086,6 +4372,8 @@ BTF_ID_FLAGS(func, bpf_strnstr);
 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
 #endif
 BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
