#include <linux/kasan.h>
#include <linux/bpf_verifier.h>
#include <linux/uaccess.h>
+#include <linux/task_work.h>
+#include <linux/irq_work.h>

#include "../../lib/kstrtox.h"

@@ -3746,6 +3748,284 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)

typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);

+enum bpf_task_work_state {
+	/* bpf_task_work is ready to be used */
+	BPF_TW_STANDBY = 0,
+	/* irq work scheduling in progress */
+	BPF_TW_PENDING,
+	/* task work scheduling in progress */
+	BPF_TW_SCHEDULING,
+	/* task work is scheduled successfully */
+	BPF_TW_SCHEDULED,
+	/* callback is running */
+	BPF_TW_RUNNING,
+	/* associated BPF map value is deleted */
+	BPF_TW_FREED,
+};
+
+struct bpf_task_work_ctx {
+	enum bpf_task_work_state state;
+	refcount_t refcnt;
+	struct callback_head work;
+	struct irq_work irq_work;
+	/* bpf_prog that schedules task work */
+	struct bpf_prog *prog;
+	/* task for which callback is scheduled */
+	struct task_struct *task;
+	/* the map and map value associated with this context */
+	struct bpf_map *map;
+	void *map_val;
+	enum task_work_notify_mode mode;
+	bpf_task_work_callback_t callback_fn;
+	struct rcu_head rcu;
+} __aligned(8);
+
+/* Actual type for struct bpf_task_work */
+struct bpf_task_work_kern {
+	struct bpf_task_work_ctx *ctx;
+};
+
+static void bpf_task_work_ctx_free_rcu_gp(struct rcu_head *rcu)
+{
+	struct bpf_task_work_ctx *ctx = container_of(rcu, struct bpf_task_work_ctx, rcu);
+
+	/* bpf_mem_free expects migration to be disabled */
+	migrate_disable();
+	bpf_mem_free(&bpf_global_ma, ctx);
+	migrate_enable();
+}
+
+static void bpf_task_work_ctx_free_mult_rcu_gp(struct rcu_head *rcu)
+{
+	if (rcu_trace_implies_rcu_gp())
+		bpf_task_work_ctx_free_rcu_gp(rcu);
+	else
+		call_rcu(rcu, bpf_task_work_ctx_free_rcu_gp);
+}
+
+static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
+{
+	if (ctx->prog) {
+		bpf_prog_put(ctx->prog);
+		ctx->prog = NULL;
+	}
+	if (ctx->task) {
+		bpf_task_release(ctx->task);
+		ctx->task = NULL;
+	}
+}
+
+static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
+{
+	return refcount_inc_not_zero(&ctx->refcnt);
+}
+
+static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
+{
+	if (!refcount_dec_and_test(&ctx->refcnt))
+		return;
+
+	bpf_task_work_ctx_reset(ctx);
+	call_rcu_tasks_trace(&ctx->rcu, bpf_task_work_ctx_free_mult_rcu_gp);
+}
+
+static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
+{
+	/*
+	 * Scheduled task_work callback holds ctx ref, so if we successfully
+	 * cancelled, we put that ref on callback's behalf. If we couldn't
+	 * cancel, callback will inevitably run or has already completed
+	 * running, and it would have taken care of its ctx ref itself.
+	 */
+	if (task_work_cancel(ctx->task, &ctx->work))
+		bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_callback(struct callback_head *cb)
+{
+	struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
+	enum bpf_task_work_state state;
+	u32 idx;
+	void *key;
+
+	/* Read lock is needed to protect ctx and map key/value access */
+	guard(rcu_tasks_trace)();
+	/*
+	 * This callback may start running before bpf_task_work_irq() switched to
+	 * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
+	 */
+	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
+	if (state == BPF_TW_SCHEDULED)
+		state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
+	if (state == BPF_TW_FREED) {
+		bpf_task_work_ctx_put(ctx);
+		return;
+	}
+
+	key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
+
+	migrate_disable();
+	ctx->callback_fn(ctx->map, key, ctx->map_val);
+	migrate_enable();
+
+	bpf_task_work_ctx_reset(ctx);
+	(void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
+
+	bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_irq(struct irq_work *irq_work)
+{
+	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+	enum bpf_task_work_state state;
+	int err;
+
+	guard(rcu_tasks_trace)();
+
+	if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
+		bpf_task_work_ctx_put(ctx);
+		return;
+	}
+
+	err = task_work_add(ctx->task, &ctx->work, ctx->mode);
+	if (err) {
+		bpf_task_work_ctx_reset(ctx);
+		/*
+		 * Try to switch back to STANDBY so the task_work can be reused, but we
+		 * might have gone to FREED already, which is fine as we already cleaned
+		 * up after ourselves.
+		 */
+		(void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
+		bpf_task_work_ctx_put(ctx);
+		return;
+	}
+
+	/*
+	 * It's technically possible for the just-scheduled task_work callback to
+	 * have completed its run by now, going SCHEDULING -> RUNNING and then
+	 * dropping its ctx refcount. Instead of taking an extra ref just to
+	 * protect the ctx->state access below, we rely on RCU protection to
+	 * perform the SCHEDULING -> SCHEDULED attempt below.
+	 */
+	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
+	if (state == BPF_TW_FREED)
+		bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
+							  struct bpf_map *map)
+{
+	struct bpf_task_work_kern *twk = (void *)tw;
+	struct bpf_task_work_ctx *ctx, *old_ctx;
+
+	ctx = READ_ONCE(twk->ctx);
+	if (ctx)
+		return ctx;
+
+	ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	memset(ctx, 0, sizeof(*ctx));
+	refcount_set(&ctx->refcnt, 1); /* map's own ref */
+	ctx->state = BPF_TW_STANDBY;
+
+	old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
+	if (old_ctx) {
+		/*
+		 * tw->ctx was set by a concurrent BPF program; release the
+		 * allocated memory and reuse the already set context.
+		 */
+		bpf_mem_free(&bpf_global_ma, ctx);
+		return old_ctx;
+	}
+
+	return ctx; /* Success */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
+							    struct bpf_map *map)
+{
+	struct bpf_task_work_ctx *ctx;
+
+	/* early check to avoid any work; we'll double-check at the end again */
+	if (!atomic64_read(&map->usercnt))
+		return ERR_PTR(-EBUSY);
+
+	ctx = bpf_task_work_fetch_ctx(tw, map);
+	if (IS_ERR(ctx))
+		return ctx;
+
+	/* try to get a ref for the task_work callback to hold */
+	if (!bpf_task_work_ctx_tryget(ctx))
+		return ERR_PTR(-EBUSY);
+
+	if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
+		/* we lost the acquire race or map_release_uref() stole it from us; put the ref and bail */
+		bpf_task_work_ctx_put(ctx);
+		return ERR_PTR(-EBUSY);
+	}
+
+	/*
+	 * Double-check that map->usercnt wasn't dropped while we were
+	 * preparing the context, and if it was, clean up as if
+	 * map_release_uref() was called; bpf_task_work_cancel_and_free()
+	 * is safe to be called twice on the same task work.
+	 */
+	if (!atomic64_read(&map->usercnt)) {
+		/* drop the ref we just got for the task_work callback itself */
+		bpf_task_work_ctx_put(ctx);
+		/* transfer map's ref into cancel_and_free() */
+		bpf_task_work_cancel_and_free(tw);
+		return ERR_PTR(-EBUSY);
+	}
+
+	return ctx;
+}
+
+static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
+				  struct bpf_map *map, bpf_task_work_callback_t callback_fn,
+				  struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
+{
+	struct bpf_prog *prog;
+	struct bpf_task_work_ctx *ctx;
+	int err;
+
+	BTF_TYPE_EMIT(struct bpf_task_work);
+
+	prog = bpf_prog_inc_not_zero(aux->prog);
+	if (IS_ERR(prog))
+		return -EBADF;
+	task = bpf_task_acquire(task);
+	if (!task) {
+		err = -EBADF;
+		goto release_prog;
+	}
+
+	ctx = bpf_task_work_acquire_ctx(tw, map);
+	if (IS_ERR(ctx)) {
+		err = PTR_ERR(ctx);
+		goto release_all;
+	}
+
+	ctx->task = task;
+	ctx->callback_fn = callback_fn;
+	ctx->prog = prog;
+	ctx->mode = mode;
+	ctx->map = map;
+	ctx->map_val = (void *)tw - map->record->task_work_off;
+	init_task_work(&ctx->work, bpf_task_work_callback);
+	init_irq_work(&ctx->irq_work, bpf_task_work_irq);
+
+	irq_work_queue(&ctx->irq_work);
+	return 0;
+
+release_all:
+	bpf_task_release(task);
+release_prog:
+	bpf_prog_put(prog);
+	return err;
+}
+
/**
 * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
 * @task: Task struct for which callback should be scheduled
@@ -3760,7 +4040,7 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b
					      void *map__map, bpf_task_work_callback_t callback,
					      void *aux__prog)
{
-	return 0;
+	return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
}

/**
@@ -3777,13 +4057,38 @@ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct b
					      void *map__map, bpf_task_work_callback_t callback,
					      void *aux__prog)
{
-	return 0;
+	return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
}

__bpf_kfunc_end_defs();

+static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
+{
+	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+
+	bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
+	bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
+}
+
void bpf_task_work_cancel_and_free(void *val)
{
+	struct bpf_task_work_kern *twk = val;
+	struct bpf_task_work_ctx *ctx;
+	enum bpf_task_work_state state;
+
+	ctx = xchg(&twk->ctx, NULL);
+	if (!ctx)
+		return;
+
+	state = xchg(&ctx->state, BPF_TW_FREED);
+	if (state == BPF_TW_SCHEDULED) {
+		/* run in irq_work to avoid locks in NMI */
+		init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
+		irq_work_queue(&ctx->irq_work);
+		return;
+	}
+
+	bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
}

BTF_KFUNCS_START(generic_btf_ids)
@@ -3920,6 +4225,8 @@ BTF_ID_FLAGS(func, bpf_strnstr);
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(common_btf_ids)

static const struct btf_kfunc_id_set common_kfunc_set = {
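For context on how the two new kfuncs are meant to be used from the BPF side, here is a minimal, untested sketch (not part of this commit). The map name, value layout, program section, and the extern declaration are illustrative assumptions that simply mirror the kernel-side signatures above; in particular, passing NULL for aux__prog assumes the verifier supplies __prog-suffixed arguments, which this diff does not show.

/* Illustrative BPF program sketch; assumes vmlinux.h provides
 * struct bpf_task_work, struct bpf_map and struct task_struct.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* Assumed extern declaration mirroring the kernel-side kfunc signature. */
extern int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
					 void *map__map,
					 int (callback)(struct bpf_map *map, void *key, void *value),
					 void *aux__prog) __ksym;

/* Map value embeds struct bpf_task_work, which the kfunc operates on. */
struct elem {
	struct bpf_task_work tw;
	__u64 payload;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} demo_map SEC(".maps");

/* Runs later in the target task's context, scheduled via task_work with TWA_SIGNAL. */
static int demo_callback(struct bpf_map *map, void *key, void *value)
{
	return 0;
}

SEC("tp_btf/sched_switch")
int demo_schedule(__u64 *ctx)
{
	struct task_struct *task = bpf_get_current_task_btf();
	int key = 0;
	struct elem *val = bpf_map_lookup_elem(&demo_map, &key);

	if (val)
		/* NULL for aux__prog: assumed to be filled in by the verifier */
		bpf_task_work_schedule_signal(task, &val->tw, &demo_map,
					      demo_callback, NULL);
	return 0;
}

char _license[] SEC("license") = "GPL";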