 #include <linux/kasan.h>
 #include <linux/bpf_verifier.h>
 #include <linux/uaccess.h>
+#include <linux/task_work.h>
+#include <linux/irq_work.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -3747,6 +3749,284 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
 
 typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);
 
+enum bpf_task_work_state {
+	/* bpf_task_work is ready to be used */
+	BPF_TW_STANDBY = 0,
+	/* irq work scheduling in progress */
+	BPF_TW_PENDING,
+	/* task work scheduling in progress */
+	BPF_TW_SCHEDULING,
+	/* task work is scheduled successfully */
+	BPF_TW_SCHEDULED,
+	/* callback is running */
+	BPF_TW_RUNNING,
+	/* associated BPF map value is deleted */
+	BPF_TW_FREED,
+};
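+
+/*
+ * State transitions, as implemented below:
+ *   STANDBY -> PENDING              context claimed in bpf_task_work_acquire_ctx()
+ *   PENDING -> SCHEDULING           bpf_task_work_irq() is about to call task_work_add()
+ *   SCHEDULING -> SCHEDULED         task_work_add() succeeded
+ *   SCHEDULING -> STANDBY           task_work_add() failed, context is reusable
+ *   SCHEDULING|SCHEDULED -> RUNNING bpf_task_work_callback() started
+ *   RUNNING -> STANDBY              callback finished, context is reusable
+ * bpf_task_work_cancel_and_free() may move any state to FREED at any time.
+ */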
+
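+/*
+ * Reference counting: one reference is owned by the map value that embeds
+ * struct bpf_task_work (put in bpf_task_work_cancel_and_free()), and an
+ * additional reference is held on behalf of a scheduled task_work callback
+ * from bpf_task_work_acquire_ctx() until the callback completes, fails to be
+ * scheduled, or is cancelled.
+ */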
+struct bpf_task_work_ctx {
+	enum bpf_task_work_state state;
+	refcount_t refcnt;
+	struct callback_head work;
+	struct irq_work irq_work;
+	/* bpf_prog that schedules task work */
+	struct bpf_prog *prog;
+	/* task for which callback is scheduled */
+	struct task_struct *task;
+	/* the map and map value associated with this context */
+	struct bpf_map *map;
+	void *map_val;
+	enum task_work_notify_mode mode;
+	bpf_task_work_callback_t callback_fn;
+	struct rcu_head rcu;
+} __aligned(8);
+
+/* Actual type for struct bpf_task_work */
+struct bpf_task_work_kern {
+	struct bpf_task_work_ctx *ctx;
+};
+
+static void bpf_task_work_ctx_free_rcu_gp(struct rcu_head *rcu)
+{
+	struct bpf_task_work_ctx *ctx = container_of(rcu, struct bpf_task_work_ctx, rcu);
+
+	/* bpf_mem_free expects migration to be disabled */
+	migrate_disable();
+	bpf_mem_free(&bpf_global_ma, ctx);
+	migrate_enable();
+}
+
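+/*
+ * Freeing callback chained from call_rcu_tasks_trace(): if the tasks-trace
+ * grace period also implied a regular RCU grace period, free right away,
+ * otherwise wait for an additional regular RCU grace period first.
+ */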
+static void bpf_task_work_ctx_free_mult_rcu_gp(struct rcu_head *rcu)
+{
+	if (rcu_trace_implies_rcu_gp())
+		bpf_task_work_ctx_free_rcu_gp(rcu);
+	else
+		call_rcu(rcu, bpf_task_work_ctx_free_rcu_gp);
+}
+
+static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
+{
+	if (ctx->prog) {
+		bpf_prog_put(ctx->prog);
+		ctx->prog = NULL;
+	}
+	if (ctx->task) {
+		bpf_task_release(ctx->task);
+		ctx->task = NULL;
+	}
+}
+
+static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
+{
+	return refcount_inc_not_zero(&ctx->refcnt);
+}
+
+static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
+{
+	if (!refcount_dec_and_test(&ctx->refcnt))
+		return;
+
+	bpf_task_work_ctx_reset(ctx);
+	call_rcu_tasks_trace(&ctx->rcu, bpf_task_work_ctx_free_mult_rcu_gp);
+}
+
+static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
+{
+	/*
+	 * A scheduled task_work callback holds a ctx ref, so if we successfully
+	 * cancelled it, we put that ref on the callback's behalf. If we couldn't
+	 * cancel, the callback will inevitably run (or has already completed)
+	 * and will take care of its ctx ref itself.
+	 */
+	if (task_work_cancel(ctx->task, &ctx->work))
+		bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_callback(struct callback_head *cb)
+{
+	struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
+	enum bpf_task_work_state state;
+	u32 idx;
+	void *key;
+
+	/* rcu_tasks_trace read section is needed to protect ctx and map key/value access */
+	guard(rcu_tasks_trace)();
+	/*
+	 * This callback may start running before bpf_task_work_irq() has switched
+	 * to the SCHEDULED state, so handle both transition variants,
+	 * SCHEDULING|SCHEDULED -> RUNNING.
+	 */
+	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
+	if (state == BPF_TW_SCHEDULED)
+		state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
+	if (state == BPF_TW_FREED) {
+		bpf_task_work_ctx_put(ctx);
+		return;
+	}
+
+	key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
+
+	migrate_disable();
+	ctx->callback_fn(ctx->map, key, ctx->map_val);
+	migrate_enable();
+
+	bpf_task_work_ctx_reset(ctx);
+	(void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
+
+	bpf_task_work_ctx_put(ctx);
+}
+
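+/*
+ * Deferred scheduling step, run from irq_work context: task_work_add() takes
+ * locks and is not safe to call directly from a BPF program that may execute
+ * in NMI context (see bpf_task_work_cancel_and_free() for the same reasoning),
+ * so it is called from here instead.
+ */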
+static void bpf_task_work_irq(struct irq_work *irq_work)
+{
+	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+	enum bpf_task_work_state state;
+	int err;
+
+	guard(rcu_tasks_trace)();
+
+	if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
+		bpf_task_work_ctx_put(ctx);
+		return;
+	}
+
+	err = task_work_add(ctx->task, &ctx->work, ctx->mode);
+	if (err) {
+		bpf_task_work_ctx_reset(ctx);
+		/*
+		 * Try to switch back to STANDBY so the task_work can be reused,
+		 * but we might have gone to FREED already, which is fine as we
+		 * have already cleaned up after ourselves.
+		 */
+		(void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
+		bpf_task_work_ctx_put(ctx);
+		return;
+	}
+
+	/*
+	 * It's technically possible for the just-scheduled task_work callback to
+	 * have completed by now, going SCHEDULING -> RUNNING and then dropping
+	 * its ctx refcount. Instead of taking an extra ref just to protect the
+	 * ctx->state access below, we rely on RCU protection to perform the
+	 * SCHEDULING -> SCHEDULED attempt.
+	 */
+	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
+	if (state == BPF_TW_FREED)
+		bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
+}
+
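+/*
+ * Lazily allocate the context and publish it into the map value
+ * (struct bpf_task_work_kern). A concurrent BPF program may win the cmpxchg
+ * race, in which case the freshly allocated context is discarded and the
+ * already published one is reused.
+ */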
+static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
+							 struct bpf_map *map)
+{
+	struct bpf_task_work_kern *twk = (void *)tw;
+	struct bpf_task_work_ctx *ctx, *old_ctx;
+
+	ctx = READ_ONCE(twk->ctx);
+	if (ctx)
+		return ctx;
+
+	ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	memset(ctx, 0, sizeof(*ctx));
+	refcount_set(&ctx->refcnt, 1); /* map's own ref */
+	ctx->state = BPF_TW_STANDBY;
+
+	old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
+	if (old_ctx) {
+		/*
+		 * tw->ctx was set by a concurrent BPF program; release the
+		 * allocated memory and reuse the already published context.
+		 */
+		bpf_mem_free(&bpf_global_ma, ctx);
+		return old_ctx;
+	}
+
+	return ctx; /* Success */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
+							    struct bpf_map *map)
+{
+	struct bpf_task_work_ctx *ctx;
+
+	/* Early check to avoid any work; we'll double check at the end again. */
+	if (!atomic64_read(&map->usercnt))
+		return ERR_PTR(-EBUSY);
+
+	ctx = bpf_task_work_fetch_ctx(tw, map);
+	if (IS_ERR(ctx))
+		return ctx;
+
+	/* Try to get a ref for the task_work callback to hold. */
+	if (!bpf_task_work_ctx_tryget(ctx))
+		return ERR_PTR(-EBUSY);
+
+	if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
+		/* We lost the acquire race or map_release_uref() stole the context; put the ref and bail. */
+		bpf_task_work_ctx_put(ctx);
+		return ERR_PTR(-EBUSY);
+	}
+
+	/*
+	 * Double check that map->usercnt wasn't dropped while we were preparing
+	 * the context; if it was, we need to clean up as if map_release_uref()
+	 * was called. bpf_task_work_cancel_and_free() is safe to be called
+	 * twice on the same task work.
+	 */
+	if (!atomic64_read(&map->usercnt)) {
+		/* drop the ref we just got for the task_work callback itself */
+		bpf_task_work_ctx_put(ctx);
+		/* transfer the map's ref into cancel_and_free() */
+		bpf_task_work_cancel_and_free(tw);
+		return ERR_PTR(-EBUSY);
+	}
+
+	return ctx;
+}
+
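+/*
+ * Common implementation for the task_work scheduling kfuncs below: take
+ * references on the BPF program and the target task, claim the per-map-value
+ * context, and defer the actual task_work_add() to irq_work context.
+ */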
+static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
+				  struct bpf_map *map, bpf_task_work_callback_t callback_fn,
+				  struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
+{
+	struct bpf_prog *prog;
+	struct bpf_task_work_ctx *ctx;
+	int err;
+
+	BTF_TYPE_EMIT(struct bpf_task_work);
+
+	prog = bpf_prog_inc_not_zero(aux->prog);
+	if (IS_ERR(prog))
+		return -EBADF;
+	task = bpf_task_acquire(task);
+	if (!task) {
+		err = -EBADF;
+		goto release_prog;
+	}
+
+	ctx = bpf_task_work_acquire_ctx(tw, map);
+	if (IS_ERR(ctx)) {
+		err = PTR_ERR(ctx);
+		goto release_all;
+	}
+
+	ctx->task = task;
+	ctx->callback_fn = callback_fn;
+	ctx->prog = prog;
+	ctx->mode = mode;
+	ctx->map = map;
+	ctx->map_val = (void *)tw - map->record->task_work_off;
+	init_task_work(&ctx->work, bpf_task_work_callback);
+	init_irq_work(&ctx->irq_work, bpf_task_work_irq);
+
+	irq_work_queue(&ctx->irq_work);
+	return 0;
+
+release_all:
+	bpf_task_release(task);
+release_prog:
+	bpf_prog_put(prog);
+	return err;
+}
+
 /**
  * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
  * @task: Task struct for which callback should be scheduled
@@ -3761,7 +4041,7 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b
 					      void *map__map, bpf_task_work_callback_t callback,
 					      void *aux__prog)
 {
-	return 0;
+	return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
 }
 
 /**
@@ -3778,13 +4058,38 @@ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct b
 					      void *map__map, bpf_task_work_callback_t callback,
 					      void *aux__prog)
 {
-	return 0;
+	return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
 }
 
 __bpf_kfunc_end_defs();
 
+static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
+{
+	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+
+	bpf_task_work_cancel(ctx);  /* this might put the task_work callback's ref */
+	bpf_task_work_ctx_put(ctx); /* and here we put the map's own ref that was transferred to us */
+}
+
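+/*
+ * Called when the map value owning this bpf_task_work is deleted or when the
+ * map's user reference is dropped (map_release_uref()): detach the context,
+ * mark it FREED, and cancel an already scheduled task_work if necessary.
+ */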
 void bpf_task_work_cancel_and_free(void *val)
 {
+	struct bpf_task_work_kern *twk = val;
+	struct bpf_task_work_ctx *ctx;
+	enum bpf_task_work_state state;
+
+	ctx = xchg(&twk->ctx, NULL);
+	if (!ctx)
+		return;
+
+	state = xchg(&ctx->state, BPF_TW_FREED);
+	if (state == BPF_TW_SCHEDULED) {
+		/* run in irq_work to avoid locks in NMI */
+		init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
+		irq_work_queue(&ctx->irq_work);
+		return;
+	}
+
+	bpf_task_work_ctx_put(ctx); /* put the bpf map's ref */
 }
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -3921,6 +4226,8 @@ BTF_ID_FLAGS(func, bpf_strnstr);
 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
 #endif
 BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
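For context, below is a minimal, hypothetical sketch of how a BPF program might exercise the new kfuncs. It is not part of this commit: the map layout, program section, and callback body are illustrative assumptions, and the final aux__prog argument is passed as NULL on the assumption that the verifier fills in the calling program for __prog-suffixed kfunc arguments, as it does for other kfuncs with that convention.

/* SPDX-License-Identifier: GPL-2.0 */
/* Hypothetical usage sketch, not part of this commit. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct elem {
	struct bpf_task_work tw;
};

/* Single-slot array so the lookup below always succeeds. */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} tw_map SEC(".maps");

/* Assumed BPF-side declaration matching the kernel kfunc signature above. */
int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
				  void *map__map,
				  int (*callback)(struct bpf_map *map, void *key, void *value),
				  void *aux__prog) __ksym;

static int tw_callback(struct bpf_map *map, void *key, void *value)
{
	/* Runs later, from task_work context of the target task. */
	return 0;
}

SEC("tp_btf/sys_enter")
int schedule_work(void *ctx)
{
	struct task_struct *task = bpf_get_current_task_btf();
	struct elem *val;
	int key = 0;

	val = bpf_map_lookup_elem(&tw_map, &key);
	if (!val)
		return 0;

	/* aux__prog is assumed to be supplied by the verifier; pass NULL here. */
	bpf_task_work_schedule_signal(task, &val->tw, &tw_map, tw_callback, NULL);
	return 0;
}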