Commit 348f611

Committed by Alexei Starovoitov
Merge branch 'bpf-introduce-deferred-task-context-execution'
Mykyta Yatsenko says:

====================
bpf: Introduce deferred task context execution

From: Mykyta Yatsenko <[email protected]>

This patch series introduces a new mechanism for BPF programs to schedule
deferred execution in the context of a specific task, using the kernel's
task_work infrastructure. The new bpf_task_work interface enables BPF use
cases that require sleepable subprogram execution in task context, for
example scheduling a sleepable function from a context that does not allow
sleeping, such as NMI.

The kfuncs bpf_task_work_schedule_signal() and
bpf_task_work_schedule_resume() schedule BPF callbacks using the two
notification modes of task_work (TWA_SIGNAL and TWA_RESUME, respectively).

The implementation manages scheduling state via metadata objects (struct
bpf_task_work_context), pointers to which are stored in BPF map values.
State transitions are handled by an atomic state machine
(bpf_task_work_state) to ensure correctness under concurrent usage and
deletion; object lifetime is guarded by refcounting and RCU Tasks Trace.
The kfuncs call task_work_add() indirectly via irq_work to avoid taking
locks in potentially NMI context.

Changelog:
---
v7 -> v8
v7: https://lore.kernel.org/bpf/[email protected]/
 * Fix an unused-variable warning in patch 1
 * Decrease stress test time from 2 seconds to 1
 * Went through CI warnings; besides the unused variable, there are only
   2 new warnings in kernel/bpf/helpers.c related to the newly introduced
   kfuncs, and these look expected

v6 -> v7
v6: https://lore.kernel.org/bpf/[email protected]/
 * Add a stress test
 * Extend the refactoring in patch 1
 * Change a comment and remove one check for map->usercnt in patch 7

v5 -> v6
v5: https://lore.kernel.org/bpf/[email protected]/
 * Fix readability in verifier.c:check_map_field_pointer()
 * Remove BUG_ON from helpers.c

v4 -> v5
v4: https://lore.kernel.org/all/[email protected]/
 * Fix an invalid/NULL pointer dereference bug reported by syzbot
 * Nits in selftests

v3 -> v4
v3: https://lore.kernel.org/all/[email protected]/
 * Modify async callback return value processing in the verifier to allow
   non-zero return values
 * Change the return type of the callback from void to int, as the
   verifier expects a scalar value
 * Switch to void * for bpf_map kfunc arguments to avoid casts
 * Address numerous nits and small improvements

v2 -> v3
v2: https://lore.kernel.org/all/[email protected]/
 * Introduce ref counting
 * Add patches with minor verifier and btf.c refactorings to avoid code
   duplication
 * Rework initiation of task work scheduling to handle a race with the
   map usercnt dropping to zero
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
2 parents 58a5820 + c6ae18e commit 348f611
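
For orientation, here is a minimal sketch of how a BPF program might use the
new interface. The map layout mirrors the existing bpf_timer/bpf_wq pattern;
the kfunc and callback prototypes below are illustrative assumptions pieced
together from the cover letter (the authoritative declarations live in
kernel/bpf/helpers.c and the selftests, which are not part of this excerpt):

/* Hypothetical usage sketch -- not part of this diff. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct elem {
	struct bpf_task_work tw;	/* special BTF-managed field, like bpf_timer */
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, int);
	__type(value, struct elem);
} tw_map SEC(".maps");

/* Assumed kfunc prototype; the map argument is void * per the v3 -> v4
 * changelog ("Switched to void* for bpf_map API kfunc arguments").
 */
extern int bpf_task_work_schedule_resume(struct task_struct *task,
					 struct bpf_task_work *tw,
					 void *map__map,
					 int (*callback)(struct bpf_map *map, void *key, void *value),
					 void *aux__prog) __ksym;

/* Per the v3 -> v4 notes, the callback returns int because the verifier
 * expects a scalar. It runs in the target task's context, where sleepable
 * operations are allowed.
 */
static int process_in_task(struct bpf_map *map, void *key, void *value)
{
	return 0;
}

SEC("perf_event")
int schedule_deferred(void *ctx)
{
	struct task_struct *task = bpf_get_current_task_btf();
	int key = 0;
	struct elem *val = bpf_map_lookup_elem(&tw_map, &key);

	if (!val)
		return 0;
	/* Safe even from NMI: task_work_add() is reached via irq_work. */
	bpf_task_work_schedule_resume(task, &val->tw, &tw_map,
				      process_in_task, NULL);
	return 0;
}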

14 files changed (+1148, -112 lines)

include/linux/bpf.h

Lines changed: 11 additions & 0 deletions
@@ -209,6 +209,7 @@ enum btf_field_type {
 	BPF_WORKQUEUE = (1 << 10),
 	BPF_UPTR = (1 << 11),
 	BPF_RES_SPIN_LOCK = (1 << 12),
+	BPF_TASK_WORK = (1 << 13),
 };
 
 enum bpf_cgroup_storage_type {
@@ -262,6 +263,7 @@ struct btf_record {
 	int timer_off;
 	int wq_off;
 	int refcount_off;
+	int task_work_off;
 	struct btf_field fields[];
 };
 
@@ -363,6 +365,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 		return "bpf_rb_node";
 	case BPF_REFCOUNT:
 		return "bpf_refcount";
+	case BPF_TASK_WORK:
+		return "bpf_task_work";
 	default:
 		WARN_ON_ONCE(1);
 		return "unknown";
@@ -401,6 +405,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_rb_node);
 	case BPF_REFCOUNT:
 		return sizeof(struct bpf_refcount);
+	case BPF_TASK_WORK:
+		return sizeof(struct bpf_task_work);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -433,6 +439,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_rb_node);
 	case BPF_REFCOUNT:
 		return __alignof__(struct bpf_refcount);
+	case BPF_TASK_WORK:
+		return __alignof__(struct bpf_task_work);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -464,6 +472,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
 	case BPF_KPTR_REF:
 	case BPF_KPTR_PERCPU:
 	case BPF_UPTR:
+	case BPF_TASK_WORK:
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -600,6 +609,7 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
 			   bool lock_src);
 void bpf_timer_cancel_and_free(void *timer);
 void bpf_wq_cancel_and_free(void *timer);
+void bpf_task_work_cancel_and_free(void *timer);
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock);
 void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
@@ -2426,6 +2436,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj);
+void bpf_obj_free_task_work(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);

include/uapi/linux/bpf.h

Lines changed: 4 additions & 0 deletions
@@ -7436,6 +7436,10 @@ struct bpf_timer {
 	__u64 __opaque[2];
 } __attribute__((aligned(8)));
 
+struct bpf_task_work {
+	__u64 __opaque;
+} __attribute__((aligned(8)));
+
 struct bpf_wq {
 	__u64 __opaque[2];
 } __attribute__((aligned(8)));
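
A single opaque __u64 suffices here because, per the cover letter, the kernel
stores a pointer to its internal metadata object (struct
bpf_task_work_context) in the map value. A hedged sketch of that pattern;
the helper and the exact access discipline below are illustrative
assumptions, the real code being in kernel/bpf/helpers.c, outside this
excerpt:

/* Illustrative only: an opaque uapi word carrying a kernel pointer. */
struct bpf_task_work_context;	/* refcounted kernel-side state, not uapi */

static struct bpf_task_work_context *task_work_ctx(struct bpf_task_work *tw)
{
	/* The uapi struct only reserves aligned space in the map value;
	 * the kernel publishes and reads the context pointer atomically.
	 */
	return (struct bpf_task_work_context *)(unsigned long)READ_ONCE(tw->__opaque);
}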

kernel/bpf/arraymap.c

Lines changed: 5 additions & 3 deletions
@@ -443,20 +443,22 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
 	return (void *)round_down((unsigned long)array, PAGE_SIZE);
 }
 
-static void array_map_free_timers_wq(struct bpf_map *map)
+static void array_map_free_internal_structs(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
 
 	/* We don't reset or free fields other than timer and workqueue
 	 * on uref dropping to zero.
 	 */
-	if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) {
+	if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
 		for (i = 0; i < array->map.max_entries; i++) {
 			if (btf_record_has_field(map->record, BPF_TIMER))
 				bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
 			if (btf_record_has_field(map->record, BPF_WORKQUEUE))
 				bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
+			if (btf_record_has_field(map->record, BPF_TASK_WORK))
+				bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i));
 		}
 	}
 }
@@ -795,7 +797,7 @@ const struct bpf_map_ops array_map_ops = {
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
-	.map_release_uref = array_map_free_timers_wq,
+	.map_release_uref = array_map_free_internal_structs,
 	.map_lookup_elem = array_map_lookup_elem,
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,

kernel/bpf/btf.c

Lines changed: 40 additions & 51 deletions
@@ -3478,60 +3478,45 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
 	return BTF_FIELD_FOUND;
 }
 
-#define field_mask_test_name(field_type, field_type_str) \
-	if (field_mask & field_type && !strcmp(name, field_type_str)) { \
-		type = field_type;					\
-		goto end;						\
-	}
-
 static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type,
-			      u32 field_mask, u32 *seen_mask,
-			      int *align, int *sz)
-{
-	int type = 0;
+			      u32 field_mask, u32 *seen_mask, int *align, int *sz)
+{
+	const struct {
+		enum btf_field_type type;
+		const char *const name;
+		const bool is_unique;
+	} field_types[] = {
+		{ BPF_SPIN_LOCK, "bpf_spin_lock", true },
+		{ BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true },
+		{ BPF_TIMER, "bpf_timer", true },
+		{ BPF_WORKQUEUE, "bpf_wq", true },
+		{ BPF_TASK_WORK, "bpf_task_work", true },
+		{ BPF_LIST_HEAD, "bpf_list_head", false },
+		{ BPF_LIST_NODE, "bpf_list_node", false },
+		{ BPF_RB_ROOT, "bpf_rb_root", false },
+		{ BPF_RB_NODE, "bpf_rb_node", false },
+		{ BPF_REFCOUNT, "bpf_refcount", false },
+	};
+	int type = 0, i;
 	const char *name = __btf_name_by_offset(btf, var_type->name_off);
-
-	if (field_mask & BPF_SPIN_LOCK) {
-		if (!strcmp(name, "bpf_spin_lock")) {
-			if (*seen_mask & BPF_SPIN_LOCK)
-				return -E2BIG;
-			*seen_mask |= BPF_SPIN_LOCK;
-			type = BPF_SPIN_LOCK;
-			goto end;
-		}
-	}
-	if (field_mask & BPF_RES_SPIN_LOCK) {
-		if (!strcmp(name, "bpf_res_spin_lock")) {
-			if (*seen_mask & BPF_RES_SPIN_LOCK)
-				return -E2BIG;
-			*seen_mask |= BPF_RES_SPIN_LOCK;
-			type = BPF_RES_SPIN_LOCK;
-			goto end;
-		}
-	}
-	if (field_mask & BPF_TIMER) {
-		if (!strcmp(name, "bpf_timer")) {
-			if (*seen_mask & BPF_TIMER)
-				return -E2BIG;
-			*seen_mask |= BPF_TIMER;
-			type = BPF_TIMER;
-			goto end;
-		}
-	}
-	if (field_mask & BPF_WORKQUEUE) {
-		if (!strcmp(name, "bpf_wq")) {
-			if (*seen_mask & BPF_WORKQUEUE)
+	const char *field_type_name;
+	enum btf_field_type field_type;
+	bool is_unique;
+
+	for (i = 0; i < ARRAY_SIZE(field_types); ++i) {
+		field_type = field_types[i].type;
+		field_type_name = field_types[i].name;
+		is_unique = field_types[i].is_unique;
+		if (!(field_mask & field_type) || strcmp(name, field_type_name))
+			continue;
+		if (is_unique) {
+			if (*seen_mask & field_type)
 				return -E2BIG;
-			*seen_mask |= BPF_WORKQUEUE;
-			type = BPF_WORKQUEUE;
-			goto end;
+			*seen_mask |= field_type;
 		}
+		type = field_type;
+		goto end;
 	}
-	field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
-	field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
-	field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
-	field_mask_test_name(BPF_RB_NODE, "bpf_rb_node");
-	field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
 
 	/* Only return BPF_KPTR when all other types with matchable names fail */
 	if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
@@ -3545,8 +3530,6 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_
 	return type;
 }
 
-#undef field_mask_test_name
-
 /* Repeat a number of fields for a specified number of times.
  *
  * Copy the fields starting from the first field and repeat them for
@@ -3693,6 +3676,7 @@ static int btf_find_field_one(const struct btf *btf,
 	case BPF_LIST_NODE:
 	case BPF_RB_NODE:
 	case BPF_REFCOUNT:
+	case BPF_TASK_WORK:
 		ret = btf_find_struct(btf, var_type, off, sz, field_type,
 				      info_cnt ? &info[0] : &tmp);
 		if (ret < 0)
@@ -3985,6 +3969,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 	rec->timer_off = -EINVAL;
 	rec->wq_off = -EINVAL;
 	rec->refcount_off = -EINVAL;
+	rec->task_work_off = -EINVAL;
 	for (i = 0; i < cnt; i++) {
 		field_type_size = btf_field_type_size(info_arr[i].type);
 		if (info_arr[i].off + field_type_size > value_size) {
@@ -4024,6 +4009,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			/* Cache offset for faster lookup at runtime */
 			rec->wq_off = rec->fields[i].offset;
 			break;
+		case BPF_TASK_WORK:
+			WARN_ON_ONCE(rec->task_work_off >= 0);
+			rec->task_work_off = rec->fields[i].offset;
+			break;
 		case BPF_REFCOUNT:
 			WARN_ON_ONCE(rec->refcount_off >= 0);
 			/* Cache offset for faster lookup at runtime */

kernel/bpf/hashtab.c

Lines changed: 23 additions & 20 deletions
@@ -215,7 +215,20 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
 	return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
 }
 
-static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
+static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem)
+{
+	if (btf_record_has_field(htab->map.record, BPF_TIMER))
+		bpf_obj_free_timer(htab->map.record,
+				   htab_elem_value(elem, htab->map.key_size));
+	if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
+		bpf_obj_free_workqueue(htab->map.record,
+				       htab_elem_value(elem, htab->map.key_size));
+	if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
+		bpf_obj_free_task_work(htab->map.record,
+				       htab_elem_value(elem, htab->map.key_size));
+}
+
+static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
 {
 	u32 num_entries = htab->map.max_entries;
 	int i;
@@ -227,12 +240,7 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
 		struct htab_elem *elem;
 
 		elem = get_htab_elem(htab, i);
-		if (btf_record_has_field(htab->map.record, BPF_TIMER))
-			bpf_obj_free_timer(htab->map.record,
-					   htab_elem_value(elem, htab->map.key_size));
-		if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
-			bpf_obj_free_workqueue(htab->map.record,
-					       htab_elem_value(elem, htab->map.key_size));
+		htab_free_internal_structs(htab, elem);
 		cond_resched();
 	}
 }
@@ -1490,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab)
 	}
 }
 
-static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
+static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
 {
 	int i;
 
@@ -1502,28 +1510,23 @@ static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
 
 		hlist_nulls_for_each_entry(l, n, head, hash_node) {
 			/* We only free timer on uref dropping to zero */
-			if (btf_record_has_field(htab->map.record, BPF_TIMER))
-				bpf_obj_free_timer(htab->map.record,
-						   htab_elem_value(l, htab->map.key_size));
-			if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
-				bpf_obj_free_workqueue(htab->map.record,
-						       htab_elem_value(l, htab->map.key_size));
+			htab_free_internal_structs(htab, l);
 		}
 		cond_resched_rcu();
 	}
 	rcu_read_unlock();
 }
 
-static void htab_map_free_timers_and_wq(struct bpf_map *map)
+static void htab_map_free_internal_structs(struct bpf_map *map)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 
 	/* We only free timer and workqueue on uref dropping to zero */
-	if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) {
+	if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
 		if (!htab_is_prealloc(htab))
-			htab_free_malloced_timers_and_wq(htab);
+			htab_free_malloced_internal_structs(htab);
 		else
-			htab_free_prealloced_timers_and_wq(htab);
+			htab_free_prealloced_internal_structs(htab);
 	}
 }
 
@@ -2255,7 +2258,7 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
-	.map_release_uref = htab_map_free_timers_and_wq,
+	.map_release_uref = htab_map_free_internal_structs,
 	.map_lookup_elem = htab_map_lookup_elem,
 	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
 	.map_update_elem = htab_map_update_elem,
@@ -2276,7 +2279,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
-	.map_release_uref = htab_map_free_timers_and_wq,
+	.map_release_uref = htab_map_free_internal_structs,
 	.map_lookup_elem = htab_lru_map_lookup_elem,
 	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
 	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
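
The freeing paths above (bpf_obj_free_task_work() on uref drop) must
synchronize with concurrent scheduling; the cover letter describes an atomic
state machine (bpf_task_work_state) for this. Below is a hedged sketch of
that pattern with illustrative state names and helper; the actual enum and
transitions live in kernel/bpf/helpers.c, which is not part of this merge
excerpt:

/* Illustrative sketch of cmpxchg-driven state transitions. */
enum bpf_task_work_state {	/* state names are assumptions */
	BPF_TW_STANDBY = 0,	/* nothing scheduled */
	BPF_TW_PENDING,		/* schedule requested, irq_work queued */
	BPF_TW_SCHEDULED,	/* task_work_add() succeeded */
	BPF_TW_RUNNING,		/* callback executing in task context */
	BPF_TW_FREED,		/* map value deleted, context being torn down */
};

static bool tw_transition(atomic_t *state, enum bpf_task_work_state from,
			  enum bpf_task_work_state to)
{
	/* Exactly one of a concurrent scheduler and the free path wins
	 * each transition; the loser observes the state and backs off.
	 */
	return atomic_cmpxchg(state, from, to) == from;
}

Combined with the refcounting and RCU Tasks Trace grace period mentioned in
the cover letter, this kind of transition discipline lets the free path
reclaim the context only after any in-flight callback has finished.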
