Skip to content

Commit 78a8a85

Browse files
Yonghong Song authored and
Alexei Starovoitov committed
bpf: Allow pre-ordering for bpf cgroup progs
Currently, for bpf progs in a cgroup hierarchy, the effective prog array is computed from the bottom cgroup to the upper cgroups (post-ordering). For example, suppose the following cgroup hierarchy

    root cgroup: p1, p2
        subcgroup: p3, p4

has BPF_F_ALLOW_MULTI set at both cgroup levels. The effective cgroup array ordering looks like

    p3 p4 p1 p2

and at run time, progs will execute in that order.

But in some cases, it is desirable to have the root prog execute earlier than the children progs (pre-ordering). For example,
  - prog p1 intends to collect original pkt dest addresses.
  - prog p3 will modify original pkt dest addresses to a proxy address for security reasons.

The end result is that prog p1 gets the proxy address, which is not what it wants. Putting p1 into every child cgroup is not desirable either, as it would duplicate itself in many child cgroups. And this is exactly a use case we are encountering at Meta.

To fix this issue, let us introduce a flag BPF_F_PREORDER. If the flag is specified at attachment time, the prog has higher priority and the ordering of progs with that flag will be from top to bottom (pre-ordering). For example, in the above hierarchy,

    root cgroup: p1, p2
        subcgroup: p3, p4

let us say p2 and p4 are marked with BPF_F_PREORDER. The final effective array ordering will be

    p2 p4 p3 p1

Suggested-by: Andrii Nakryiko <[email protected]>
Acked-by: Andrii Nakryiko <[email protected]>
Signed-off-by: Yonghong Song <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
1 parent 0ffa016 commit 78a8a85

File tree

5 files changed

+30
-9
lines changed

5 files changed

+30
-9
lines changed

include/linux/bpf-cgroup.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ struct bpf_prog_list {
111111
struct bpf_prog *prog;
112112
struct bpf_cgroup_link *link;
113113
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
114+
u32 flags;
114115
};
115116

116117
int cgroup_bpf_inherit(struct cgroup *cgrp);

include/uapi/linux/bpf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,6 +1207,7 @@ enum bpf_perf_event_type {
12071207
#define BPF_F_BEFORE (1U << 3)
12081208
#define BPF_F_AFTER (1U << 4)
12091209
#define BPF_F_ID (1U << 5)
1210+
#define BPF_F_PREORDER (1U << 6)
12101211
#define BPF_F_LINK BPF_F_LINK /* 1 << 13 */
12111212

12121213
/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the

kernel/bpf/cgroup.c

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -369,14 +369,16 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
369369
/* count number of elements in the list.
370370
* it's slow but the list cannot be long
371371
*/
372-
static u32 prog_list_length(struct hlist_head *head)
372+
static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
373373
{
374374
struct bpf_prog_list *pl;
375375
u32 cnt = 0;
376376

377377
hlist_for_each_entry(pl, head, node) {
378378
if (!prog_list_prog(pl))
379379
continue;
380+
if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
381+
(*preorder_cnt)++;
380382
cnt++;
381383
}
382384
return cnt;
@@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
400402

401403
if (flags & BPF_F_ALLOW_MULTI)
402404
return true;
403-
cnt = prog_list_length(&p->bpf.progs[atype]);
405+
cnt = prog_list_length(&p->bpf.progs[atype], NULL);
404406
WARN_ON_ONCE(cnt > 1);
405407
if (cnt == 1)
406408
return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp,
423425
struct bpf_prog_array *progs;
424426
struct bpf_prog_list *pl;
425427
struct cgroup *p = cgrp;
426-
int cnt = 0;
428+
int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
427429

428430
/* count number of effective programs by walking parents */
429431
do {
430432
if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
431-
cnt += prog_list_length(&p->bpf.progs[atype]);
433+
cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
432434
p = cgroup_parent(p);
433435
} while (p);
434436

@@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp,
439441
/* populate the array with effective progs */
440442
cnt = 0;
441443
p = cgrp;
444+
fstart = preorder_cnt;
445+
bstart = preorder_cnt - 1;
442446
do {
443447
if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
444448
continue;
445449

450+
init_bstart = bstart;
446451
hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
447452
if (!prog_list_prog(pl))
448453
continue;
449454

450-
item = &progs->items[cnt];
455+
if (pl->flags & BPF_F_PREORDER) {
456+
item = &progs->items[bstart];
457+
bstart--;
458+
} else {
459+
item = &progs->items[fstart];
460+
fstart++;
461+
}
451462
item->prog = prog_list_prog(pl);
452463
bpf_cgroup_storages_assign(item->cgroup_storage,
453464
pl->storage);
454465
cnt++;
455466
}
467+
468+
/* reverse pre-ordering progs at this cgroup level */
469+
for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
470+
swap(progs->items[i], progs->items[j]);
471+
456472
} while ((p = cgroup_parent(p)));
457473

458474
*array = progs;
@@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
663679
*/
664680
return -EPERM;
665681

666-
if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
682+
if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
667683
return -E2BIG;
668684

669685
pl = find_attach_entry(progs, prog, link, replace_prog,
@@ -698,6 +714,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
698714

699715
pl->prog = prog;
700716
pl->link = link;
717+
pl->flags = flags;
701718
bpf_cgroup_storages_assign(pl->storage, storage);
702719
cgrp->bpf.flags[atype] = saved_flags;
703720

@@ -1073,7 +1090,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
10731090
lockdep_is_held(&cgroup_mutex));
10741091
total_cnt += bpf_prog_array_length(effective);
10751092
} else {
1076-
total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
1093+
total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
10771094
}
10781095
}
10791096

@@ -1105,7 +1122,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
11051122
u32 id;
11061123

11071124
progs = &cgrp->bpf.progs[atype];
1108-
cnt = min_t(int, prog_list_length(progs), total_cnt);
1125+
cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
11091126
i = 0;
11101127
hlist_for_each_entry(pl, progs, node) {
11111128
prog = prog_list_prog(pl);

kernel/bpf/syscall.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4170,7 +4170,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
41704170
#define BPF_F_ATTACH_MASK_BASE \
41714171
(BPF_F_ALLOW_OVERRIDE | \
41724172
BPF_F_ALLOW_MULTI | \
4173-
BPF_F_REPLACE)
4173+
BPF_F_REPLACE | \
4174+
BPF_F_PREORDER)
41744175

41754176
#define BPF_F_ATTACH_MASK_MPROG \
41764177
(BPF_F_REPLACE | \

tools/include/uapi/linux/bpf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,6 +1207,7 @@ enum bpf_perf_event_type {
12071207
#define BPF_F_BEFORE (1U << 3)
12081208
#define BPF_F_AFTER (1U << 4)
12091209
#define BPF_F_ID (1U << 5)
1210+
#define BPF_F_PREORDER (1U << 6)
12101211
#define BPF_F_LINK BPF_F_LINK /* 1 << 13 */
12111212

12121213
/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the

0 commit comments

Comments
 (0)