Skip to content

Commit c39aa3b

Browse files
Yonghong Song authored and Alexei Starovoitov committed
bpf: Allow per unit prefill for non-fix-size percpu memory allocator
Commit 41a5db8 ("Add support for non-fix-size percpu mem allocation") added support for non-fix-size percpu memory allocation. Such allocation will allocate percpu memory for all buckets on all cpus and the memory consumption is in the order to quadratic. For example, let us say, 4 cpus, unit size 16 bytes, so each cpu has 16 * 4 = 64 bytes, with 4 cpus, total will be 64 * 4 = 256 bytes. Then let us say, 8 cpus with the same unit size, each cpu has 16 * 8 = 128 bytes, with 8 cpus, total will be 128 * 8 = 1024 bytes. So if the number of cpus doubles, the number of memory consumption will be 4 times. So for a system with large number of cpus, the memory consumption goes up quickly with quadratic order. For example, for 4KB percpu allocation, 128 cpus. The total memory consumption will 4KB * 128 * 128 = 64MB. Things will become worse if the number of cpus is bigger (e.g., 512, 1024, etc.) In Commit 41a5db8, the non-fix-size percpu memory allocation is done in boot time, so for system with large number of cpus, the initial percpu memory consumption is very visible. For example, for 128 cpu system, the total percpu memory allocation will be at least (16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096) * 128 * 128 = ~138MB. which is pretty big. It will be even bigger for larger number of cpus. Note that the current prefill also allocates 4 entries if the unit size is less than 256. So on top of 138MB memory consumption, this will add more consumption with 3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB. Next patch will try to reduce this memory consumption. Later on, Commit 1fda5bb ("bpf: Do not allocate percpu memory at init stage") moved the non-fix-size percpu memory allocation to bpf verificaiton stage. Once a particular bpf_percpu_obj_new() is called by bpf program, the memory allocator will try to fill in the cache with all sizes, causing the same amount of percpu memory consumption as in the boot stage. 
To reduce the initial percpu memory consumption for non-fix-size percpu memory allocation, instead of filling the cache with all supported allocation sizes, this patch intends to fill the cache only for the requested size. As typically users will not use large percpu data structure, this can save memory significantly. For example, the allocation size is 64 bytes with 128 cpus. Then total percpu memory amount will be 64 * 128 * 128 = 1MB, much less than previous 138MB. Signed-off-by: Yonghong Song <[email protected]> Acked-by: Hou Tao <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Alexei Starovoitov <[email protected]>
1 parent 9fc8e80 commit c39aa3b

File tree

3 files changed

+86
-15
lines changed

3 files changed

+86
-15
lines changed

include/linux/bpf_mem_alloc.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,15 @@ struct bpf_mem_alloc {
2222
* 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects.
2323
* Alloc and free are done with bpf_mem_{alloc,free}() and the size of
2424
* the returned object is given by the size argument of bpf_mem_alloc().
25+
* If percpu equals true, error will be returned in order to avoid
26+
* large memory consumption and the below bpf_mem_alloc_percpu_unit_init()
27+
* should be used to do on-demand per-cpu allocation for each size.
2528
*/
2629
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
30+
/* Initialize a non-fix-size percpu memory allocator */
31+
int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg);
32+
/* The percpu allocation with a specific unit size. */
33+
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
2734
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
2835

2936
/* kmalloc/kfree equivalent: */

kernel/bpf/memalloc.c

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ struct bpf_mem_caches {
121121
struct bpf_mem_cache cache[NUM_CACHES];
122122
};
123123

124+
static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
125+
124126
static struct llist_node notrace *__llist_del_first(struct llist_head *head)
125127
{
126128
struct llist_node *entry, *next;
@@ -499,12 +501,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
499501
*/
500502
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
501503
{
502-
static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
503504
struct bpf_mem_caches *cc, __percpu *pcc;
504505
struct bpf_mem_cache *c, __percpu *pc;
505506
struct obj_cgroup *objcg = NULL;
506507
int cpu, i, unit_size, percpu_size = 0;
507508

509+
if (percpu && size == 0)
510+
return -EINVAL;
511+
508512
/* room for llist_node and per-cpu pointer */
509513
if (percpu)
510514
percpu_size = LLIST_NODE_SZ + sizeof(void *);
@@ -524,6 +528,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
524528
objcg = get_obj_cgroup_from_current();
525529
#endif
526530
ma->objcg = objcg;
531+
527532
for_each_possible_cpu(cpu) {
528533
c = per_cpu_ptr(pc, cpu);
529534
c->unit_size = unit_size;
@@ -562,6 +567,56 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
562567
return 0;
563568
}
564569

570+
int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
571+
{
572+
struct bpf_mem_caches __percpu *pcc;
573+
574+
pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
575+
if (!pcc)
576+
return -ENOMEM;
577+
578+
ma->caches = pcc;
579+
ma->objcg = objcg;
580+
ma->percpu = true;
581+
return 0;
582+
}
583+
584+
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
585+
{
586+
struct bpf_mem_caches *cc, __percpu *pcc;
587+
int cpu, i, unit_size, percpu_size;
588+
struct obj_cgroup *objcg;
589+
struct bpf_mem_cache *c;
590+
591+
i = bpf_mem_cache_idx(size);
592+
if (i < 0)
593+
return -EINVAL;
594+
595+
/* room for llist_node and per-cpu pointer */
596+
percpu_size = LLIST_NODE_SZ + sizeof(void *);
597+
598+
unit_size = sizes[i];
599+
objcg = ma->objcg;
600+
pcc = ma->caches;
601+
602+
for_each_possible_cpu(cpu) {
603+
cc = per_cpu_ptr(pcc, cpu);
604+
c = &cc->cache[i];
605+
if (cpu == 0 && c->unit_size)
606+
break;
607+
608+
c->unit_size = unit_size;
609+
c->objcg = objcg;
610+
c->percpu_size = percpu_size;
611+
c->tgt = c;
612+
613+
init_refill_work(c);
614+
prefill_mem_cache(c, cpu);
615+
}
616+
617+
return 0;
618+
}
619+
565620
static void drain_mem_cache(struct bpf_mem_cache *c)
566621
{
567622
bool percpu = !!c->percpu_size;

kernel/bpf/verifier.c

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12139,20 +12139,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
1213912139
if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
1214012140
return -ENOMEM;
1214112141

12142-
if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
12143-
if (!bpf_global_percpu_ma_set) {
12144-
mutex_lock(&bpf_percpu_ma_lock);
12145-
if (!bpf_global_percpu_ma_set) {
12146-
err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
12147-
if (!err)
12148-
bpf_global_percpu_ma_set = true;
12149-
}
12150-
mutex_unlock(&bpf_percpu_ma_lock);
12151-
if (err)
12152-
return err;
12153-
}
12154-
}
12155-
1215612142
if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
1215712143
verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
1215812144
return -EINVAL;
@@ -12173,6 +12159,29 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
1217312159
return -EINVAL;
1217412160
}
1217512161

12162+
if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
12163+
if (!bpf_global_percpu_ma_set) {
12164+
mutex_lock(&bpf_percpu_ma_lock);
12165+
if (!bpf_global_percpu_ma_set) {
12166+
/* Charge memory allocated with bpf_global_percpu_ma to
12167+
* root memcg. The obj_cgroup for root memcg is NULL.
12168+
*/
12169+
err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
12170+
if (!err)
12171+
bpf_global_percpu_ma_set = true;
12172+
}
12173+
mutex_unlock(&bpf_percpu_ma_lock);
12174+
if (err)
12175+
return err;
12176+
}
12177+
12178+
mutex_lock(&bpf_percpu_ma_lock);
12179+
err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
12180+
mutex_unlock(&bpf_percpu_ma_lock);
12181+
if (err)
12182+
return err;
12183+
}
12184+
1217612185
struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
1217712186
if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
1217812187
if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {

0 commit comments

Comments
 (0)