Commit 8c290e6

4ast authored and davem330 committed
bpf: fix hashmap extra_elems logic
In both kmalloc and prealloc mode the bpf_map_update_elem() is using per-cpu
extra_elems to do atomic update when the map is full. There are two issues
with it. The logic can be misused, since it allows max_entries+num_cpus
elements to be present in the map. And alloc_extra_elems() at map creation
time can fail percpu alloc for large map values with a warn:
WARNING: CPU: 3 PID: 2752 at ../mm/percpu.c:892 pcpu_alloc+0x119/0xa60
illegal size (32824) or align (8) for percpu allocation

The fixes for both of these issues are different for kmalloc and prealloc
modes. For prealloc mode allocate extra num_possible_cpus elements and store
their pointers into extra_elems array instead of actual elements. Hence we
can use these hidden (spare) elements not only when the map is full but
during bpf_map_update_elem() that replaces an existing element too. That also
improves performance, since pcpu_freelist_pop/push is avoided.
Unfortunately this approach cannot be used for kmalloc mode which needs to
kfree elements after an rcu grace period. Therefore switch it back to normal
kmalloc even when full and an old element exists, like it was prior to
commit 6c90598 ("bpf: pre-allocate hash map elements").

Add tests to check for over max_entries and large map values.

Reported-by: Dave Jones <[email protected]>
Fixes: 6c90598 ("bpf: pre-allocate hash map elements")
Signed-off-by: Alexei Starovoitov <[email protected]>
Acked-by: Daniel Borkmann <[email protected]>
Acked-by: Martin KaFai Lau <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent dd1ef79 commit 8c290e6

File tree (2 files changed: +97, −76 lines)

kernel/bpf/hashtab.c
tools/testing/selftests/bpf/test_maps.c

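The prealloc-mode part of this fix comes down to a per-CPU pointer swap: each CPU keeps one spare pre-allocated element, and an update that replaces an existing key hands out that spare and parks the old element in its place, so no pcpu_freelist_pop/push is needed and the map never holds more than max_entries visible elements. The short user-space sketch below illustrates only that swap; the names (spare_slot, update_replace) and the single global slot standing in for this_cpu_ptr(htab->extra_elems) are invented for illustration and are not part of the kernel patch.

#include <stdio.h>

struct elem { int key; int value; };

/* one spare element per CPU; a single global slot stands in for
 * this_cpu_ptr() in this sketch
 */
static struct elem spare = { .key = -1, .value = 0 };
static struct elem *spare_slot = &spare;

/* Replace 'old' without touching a freelist: take the current spare as the
 * new element and park the old element as the next spare. The old element
 * is never freed; it simply becomes the spare for the next replacement.
 */
static struct elem *update_replace(struct elem *old, int key, int value)
{
	struct elem *new_elem = spare_slot;	/* take this CPU's spare */

	spare_slot = old;			/* old element becomes the spare */
	new_elem->key = key;
	new_elem->value = value;
	return new_elem;
}

int main(void)
{
	struct elem first = { .key = 1, .value = 10 };
	struct elem *cur = update_replace(&first, 1, 20);

	printf("new value=%d, spare now holds old key=%d\n",
	       cur->value, spare_slot->key);
	return 0;
}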

kernel/bpf/hashtab.c

Lines changed: 71 additions & 73 deletions
@@ -30,18 +30,12 @@ struct bpf_htab {
 		struct pcpu_freelist freelist;
 		struct bpf_lru lru;
 	};
-	void __percpu *extra_elems;
+	struct htab_elem *__percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };
 
-enum extra_elem_state {
-	HTAB_NOT_AN_EXTRA_ELEM = 0,
-	HTAB_EXTRA_ELEM_FREE,
-	HTAB_EXTRA_ELEM_USED
-};
-
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
@@ -56,7 +50,6 @@ struct htab_elem {
 	};
 	union {
 		struct rcu_head rcu;
-		enum extra_elem_state state;
 		struct bpf_lru_node lru_node;
 	};
 	u32 hash;
@@ -77,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
 	       htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 }
 
+static bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 				     void __percpu *pptr)
 {
@@ -128,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 
 static int prealloc_init(struct bpf_htab *htab)
 {
+	u32 num_entries = htab->map.max_entries;
 	int err = -ENOMEM, i;
 
-	htab->elems = bpf_map_area_alloc(htab->elem_size *
-					 htab->map.max_entries);
+	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+		num_entries += num_possible_cpus();
+
+	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
 	if (!htab->elems)
 		return -ENOMEM;
 
 	if (!htab_is_percpu(htab))
 		goto skip_percpu_elems;
 
-	for (i = 0; i < htab->map.max_entries; i++) {
+	for (i = 0; i < num_entries; i++) {
 		u32 size = round_up(htab->map.value_size, 8);
 		void __percpu *pptr;
 
@@ -166,11 +167,11 @@ static int prealloc_init(struct bpf_htab *htab)
 	if (htab_is_lru(htab))
 		bpf_lru_populate(&htab->lru, htab->elems,
 				 offsetof(struct htab_elem, lru_node),
-				 htab->elem_size, htab->map.max_entries);
+				 htab->elem_size, num_entries);
 	else
 		pcpu_freelist_populate(&htab->freelist,
 				       htab->elems + offsetof(struct htab_elem, fnode),
-				       htab->elem_size, htab->map.max_entries);
+				       htab->elem_size, num_entries);
 
 	return 0;
 
@@ -191,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab)
 
 static int alloc_extra_elems(struct bpf_htab *htab)
 {
-	void __percpu *pptr;
+	struct htab_elem *__percpu *pptr, *l_new;
+	struct pcpu_freelist_node *l;
 	int cpu;
 
-	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
+				  GFP_USER | __GFP_NOWARN);
 	if (!pptr)
 		return -ENOMEM;
 
 	for_each_possible_cpu(cpu) {
-		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
-			HTAB_EXTRA_ELEM_FREE;
+		l = pcpu_freelist_pop(&htab->freelist);
+		/* pop will succeed, since prealloc_init()
+		 * preallocated extra num_possible_cpus elements
+		 */
+		l_new = container_of(l, struct htab_elem, fnode);
+		*per_cpu_ptr(pptr, cpu) = l_new;
 	}
 	htab->extra_elems = pptr;
 	return 0;
@@ -342,25 +349,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
-	if (!percpu && !lru) {
-		/* lru itself can remove the least used element, so
-		 * there is no need for an extra elem during map_update.
-		 */
-		err = alloc_extra_elems(htab);
-		if (err)
-			goto free_buckets;
-	}
-
 	if (prealloc) {
 		err = prealloc_init(htab);
 		if (err)
-			goto free_extra_elems;
+			goto free_buckets;
+
+		if (!percpu && !lru) {
+			/* lru itself can remove the least used element, so
+			 * there is no need for an extra elem during map_update.
+			 */
+			err = alloc_extra_elems(htab);
+			if (err)
+				goto free_prealloc;
+		}
 	}
 
 	return &htab->map;
 
-free_extra_elems:
-	free_percpu(htab->extra_elems);
+free_prealloc:
+	prealloc_destroy(htab);
 free_buckets:
 	bpf_map_area_free(htab->buckets);
 free_htab:
@@ -575,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
-	if (l->state == HTAB_EXTRA_ELEM_USED) {
-		l->state = HTAB_EXTRA_ELEM_FREE;
-		return;
-	}
-
-	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+	if (htab_is_prealloc(htab)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		atomic_dec(&htab->count);
@@ -610,47 +612,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
 					 bool percpu, bool onallcpus,
-					 bool old_elem_exists)
+					 struct htab_elem *old_elem)
 {
 	u32 size = htab->map.value_size;
-	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
-	struct htab_elem *l_new;
+	bool prealloc = htab_is_prealloc(htab);
+	struct htab_elem *l_new, **pl_new;
 	void __percpu *pptr;
-	int err = 0;
 
 	if (prealloc) {
-		struct pcpu_freelist_node *l;
+		if (old_elem) {
+			/* if we're updating the existing element,
+			 * use per-cpu extra elems to avoid freelist_pop/push
+			 */
+			pl_new = this_cpu_ptr(htab->extra_elems);
+			l_new = *pl_new;
+			*pl_new = old_elem;
+		} else {
+			struct pcpu_freelist_node *l;
 
-		l = pcpu_freelist_pop(&htab->freelist);
-		if (!l)
-			err = -E2BIG;
-		else
+			l = pcpu_freelist_pop(&htab->freelist);
+			if (!l)
+				return ERR_PTR(-E2BIG);
 			l_new = container_of(l, struct htab_elem, fnode);
-	} else {
-		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
-			atomic_dec(&htab->count);
-			err = -E2BIG;
-		} else {
-			l_new = kmalloc(htab->elem_size,
-					GFP_ATOMIC | __GFP_NOWARN);
-			if (!l_new)
-				return ERR_PTR(-ENOMEM);
 		}
-	}
-
-	if (err) {
-		if (!old_elem_exists)
-			return ERR_PTR(err);
-
-		/* if we're updating the existing element and the hash table
-		 * is full, use per-cpu extra elems
-		 */
-		l_new = this_cpu_ptr(htab->extra_elems);
-		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
-			return ERR_PTR(-E2BIG);
-		l_new->state = HTAB_EXTRA_ELEM_USED;
 	} else {
-		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
+		if (atomic_inc_return(&htab->count) > htab->map.max_entries)
+			if (!old_elem) {
+				/* when map is full and update() is replacing
+				 * old element, it's ok to allocate, since
+				 * old element will be freed immediately.
+				 * Otherwise return an error
+				 */
+				atomic_dec(&htab->count);
+				return ERR_PTR(-E2BIG);
+			}
+		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+		if (!l_new)
+			return ERR_PTR(-ENOMEM);
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -731,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		goto err;
 
 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
-				!!l_old);
+				l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -744,7 +742,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
 		hlist_nulls_del_rcu(&l_old->hash_node);
-		free_htab_elem(htab, l_old);
+		if (!htab_is_prealloc(htab))
+			free_htab_elem(htab, l_old);
 	}
 	ret = 0;
 err:
@@ -856,7 +855,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 				value, onallcpus);
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus, false);
+					hash, true, onallcpus, NULL);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
@@ -1024,8 +1023,7 @@ static void delete_all_elements(struct bpf_htab *htab)
 
 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_nulls_del_rcu(&l->hash_node);
-			if (l->state != HTAB_EXTRA_ELEM_USED)
-				htab_elem_free(htab, l);
+			htab_elem_free(htab, l);
 		}
 	}
 }
@@ -1045,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map)
 	 * not have executed. Wait for them.
 	 */
 	rcu_barrier();
-	if (htab->map.map_flags & BPF_F_NO_PREALLOC)
+	if (!htab_is_prealloc(htab))
 		delete_all_elements(htab);
 	else
 		prealloc_destroy(htab);

tools/testing/selftests/bpf/test_maps.c

Lines changed: 26 additions & 3 deletions
@@ -80,8 +80,9 @@ static void test_hashmap(int task, void *data)
 	assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == 0);
 	key = 2;
 	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
-	key = 1;
-	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
+	key = 3;
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       errno == E2BIG);
 
 	/* Check that key = 0 doesn't exist. */
 	key = 0;
@@ -110,6 +111,24 @@ static void test_hashmap(int task, void *data)
 	close(fd);
 }
 
+static void test_hashmap_sizes(int task, void *data)
+{
+	int fd, i, j;
+
+	for (i = 1; i <= 512; i <<= 1)
+		for (j = 1; j <= 1 << 18; j <<= 1) {
+			fd = bpf_create_map(BPF_MAP_TYPE_HASH, i, j,
+					    2, map_flags);
+			if (fd < 0) {
+				printf("Failed to create hashmap key=%d value=%d '%s'\n",
+				       i, j, strerror(errno));
+				exit(1);
+			}
+			close(fd);
+			usleep(10); /* give kernel time to destroy */
+		}
+}
+
 static void test_hashmap_percpu(int task, void *data)
 {
 	unsigned int nr_cpus = bpf_num_possible_cpus();
@@ -317,7 +336,10 @@ static void test_arraymap_percpu(int task, void *data)
 static void test_arraymap_percpu_many_keys(void)
 {
 	unsigned int nr_cpus = bpf_num_possible_cpus();
-	unsigned int nr_keys = 20000;
+	/* nr_keys is not too large otherwise the test stresses percpu
+	 * allocator more than anything else
+	 */
+	unsigned int nr_keys = 2000;
 	long values[nr_cpus];
 	int key, fd, i;
 
@@ -419,6 +441,7 @@ static void test_map_stress(void)
 {
 	run_parallel(100, test_hashmap, NULL);
 	run_parallel(100, test_hashmap_percpu, NULL);
+	run_parallel(100, test_hashmap_sizes, NULL);
 
 	run_parallel(100, test_arraymap, NULL);
 	run_parallel(100, test_arraymap_percpu, NULL);
