
Commit d4adf1c

wdebruij authored and Alexei Starovoitov committed
bpf: Adjust free target to avoid global starvation of LRU map
BPF_MAP_TYPE_LRU_HASH can recycle its most recent elements well before the map is full, due to percpu reservations and force shrink before neighbor stealing. Once a CPU is unable to borrow from the global map, it steals one element from a neighbor once; after that, each allocation flushes that one element to the global list and immediately recycles it.

A batch value of LOCAL_FREE_TARGET (128) will exhaust a 10K element map with 79 CPUs. CPU 79 will observe this behavior even while its neighbors hold 78 * 127 + 1 * 15 == 9921 free elements (99%).

CPUs need not be active concurrently. The issue can appear with affinity migration, e.g., irqbalance. Each CPU can reserve and then hold onto its 128 elements indefinitely.

Avoid global list exhaustion by limiting the aggregate percpu caches to half of the map size, by adjusting LOCAL_FREE_TARGET based on the CPU count. This change has no effect on sufficiently large tables.

Similar to LOCAL_NR_SCANS and lru->nr_scans, introduce a map variable lru->target_free. The extra field fits in a hole in struct bpf_lru. The cacheline is already warm where it is read in the hot path. The field is only accessed with the lru lock held.

Tested-by: Anton Protopopov <[email protected]>
Signed-off-by: Willem de Bruijn <[email protected]>
Acked-by: Stanislav Fomichev <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
1 parent a766cfb commit d4adf1c
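
For illustration, a standalone sketch of the arithmetic in the message above: with the fixed batch of LOCAL_FREE_TARGET (128), 79 CPUs can pin essentially all free elements of a 10K map, while the init-time clamp introduced by this patch keeps the aggregate per-CPU reservation at or below half the map. The map size, CPU count, and the clamp_uint() helper below are illustrative only and not part of the patch.

/* Standalone sketch of the reservation arithmetic from the commit message.
 * Not kernel code: nr_elems and nr_cpus are the example values from the
 * message (10K-element map, 79 CPUs), and clamp_uint() is an open-coded
 * stand-in for the kernel's clamp().
 */
#include <stdio.h>

#define LOCAL_FREE_TARGET 128

static unsigned int clamp_uint(unsigned int v, unsigned int lo, unsigned int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned int nr_elems = 10000, nr_cpus = 79;
	unsigned int target_free;

	/* Before the patch: each CPU may reserve up to a full batch.
	 * 78 CPUs holding 127 each plus one holding 15 already pins
	 * 9921 elements (99% of the map), starving the remaining CPU.
	 */
	printf("old worst case: %u of %u elements reserved\n",
	       78 * 127 + 1 * 15, nr_elems);

	/* After the patch: target_free = clamp((nr_elems / nr_cpus) / 2,
	 * 1, LOCAL_FREE_TARGET), so the aggregate reservation stays at
	 * or below half the map.
	 */
	target_free = clamp_uint((nr_elems / nr_cpus) / 2,
				 1, LOCAL_FREE_TARGET);
	printf("new target_free: %u, aggregate <= %u of %u\n",
	       target_free, target_free * nr_cpus, nr_elems);
	return 0;
}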

5 files changed: +52 -44 lines


Documentation/bpf/map_hash.rst

Lines changed: 7 additions & 1 deletion
@@ -233,10 +233,16 @@ attempts in order to enforce the LRU property which have increasing impacts on
 other CPUs involved in the following operation attempts:
 
 - Attempt to use CPU-local state to batch operations
-- Attempt to fetch free nodes from global lists
+- Attempt to fetch ``target_free`` free nodes from global lists
 - Attempt to pull any node from a global list and remove it from the hashmap
 - Attempt to pull any node from any CPU's list and remove it from the hashmap
 
+The number of nodes to borrow from the global list in a batch, ``target_free``,
+depends on the size of the map. Larger batch size reduces lock contention, but
+may also exhaust the global structure. The value is computed at map init to
+avoid exhaustion, by limiting aggregate reservation by all CPUs to half the map
+size. With a minimum of a single element and maximum budget of 128 at a time.
+
 This algorithm is described visually in the following diagram. See the
 description in commit 3a08c2fd7634 ("bpf: LRU List") for a full explanation of
 the corresponding operations:
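
To make the sizing rule in the added paragraph concrete, here is a small userspace sketch of the documented computation (half the map, split across possible CPUs, clamped to [1, 128]); the 64-CPU count and the map sizes are made-up example values, not taken from the patch.

/* Example values for the documented target_free computation.
 * The CPU count and map sizes below are illustrative only.
 */
#include <stdio.h>

#define LOCAL_FREE_TARGET 128

static unsigned int target_free(unsigned int nr_elems, unsigned int nr_cpus)
{
	unsigned int v = (nr_elems / nr_cpus) / 2;

	return v < 1 ? 1 : (v > LOCAL_FREE_TARGET ? LOCAL_FREE_TARGET : v);
}

int main(void)
{
	unsigned int sizes[] = { 64, 1024, 10000, 16384, 1048576 };
	unsigned int nr_cpus = 64;
	unsigned int i;

	/* Small maps clamp to the 1-element minimum; sufficiently large
	 * maps keep the previous 128-element batch.
	 */
	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("map %7u elems, %u CPUs -> target_free %3u\n",
		       sizes[i], nr_cpus, target_free(sizes[i], nr_cpus));
	return 0;
}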

Documentation/bpf/map_lru_hash_update.dot

Lines changed: 3 additions & 3 deletions
@@ -35,18 +35,18 @@ digraph {
 fn_bpf_lru_list_pop_free_to_local [shape=rectangle,fillcolor=2,
 label="Flush local pending,
 Rotate Global list, move
-LOCAL_FREE_TARGET
+target_free
 from global -> local"]
 // Also corresponds to:
 // fn__local_list_flush()
 // fn_bpf_lru_list_rotate()
 fn___bpf_lru_node_move_to_free[shape=diamond,fillcolor=2,
-label="Able to free\nLOCAL_FREE_TARGET\nnodes?"]
+label="Able to free\ntarget_free\nnodes?"]
 
 fn___bpf_lru_list_shrink_inactive [shape=rectangle,fillcolor=3,
 label="Shrink inactive list
 up to remaining
-LOCAL_FREE_TARGET
+target_free
 (global LRU -> local)"]
 fn___bpf_lru_list_shrink [shape=diamond,fillcolor=2,
 label="> 0 entries in\nlocal free list?"]

kernel/bpf/bpf_lru_list.c

Lines changed: 6 additions & 3 deletions
@@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 				    list) {
 		__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
 					    BPF_LRU_LOCAL_LIST_T_FREE);
-		if (++nfree == LOCAL_FREE_TARGET)
+		if (++nfree == lru->target_free)
 			break;
 	}
 
-	if (nfree < LOCAL_FREE_TARGET)
-		__bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
+	if (nfree < lru->target_free)
+		__bpf_lru_list_shrink(lru, l, lru->target_free - nfree,
 				      local_free_list(loc_l),
 				      BPF_LRU_LOCAL_LIST_T_FREE);
 
@@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
 		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
 		buf += elem_size;
 	}
+
+	lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2,
+				 1, LOCAL_FREE_TARGET);
 }
 
 static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
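
As a rough model of the first hunk above (a simplified sketch, not the kernel code: list handling, node state flags, and the lru lock are all elided, and the type and helper names are hypothetical), the refill path first borrows up to lru->target_free nodes from the global free list, and only force-shrinks the global inactive list for whatever remainder is still missing.

/* Simplified model of the refill policy in bpf_lru_list_pop_free_to_local().
 * Hypothetical types and counters; the real code moves nodes between
 * kernel lists under the lru lock.
 */
struct model_lru {
	unsigned int target_free;     /* set once at map init */
	unsigned int global_free;     /* nodes on the global free list */
	unsigned int global_inactive; /* shrinkable nodes on the inactive list */
};

unsigned int model_refill_local(struct model_lru *lru)
{
	unsigned int nfree = 0;

	/* Borrow from the global free list, at most target_free nodes. */
	while (lru->global_free && nfree < lru->target_free) {
		lru->global_free--;
		nfree++;
	}

	/* Force-shrink the inactive list for the remaining shortfall. */
	while (lru->global_inactive && nfree < lru->target_free) {
		lru->global_inactive--;
		nfree++;
	}

	return nfree; /* nodes now available on this CPU's local free list */
}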

kernel/bpf/bpf_lru_list.h

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ struct bpf_lru {
 	del_from_htab_func del_from_htab;
 	void *del_arg;
 	unsigned int hash_offset;
+	unsigned int target_free;
 	unsigned int nr_scans;
 	bool percpu;
 };

tools/testing/selftests/bpf/test_lru_map.c

Lines changed: 35 additions & 37 deletions
@@ -138,6 +138,12 @@ static int sched_next_online(int pid, int *next_to_try)
 	return ret;
 }
 
+/* Inverse of how bpf_common_lru_populate derives target_free from map_size. */
+static unsigned int __map_size(unsigned int tgt_free)
+{
+	return tgt_free * nr_cpus * 2;
+}
+
 /* Size of the LRU map is 2
  * Add key=1 (+1 key)
  * Add key=2 (+1 key)
@@ -231,11 +237,11 @@ static void test_lru_sanity0(int map_type, int map_flags)
 	printf("Pass\n");
 }
 
-/* Size of the LRU map is 1.5*tgt_free
- * Insert 1 to tgt_free (+tgt_free keys)
- * Lookup 1 to tgt_free/2
- * Insert 1+tgt_free to 2*tgt_free (+tgt_free keys)
- * => 1+tgt_free/2 to LOCALFREE_TARGET will be removed by LRU
+/* Verify that unreferenced elements are recycled before referenced ones.
+ * Insert elements.
+ * Reference a subset of these.
+ * Insert more, enough to trigger recycling.
+ * Verify that unreferenced are recycled.
 */
 static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 {
@@ -257,7 +263,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 	batch_size = tgt_free / 2;
 	assert(batch_size * 2 == tgt_free);
 
-	map_size = tgt_free + batch_size;
+	map_size = __map_size(tgt_free) + batch_size;
 	lru_map_fd = create_map(map_type, map_flags, map_size);
 	assert(lru_map_fd != -1);
 
@@ -266,26 +272,27 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 
 	value[0] = 1234;
 
-	/* Insert 1 to tgt_free (+tgt_free keys) */
-	end_key = 1 + tgt_free;
+	/* Insert map_size - batch_size keys */
+	end_key = 1 + __map_size(tgt_free);
 	for (key = 1; key < end_key; key++)
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
 
-	/* Lookup 1 to tgt_free/2 */
+	/* Lookup 1 to batch_size */
 	end_key = 1 + batch_size;
 	for (key = 1; key < end_key; key++) {
 		assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
 					    BPF_NOEXIST));
 	}
 
-	/* Insert 1+tgt_free to 2*tgt_free
-	 * => 1+tgt_free/2 to LOCALFREE_TARGET will be
+	/* Insert another map_size - batch_size keys
+	 * Map will contain 1 to batch_size plus these latest, i.e.,
+	 * => previous 1+batch_size to map_size - batch_size will have been
 	 * removed by LRU
 	 */
-	key = 1 + tgt_free;
-	end_key = key + tgt_free;
+	key = 1 + __map_size(tgt_free);
+	end_key = key + __map_size(tgt_free);
 	for (; key < end_key; key++) {
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
@@ -301,17 +308,8 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 	printf("Pass\n");
 }
 
-/* Size of the LRU map 1.5 * tgt_free
- * Insert 1 to tgt_free (+tgt_free keys)
- * Update 1 to tgt_free/2
- *   => The original 1 to tgt_free/2 will be removed due to
- *      the LRU shrink process
- * Re-insert 1 to tgt_free/2 again and do a lookup immeidately
- * Insert 1+tgt_free to tgt_free*3/2
- * Insert 1+tgt_free*3/2 to tgt_free*5/2
- *   => Key 1+tgt_free to tgt_free*3/2
- *      will be removed from LRU because it has never
- *      been lookup and ref bit is not set
+/* Verify that insertions exceeding map size will recycle the oldest.
+ * Verify that unreferenced elements are recycled before referenced.
 */
 static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 {
@@ -334,7 +332,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 	batch_size = tgt_free / 2;
 	assert(batch_size * 2 == tgt_free);
 
-	map_size = tgt_free + batch_size;
+	map_size = __map_size(tgt_free) + batch_size;
 	lru_map_fd = create_map(map_type, map_flags, map_size);
 	assert(lru_map_fd != -1);
 
@@ -343,8 +341,8 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 
 	value[0] = 1234;
 
-	/* Insert 1 to tgt_free (+tgt_free keys) */
-	end_key = 1 + tgt_free;
+	/* Insert map_size - batch_size keys */
+	end_key = 1 + __map_size(tgt_free);
 	for (key = 1; key < end_key; key++)
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
@@ -357,8 +355,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 	 * shrink the inactive list to get tgt_free
 	 * number of free nodes.
 	 *
-	 * Hence, the oldest key 1 to tgt_free/2
-	 * are removed from the LRU list.
+	 * Hence, the oldest key is removed from the LRU list.
 	 */
 	key = 1;
 	if (map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
@@ -370,8 +367,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 					    BPF_EXIST));
 	}
 
-	/* Re-insert 1 to tgt_free/2 again and do a lookup
-	 * immeidately.
+	/* Re-insert 1 to batch_size again and do a lookup immediately.
 	 */
 	end_key = 1 + batch_size;
 	value[0] = 4321;
@@ -387,17 +383,18 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 
 	value[0] = 1234;
 
-	/* Insert 1+tgt_free to tgt_free*3/2 */
-	end_key = 1 + tgt_free + batch_size;
-	for (key = 1 + tgt_free; key < end_key; key++)
+	/* Insert batch_size new elements */
+	key = 1 + __map_size(tgt_free);
+	end_key = key + batch_size;
+	for (; key < end_key; key++)
 		/* These newly added but not referenced keys will be
 		 * gone during the next LRU shrink.
 		 */
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
 
-	/* Insert 1+tgt_free*3/2 to tgt_free*5/2 */
-	end_key = key + tgt_free;
+	/* Insert map_size - batch_size elements */
+	end_key += __map_size(tgt_free);
 	for (; key < end_key; key++) {
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
@@ -500,7 +497,8 @@ static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free)
 		lru_map_fd = create_map(map_type, map_flags,
 					3 * tgt_free * nr_cpus);
 	else
-		lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free);
+		lru_map_fd = create_map(map_type, map_flags,
+					3 * __map_size(tgt_free));
 	assert(lru_map_fd != -1);
 
 	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0,
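
A quick round-trip check of the new __map_size() helper (a standalone sketch; nr_cpus is hardcoded here for illustration, whereas the selftest determines the possible CPU count at runtime): sizing the map as tgt_free * nr_cpus * 2 makes the kernel's init-time clamp come back out to exactly tgt_free, for tgt_free up to 128, which is presumably why the tests now size their maps via this helper.

/* Round-trip check: __map_size() is the inverse of the kernel's
 * target_free computation. nr_cpus is a hardcoded example value here;
 * the real selftest reads it at runtime.
 */
#include <assert.h>
#include <stdio.h>

#define LOCAL_FREE_TARGET 128

static unsigned int nr_cpus = 4; /* example value */

static unsigned int __map_size(unsigned int tgt_free)
{
	return tgt_free * nr_cpus * 2;
}

static unsigned int kernel_target_free(unsigned int nr_elems)
{
	unsigned int v = (nr_elems / nr_cpus) / 2;

	return v < 1 ? 1 : (v > LOCAL_FREE_TARGET ? LOCAL_FREE_TARGET : v);
}

int main(void)
{
	unsigned int tgt_free = LOCAL_FREE_TARGET;
	unsigned int map_size = __map_size(tgt_free);

	/* map_size = 128 * 4 * 2 = 1024; (1024 / 4) / 2 = 128 again. */
	assert(kernel_target_free(map_size) == tgt_free);
	printf("map_size %u -> target_free %u\n",
	       map_size, kernel_target_free(map_size));
	return 0;
}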
