Commit aa87a85

puranjaymohan authored and Kernel Patches Daemon committed
bpf: arena: populate vm_area without allocating memory
vm_area_map_pages() may allocate memory while inserting pages into the bpf arena's vm_area. In order to make the bpf_arena_alloc_pages() kfunc non-sleepable, change bpf arena to populate pages without allocating memory:

- at arena creation time, populate all page table levels except the last level
- when new pages need to be inserted, call apply_to_page_range() again with apply_range_set_cb(), which will only set_pte_at() those pages and will not allocate memory
- when freeing pages, call apply_to_existing_page_range() with apply_range_clear_cb() to clear the pte for the page to be removed; this doesn't free intermediate page table levels

Signed-off-by: Puranjay Mohan <[email protected]>
1 parent 488318d commit aa87a85
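
To make the mechanism concrete before the diff, here is a condensed, hypothetical initcall-style sketch of the two-pass apply_to_page_range() pattern the commit message describes; it is not part of the patch. The demo_* names, the get_vm_area()/free_vm_area() setup with VM_MAP, and the module wrapper are invented for illustration, and the sketch assumes it is built into the kernel so that symbols such as init_mm are reachable. Pass 1 runs with data == NULL, so the per-PTE callback does nothing and the walk only allocates the intermediate page-table levels; pass 2 hands the callback a page and only does set_pte_at(), allocating no memory.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical sketch of the two-pass pattern; demo_* names are invented. */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <linux/pgtable.h>
#include <asm/tlbflush.h>

static struct vm_struct *demo_area;

/* Pass 1 (data == NULL): no-op per PTE, the walk only builds page tables.
 * Pass 2 (data == a page): install the page; nothing is allocated here.
 */
static int demo_set_pte(pte_t *pte, unsigned long addr, void *data)
{
	if (!data)
		return 0;
	if (WARN_ON_ONCE(!pte_none(ptep_get(pte))))
		return -EBUSY;
	set_pte_at(&init_mm, addr, pte, mk_pte((struct page *)data, PAGE_KERNEL));
	return 0;
}

/* Tear-down counterpart, mirroring the patch's apply_range_clear_cb(). */
static int demo_clear_pte(pte_t *pte, unsigned long addr, void *data)
{
	pte_t old = ptep_get(pte);

	if (pte_none(old) || !pte_present(old))
		return 0;
	pte_clear(&init_mm, addr, pte);
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
	__free_page(pte_page(old));
	return 0;
}

static int __init demo_init(void)
{
	unsigned long start;
	struct page *page;
	int err;

	demo_area = get_vm_area(PAGE_SIZE, VM_MAP);
	if (!demo_area)
		return -ENOMEM;
	start = (unsigned long)demo_area->addr;

	/* Pass 1: sleepable context, may allocate page-table pages. */
	err = apply_to_page_range(&init_mm, start, PAGE_SIZE, demo_set_pte, NULL);
	if (err)
		goto out_area;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		err = -ENOMEM;
		goto out_area;
	}

	/* Pass 2: only sets the PTE, no memory allocation. */
	err = apply_to_page_range(&init_mm, start, PAGE_SIZE, demo_set_pte, page);
	if (err) {
		__free_page(page);
		goto out_area;
	}
	return 0;

out_area:
	free_vm_area(demo_area);
	return err;
}

static void __exit demo_exit(void)
{
	unsigned long start = (unsigned long)demo_area->addr;

	apply_to_page_range(&init_mm, start, PAGE_SIZE, demo_clear_pte, NULL);
	free_vm_area(demo_area);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

In the patch below, populate_pgtable_except_pte() is that first pass, done once at arena creation, while apply_range_set_cb() and apply_range_clear_cb() are the per-page install and remove callbacks used by the fault handler, arena_alloc_pages() and arena_free_pages().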

1 file changed: 70 additions, 6 deletions

kernel/bpf/arena.c

@@ -7,6 +7,7 @@
 #include <linux/btf_ids.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
+#include <asm/tlbflush.h>
 #include "range_tree.h"
 
 /*
@@ -92,6 +93,62 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
 	return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
 }
 
+struct apply_range_data {
+	struct page **pages;
+	int i;
+};
+
+static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	struct apply_range_data *d = data;
+	struct page *page;
+
+	if (!data)
+		return 0;
+	/* sanity check */
+	if (unlikely(!pte_none(ptep_get(pte))))
+		return -EBUSY;
+
+	page = d->pages[d->i++];
+	/* paranoia, similar to vmap_pages_pte_range() */
+	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
+		return -EINVAL;
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+	return 0;
+}
+
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	pte_t old_pte;
+	struct page *page;
+
+	/* sanity check */
+	old_pte = ptep_get(pte);
+	if (pte_none(old_pte) || !pte_present(old_pte))
+		return 0; /* nothing to do */
+
+	/* get page and free it */
+	page = pte_page(old_pte);
+	if (WARN_ON_ONCE(!page))
+		return -EINVAL;
+
+	pte_clear(&init_mm, addr, pte);
+
+	/* ensure no stale TLB entries */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+	__free_page(page);
+
+	return 0;
+}
+
+static int populate_pgtable_except_pte(struct bpf_arena *arena)
+{
+	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+}
+
 static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 {
 	struct vm_struct *kern_vm;
@@ -144,6 +201,11 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 		goto err;
 	}
 	mutex_init(&arena->lock);
+	err = populate_pgtable_except_pte(arena);
+	if (err) {
+		bpf_map_area_free(arena);
+		goto err;
+	}
 
 	return &arena->map;
 err:
@@ -286,14 +348,15 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 	if (ret)
 		return VM_FAULT_SIGSEGV;
 
+	struct apply_range_data data = { .pages = &page, .i = 0 };
 	/* Account into memcg of the process that created bpf_arena */
 	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		return VM_FAULT_SIGSEGV;
 	}
 
-	ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		__free_page(page);
@@ -428,7 +491,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	/* user_vm_end/start are fixed before bpf prog runs */
 	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
 	u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
-	struct page **pages;
+	struct page **pages = NULL;
 	long pgoff = 0;
 	u32 uaddr32;
 	int ret, i;
@@ -465,6 +528,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	if (ret)
 		goto out_free_pages;
 
+	struct apply_range_data data = { .pages = pages, .i = 0 };
 	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
 	if (ret)
 		goto out;
@@ -477,8 +541,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
 	 * lower 32-bit and it's ok.
 	 */
-	ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
-				kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
+	ret = apply_to_page_range(&init_mm, kern_vm_start + uaddr32,
+				  page_cnt << PAGE_SHIFT, apply_range_set_cb, &data);
 	if (ret) {
 		for (i = 0; i < page_cnt; i++)
 			__free_page(pages[i]);
@@ -545,8 +609,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
 			 * page_cnt is big it's faster to do the batched zap.
 			 */
 			zap_pages(arena, full_uaddr, 1);
-		vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
-		__free_page(page);
+		apply_to_existing_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_clear_cb,
+					     NULL);
 	}
 }
 
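For the use case this enables (again a sketch, not part of this commit): once bpf_arena_alloc_pages() no longer requires a sleepable context, a BPF program attached to a non-sleepable hook could allocate and free arena pages directly. The kfunc declarations below mirror the BPF selftests' arena headers, and the kprobe attach point, the NUMA_NO_NODE define, and the one-page arena map are assumptions made for illustration.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical BPF program; declarations and attach point are assumptions. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define NUMA_NO_NODE	(-1)

void *bpf_arena_alloc_pages(void *map, void *addr__ign, __u32 page_cnt,
			    int node_id, __u64 flags) __ksym __weak;
void bpf_arena_free_pages(void *map, void *ptr__ign, __u32 page_cnt) __ksym __weak;

struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
	__uint(max_entries, 1);		/* arena sized to one page */
} arena SEC(".maps");

SEC("kprobe/do_sys_openat2")		/* a non-sleepable attach point */
int alloc_arena_page(struct pt_regs *ctx)
{
	void *page;

	/* Allocate one arena page; once the kfunc is non-sleepable this
	 * call must not allocate page-table memory, which is what the
	 * pre-populated levels above guarantee.
	 */
	page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
	if (!page)
		return 0;
	bpf_arena_free_pages(&arena, page, 1);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";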