Skip to content

Commit 50a4cbc

Browse files
puranjaymohan authored and Kernel Patches Daemon committed
bpf: arena: populate vm_area without allocating memory
vm_area_map_pages() may allocate memory while inserting pages into bpf arena's vm_area. In order to make bpf_arena_alloc_pages() kfunc non-sleepable change bpf arena to populate pages without allocating memory: - at arena creation time populate all page table levels except the last level - when new pages need to be inserted call apply_to_page_range() again with apply_range_set_cb() which will only set_pte_at() those pages and will not allocate memory. - when freeing pages call apply_to_existing_page_range with apply_range_clear_cb() to clear the pte for the page to be removed. This doesn't free intermediate page table levels. Signed-off-by: Puranjay Mohan <[email protected]>
1 parent 4eb9670 commit 50a4cbc

File tree

1 file changed

+68
-6
lines changed

1 file changed

+68
-6
lines changed

kernel/bpf/arena.c

Lines changed: 68 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,63 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
9292
return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
9393
}
9494

95+
/*
 * Cookie passed to apply_range_set_cb() via apply_to_page_range():
 * the array of pages to install and a running cursor into it.
 */
struct apply_range_data {
	struct page **pages;	/* pages to map, consumed in order */
	int i;			/* index of the next page to install */
};
99+
100+
/*
 * Page-table walk callback: install the next page from the cookie at @addr.
 *
 * Called once per PTE slot by apply_to_page_range(). A NULL cookie turns the
 * callback into a no-op, which lets the same walk be used purely to allocate
 * intermediate page-table levels (see populate_pgtable_except_pte()).
 *
 * Returns 0 on success, -EBUSY if the slot is already mapped, -EINVAL if the
 * page's pfn is bogus.
 */
static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
{
	struct apply_range_data *ard = data;
	struct page *pg;

	/* No cookie: populate-tables-only walk, nothing to map. */
	if (!ard)
		return 0;

	/* sanity check: never overwrite a live mapping */
	if (unlikely(!pte_none(ptep_get(pte))))
		return -EBUSY;

	pg = ard->pages[ard->i++];
	/* paranoia, similar to vmap_pages_pte_range() */
	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(pg))))
		return -EINVAL;

	set_pte_at(&init_mm, addr, pte, mk_pte(pg, PAGE_KERNEL));
	return 0;
}
119+
120+
/*
 * Page-table walk callback: tear down the mapping at @addr and free its page.
 *
 * Called per PTE slot by apply_to_existing_page_range(). Empty or non-present
 * slots are skipped. Clears the PTE, flushes the kernel TLB for that single
 * page, then releases the page. Intermediate page-table levels are left in
 * place so they can be reused by later mappings.
 */
static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
{
	struct page *pg;
	pte_t old;
	struct mm_struct *mm = &init_mm;

	/* sanity check: skip slots with nothing mapped */
	old = ptep_get(pte);
	if (pte_none(old) || !pte_present(old))
		return 0; /* nothing to do */

	/* resolve the backing page so it can be freed below */
	pg = pte_page(old);
	if (WARN_ON_ONCE(!pg))
		return -EINVAL;

	pte_clear(mm, addr, pte);

	/* ensure no stale TLB entries */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);

	__free_page(pg);

	return 0;
}
145+
146+
static int populate_pgtable_except_pte(struct bpf_arena *arena)
147+
{
148+
return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
149+
KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
150+
}
151+
95152
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
96153
{
97154
struct vm_struct *kern_vm;
@@ -144,6 +201,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
144201
goto err;
145202
}
146203
mutex_init(&arena->lock);
204+
err = populate_pgtable_except_pte(arena);
205+
if (err)
206+
goto err;
147207

148208
return &arena->map;
149209
err:
@@ -286,14 +346,15 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
286346
if (ret)
287347
return VM_FAULT_SIGSEGV;
288348

349+
struct apply_range_data data = { .pages = &page, .i = 0 };
289350
/* Account into memcg of the process that created bpf_arena */
290351
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
291352
if (ret) {
292353
range_tree_set(&arena->rt, vmf->pgoff, 1);
293354
return VM_FAULT_SIGSEGV;
294355
}
295356

296-
ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
357+
ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
297358
if (ret) {
298359
range_tree_set(&arena->rt, vmf->pgoff, 1);
299360
__free_page(page);
@@ -428,7 +489,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
428489
/* user_vm_end/start are fixed before bpf prog runs */
429490
long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
430491
u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
431-
struct page **pages;
492+
struct page **pages = NULL;
432493
long pgoff = 0;
433494
u32 uaddr32;
434495
int ret, i;
@@ -465,6 +526,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
465526
if (ret)
466527
goto out_free_pages;
467528

529+
struct apply_range_data data = { .pages = pages, .i = 0 };
468530
ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
469531
if (ret)
470532
goto out;
@@ -477,8 +539,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
477539
* kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
478540
* lower 32-bit and it's ok.
479541
*/
480-
ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
481-
kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
542+
ret = apply_to_page_range(&init_mm, kern_vm_start + uaddr32,
543+
page_cnt << PAGE_SHIFT, apply_range_set_cb, &data);
482544
if (ret) {
483545
for (i = 0; i < page_cnt; i++)
484546
__free_page(pages[i]);
@@ -545,8 +607,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
545607
* page_cnt is big it's faster to do the batched zap.
546608
*/
547609
zap_pages(arena, full_uaddr, 1);
548-
vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
549-
__free_page(page);
610+
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
611+
apply_range_clear_cb, NULL);
550612
}
551613
}
552614

0 commit comments

Comments
 (0)