diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 1074ac4459f2..0e1aa756dea9 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -3,10 +3,13 @@
 #include <linux/bpf.h>
 #include <linux/btf.h>
 #include <linux/err.h>
+#include
 #include "linux/filter.h"
+#include
 #include <linux/btf_ids.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
+#include
 #include "range_tree.h"
 
 /*
@@ -49,7 +52,18 @@ struct bpf_arena {
 	struct vm_struct *kern_vm;
 	struct range_tree rt;
 	struct list_head vma_list;
+	struct irq_work free_irq;
+	struct work_struct free_work;
+	struct llist_head free_spans;
+	refcount_t free_refs;
 	struct mutex lock;
+	rqspinlock_t spinlock;
+};
+
+struct arena_free_span {
+	struct llist_node node;
+	unsigned long uaddr;
+	u32 page_cnt;
 };
 
 u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
@@ -92,6 +106,126 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
 	return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
 }
 
+struct apply_range_data {
+	struct page **pages;
+	int i;
+};
+
+static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	struct apply_range_data *d = data;
+	struct page *page;
+
+	if (!data)
+		return 0;
+	/* sanity check */
+	if (unlikely(!pte_none(ptep_get(pte))))
+		return -EBUSY;
+
+	page = d->pages[d->i++];
+	/* paranoia, similar to vmap_pages_pte_range() */
+	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
+		return -EINVAL;
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+	return 0;
+}
+
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	struct mm_struct *mm = &init_mm;
+	pte_t old_pte;
+	struct page *page;
+
+	/* sanity check */
+	old_pte = ptep_get(pte);
+	if (pte_none(old_pte) || !pte_present(old_pte))
+		return 0; /* nothing to do */
+
+	/* get page and free it */
+	page = pte_page(old_pte);
+	if (WARN_ON_ONCE(!page))
+		return -EINVAL;
+
+	pte_clear(mm, addr, pte);
+
+	/* ensure no stale TLB entries */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+	__free_page(page);
+
+	return 0;
+}
+
+static int populate_pgtable_except_pte(struct bpf_arena *arena)
+{
+	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+}
+
+struct vma_list {
+	struct vm_area_struct *vma;
+	struct list_head head;
+	refcount_t mmap_count;
+};
+
+/*
+ * If page is present in vmalloc area, unmap it from vmalloc area,
+ * unmap it from all user space vma-s,
+ * and free it.
+ */
+static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+	struct vma_list *vml;
+
+	list_for_each_entry(vml, &arena->vma_list, head)
+		zap_page_range_single(vml->vma, uaddr,
+				      PAGE_SIZE * page_cnt, NULL);
+}
+
+static u64 clear_lo32(u64 val)
+{
+	return val & ~(u64)~0U;
+}
+
+static void arena_free_worker(struct work_struct *work)
+{
+	struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work);
+	struct llist_node *pos, *t;
+	struct arena_free_span *s;
+	unsigned long full_uaddr;
+	long kaddr, page_cnt, pgoff;
+
+	llist_for_each_safe(pos, t, llist_del_all(&arena->free_spans)) {
+		s = llist_entry(pos, struct arena_free_span, node);
+		page_cnt = s->page_cnt;
+		full_uaddr = clear_lo32(arena->user_vm_start) + s->uaddr;
+		kaddr = bpf_arena_get_kern_vm_start(arena) + s->uaddr;
+
+		zap_pages(arena, full_uaddr, page_cnt);
+
+		apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
					     apply_range_clear_cb, NULL);
+
+		flush_tlb_kernel_range(kaddr, kaddr + (page_cnt << PAGE_SHIFT));
+
+		pgoff = compute_pgoff(arena, s->uaddr);
+		if (!raw_res_spin_lock(&arena->spinlock)) {
+			range_tree_set(&arena->rt, pgoff, page_cnt);
+			raw_res_spin_unlock(&arena->spinlock);
+		}
+
+		refcount_dec(&arena->free_refs);
+		kfree_nolock(s);
+	}
+}
+
+static void arena_free_irq(struct irq_work *iw)
+{
+	struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq);
+	schedule_work(&arena->free_work);
+}
+
 static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 {
 	struct vm_struct *kern_vm;
@@ -136,6 +270,10 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 	arena->user_vm_end = arena->user_vm_start + vm_range;
 
 	INIT_LIST_HEAD(&arena->vma_list);
+	init_llist_head(&arena->free_spans);
+	init_irq_work(&arena->free_irq, arena_free_irq);
+	INIT_WORK(&arena->free_work, arena_free_worker);
+	refcount_set(&arena->free_refs, 1);
 	bpf_map_init_from_attr(&arena->map, attr);
 	range_tree_init(&arena->rt);
 	err = range_tree_set(&arena->rt, 0, attr->max_entries);
@@ -144,6 +282,10 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 		goto err;
 	}
 	mutex_init(&arena->lock);
+	raw_res_spin_lock_init(&arena->spinlock);
+	err = populate_pgtable_except_pte(arena);
+	if (err)
+		goto err;
 
 	return &arena->map;
 err:
@@ -184,6 +326,11 @@ static void arena_map_free(struct bpf_map *map)
 	if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
 		return;
 
+	/* Ensure no pending deferred frees */
+	flush_work(&arena->free_work);
+	while (!refcount_dec_and_test(&arena->free_refs))
+		cpu_relax();
+
 	/*
 	 * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
 	 * It unmaps everything from vmalloc area and clears pgtables.
@@ -219,12 +366,6 @@ static u64 arena_map_mem_usage(const struct bpf_map *map)
 	return 0;
 }
 
-struct vma_list {
-	struct vm_area_struct *vma;
-	struct list_head head;
-	refcount_t mmap_count;
-};
-
 static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
 {
 	struct vma_list *vml;
@@ -272,7 +413,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 	kbase = bpf_arena_get_kern_vm_start(arena);
 	kaddr = kbase + (u32)(vmf->address);
 
-	guard(mutex)(&arena->lock);
+	if (raw_res_spin_lock(&arena->spinlock))
+		return VM_FAULT_RETRY;
 	page = vmalloc_to_page((void *)kaddr);
 	if (page)
 		/* already have a page vmap-ed */
@@ -280,29 +422,34 @@
 	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
 		/* User space requested to segfault when page is not allocated by bpf prog */
-		return VM_FAULT_SIGSEGV;
+		goto out_unlock_sigsegv;
 
 	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
 	if (ret)
-		return VM_FAULT_SIGSEGV;
+		goto out_unlock_sigsegv;
 
+	struct apply_range_data data = { .pages = &page, .i = 0 };
 	/* Account into memcg of the process that created bpf_arena */
 	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
-		return VM_FAULT_SIGSEGV;
+		goto out_unlock_sigsegv;
 	}
 
-	ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		__free_page(page);
-		return VM_FAULT_SIGSEGV;
+		goto out_unlock_sigsegv;
 	}
 out:
 	page_ref_add(page, 1);
 	vmf->page = page;
+	raw_res_spin_unlock(&arena->spinlock);
 	return 0;
+out_unlock_sigsegv:
+	raw_res_spin_unlock(&arena->spinlock);
+	return VM_FAULT_SIGSEGV;
 }
 
 static const struct vm_operations_struct arena_vm_ops = {
@@ -414,11 +561,6 @@ const struct bpf_map_ops arena_map_ops = {
 	.map_btf_id = &bpf_arena_map_btf_ids[0],
 };
 
-static u64 clear_lo32(u64 val)
-{
-	return val & ~(u64)~0U;
-}
-
 /*
  * Allocate pages and vmap them into kernel vmalloc area.
  * Later the pages will be mmaped into user space vma.
@@ -428,7 +570,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	/* user_vm_end/start are fixed before bpf prog runs */
 	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
 	u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
-	struct page **pages;
+	struct page **pages = NULL;
+	unsigned long flags;
 	long pgoff = 0;
 	u32 uaddr32;
 	int ret, i;
@@ -446,28 +589,33 @@
 	}
 
 	/* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
-	pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
+	pages = kmalloc_nolock(page_cnt * sizeof(struct page *), __GFP_ZERO, 0);
 	if (!pages)
 		return 0;
 
-	guard(mutex)(&arena->lock);
+	struct apply_range_data data = { .pages = pages, .i = 0 };
+	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
+	if (ret)
+		goto out_free;
+	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
+		goto out_free_pages;
 	if (uaddr) {
 		ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
-		if (ret)
+		if (ret) {
+			raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 			goto out_free_pages;
+		}
 		ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
 	} else {
 		ret = pgoff = range_tree_find(&arena->rt, page_cnt);
 		if (pgoff >= 0)
 			ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
 	}
-	if (ret)
+	if (ret) {
+		raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 		goto out_free_pages;
-
-	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
-	if (ret)
-		goto out;
+	}
 
 	uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
 	/* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
@@ -477,41 +625,28 @@
 	 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
 	 * lower 32-bit and it's ok.
 	 */
-	ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
-				kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
-	if (ret) {
-		for (i = 0; i < page_cnt; i++)
-			__free_page(pages[i]);
+	ret = apply_to_page_range(&init_mm, kern_vm_start + uaddr32,
+				  page_cnt << PAGE_SHIFT, apply_range_set_cb, &data);
+	if (ret)
 		goto out;
-	}
-	kvfree(pages);
+	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+	kfree_nolock(pages);
 	return clear_lo32(arena->user_vm_start) + uaddr32;
 out:
 	range_tree_set(&arena->rt, pgoff, page_cnt);
+	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 out_free_pages:
-	kvfree(pages);
+	for (i = 0; i < page_cnt; i++)
+		free_pages_nolock(pages[i], 0);
+out_free:
+	kfree_nolock(pages);
 	return 0;
 }
 
-/*
- * If page is present in vmalloc area, unmap it from vmalloc area,
- * unmap it from all user space vma-s,
- * and free it.
- */
-static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
-{
-	struct vma_list *vml;
-
-	list_for_each_entry(vml, &arena->vma_list, head)
-		zap_page_range_single(vml->vma, uaddr,
-				      PAGE_SIZE * page_cnt, NULL);
-}
-
 static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
 {
 	u64 full_uaddr, uaddr_end;
-	long kaddr, pgoff, i;
-	struct page *page;
+	struct arena_free_span *s;
 
 	/* only aligned lower 32-bit are relevant */
 	uaddr = (u32)uaddr;
@@ -523,31 +658,16 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
 
 	page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
 
-	guard(mutex)(&arena->lock);
-
-	pgoff = compute_pgoff(arena, uaddr);
-	/* clear range */
-	range_tree_set(&arena->rt, pgoff, page_cnt);
+	s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ZERO, 0);
+	if (!s)
+		return;
 
-	if (page_cnt > 1)
-		/* bulk zap if multiple pages being freed */
-		zap_pages(arena, full_uaddr, page_cnt);
+	s->page_cnt = page_cnt;
+	s->uaddr = uaddr;
 
-	kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
-	for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
-		page = vmalloc_to_page((void *)kaddr);
-		if (!page)
-			continue;
-		if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
-			/* Optimization for the common case of page_cnt==1:
-			 * If page wasn't mapped into some user vma there
-			 * is no need to call zap_pages which is slow. When
-			 * page_cnt is big it's faster to do the batched zap.
-			 */
-			zap_pages(arena, full_uaddr, 1);
-		vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
-		__free_page(page);
-	}
+	refcount_inc(&arena->free_refs);
+	llist_add(&s->node, &arena->free_spans);
+	irq_work_queue(&arena->free_irq);
 }
 
 /*
@@ -558,6 +678,7 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt
 {
 	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
 	long pgoff;
+	unsigned long flags;
 	int ret;
 
 	if (uaddr & ~PAGE_MASK)
 		return -EINVAL;
@@ -567,15 +688,22 @@
 	if (pgoff + page_cnt > page_cnt_max)
 		return -EINVAL;
 
-	guard(mutex)(&arena->lock);
+	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
+		return -EBUSY;
 
 	/* Cannot guard already allocated pages. */
 	ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
-	if (ret)
-		return -EBUSY;
+	if (ret) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
 
 	/* "Allocate" the region to prevent it from being allocated. */
-	return range_tree_clear(&arena->rt, pgoff, page_cnt);
+	ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+
+out_unlock:
+	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+	return ret;
 }
 
 __bpf_kfunc_start_defs();
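
The deferred-free path above boils down to a lock-free push from arena_free_pages (llist_add + irq_work_queue) and a batch drain in arena_free_worker (llist_del_all). The following standalone userspace C sketch mirrors only that producer/consumer shape so the flow is easy to follow outside the kernel; struct free_span, push_span and drain_spans are illustrative stand-ins (my naming, not the kernel's llist API), and ordinary malloc/free replace kmalloc_nolock and the real page teardown.

/* cc -std=c11 -o spans spans.c */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct free_span {
	struct free_span *next;
	unsigned long uaddr;
	unsigned int page_cnt;
};

/* lock-free stack head, analogous to arena->free_spans */
static _Atomic(struct free_span *) free_spans;

/* analogous to arena_free_pages(): record the span and hand it off */
static void push_span(unsigned long uaddr, unsigned int page_cnt)
{
	struct free_span *s = malloc(sizeof(*s));

	if (!s)
		return;
	s->uaddr = uaddr;
	s->page_cnt = page_cnt;
	/* llist_add()-style push: retry until the head CAS succeeds */
	s->next = atomic_load(&free_spans);
	while (!atomic_compare_exchange_weak(&free_spans, &s->next, s))
		;
	/* the real code now queues irq_work, which schedules the worker */
}

/* analogous to arena_free_worker(): take the whole list at once, then walk it */
static void drain_spans(void)
{
	struct free_span *pos = atomic_exchange(&free_spans, NULL);

	while (pos) {
		struct free_span *next = pos->next;

		/* stands in for zap_pages() + apply_to_existing_page_range() */
		printf("freeing %u page(s) at 0x%lx\n", pos->page_cnt, pos->uaddr);
		free(pos);
		pos = next;
	}
}

int main(void)
{
	push_span(0x1000, 1);
	push_span(0x4000, 4);
	drain_spans();
	return 0;
}

The point of the two-stage handoff is the same as in the patch: the caller never sleeps or takes a sleeping lock, and all the heavy work (zapping user mappings, clearing kernel PTEs, flushing TLBs) happens later in process context.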
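A second point the diff relies on is that raw_res_spin_lock()/raw_res_spin_lock_irqsave() can fail, unlike the mutex guard they replace, so every caller gains an error path (VM_FAULT_RETRY in the fault handler, -EBUSY or silently dropping the request elsewhere). A minimal userspace sketch of that calling convention, assuming pthread_mutex_trylock as a stand-in for the resilient lock and a hypothetical reserve() helper in place of arena_reserve_pages():

/* cc -std=c11 -pthread -o trylock trylock.c */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for raw_res_spin_lock(): 0 on success, nonzero if it cannot be taken */
static int res_lock(void)
{
	return pthread_mutex_trylock(&lock);
}

static void res_unlock(void)
{
	pthread_mutex_unlock(&lock);
}

/* mirrors the shape of arena_reserve_pages(): bail out instead of blocking */
static int reserve(void)
{
	if (res_lock())
		return -EBUSY;
	/* ... mutate the range tree under the lock ... */
	res_unlock();
	return 0;
}

int main(void)
{
	printf("reserve() -> %d\n", reserve());
	return 0;
}

This is why the conversion is more than a lock swap: each former guard(mutex) site had to grow an explicit unlock label or early return for the case where the lock is never acquired.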