Skip to content

Commit a820301

Browse files
Pingfan LiuKernel Patches Daemon
authored and committed
bpf: Introduce decompressor kfunc
This commit bridges the gap between bpf-prog and the kernel decompression routines. At present, only a global memory allocator is used for the decompression. Later, if needed, the decompress_fn's prototype can be changed to pass in a task related allocator. This memory allocator can allocate 2MB each time with a transient virtual address, up to a 1GB limit. After decompression finishes, it presents all of the decompressed data in a new unified virtual address space. Signed-off-by: Pingfan Liu <[email protected]> Cc: Alexei Starovoitov <[email protected]> Cc: Daniel Borkmann <[email protected]> Cc: John Fastabend <[email protected]> Cc: Andrii Nakryiko <[email protected]> Cc: Martin KaFai Lau <[email protected]> Cc: Eduard Zingerman <[email protected]> Cc: Song Liu <[email protected]> Cc: Yonghong Song <[email protected]> Cc: KP Singh <[email protected]> Cc: Stanislav Fomichev <[email protected]> Cc: Hao Luo <[email protected]> Cc: Jiri Olsa <[email protected]> To: [email protected]
1 parent 88a24c0 commit a820301

File tree

1 file changed

+226
-0
lines changed

1 file changed

+226
-0
lines changed

kernel/bpf/helpers.c

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <linux/kasan.h>
2626
#include <linux/bpf_verifier.h>
2727
#include <linux/uaccess.h>
28+
#include <linux/decompress/generic.h>
2829

2930
#include "../../lib/kstrtox.h"
3031

@@ -3714,13 +3715,238 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
37143715
return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
37153716
}
37163717

3718+
#ifdef CONFIG_KEXEC_PE_IMAGE

/* Hard cap on the total decompressed output: 1 << 28 bytes (256 MiB). */
#define MAX_UNCOMPRESSED_BUF_SIZE (1 << 28)
/* a chunk should be large enough to contain a decompressing */
/* Active-chunk granularity: 1 << 23 bytes (8 MiB) mapped at a time. */
#define CHUNK_SIZE (1 << 23)

/*
 * At present, one global allocator for decompression. Later if needed, changing the
 * prototype of decompress_fn to introduce each task's allocator.
 */
static DEFINE_MUTEX(output_buf_mutex);

/*
 * Bookkeeping for decompressed output: the pages accumulated so far plus
 * the currently vmap-ed "active chunk" that flush() writes into.
 */
struct decompress_mem_allocator {
	/* page array with slots for MAX_UNCOMPRESSED_BUF_SIZE worth of pages */
	struct page **pages;
	/* number of pages allocated so far; next free slot in @pages */
	unsigned int pg_idx;
	/* vmap-ed base of the active chunk; NULL until the first flush() */
	void *chunk_start;
	/* byte size of each chunk (initialized to CHUNK_SIZE) */
	unsigned int chunk_size;
	/* write cursor inside the active chunk */
	void *chunk_cur;
};

/* Single global instance, serialized by output_buf_mutex. */
static struct decompress_mem_allocator dcmpr_allocator;
3739+
3740+
/*
3741+
* Set up an active chunk to hold partial decompressed data.
3742+
*/
3743+
static void *vmap_decompressed_chunk(void)
3744+
{
3745+
struct decompress_mem_allocator *a = &dcmpr_allocator;
3746+
unsigned int i, pg_cnt = a->chunk_size >> PAGE_SHIFT;
3747+
struct page **pg_start = &a->pages[a->pg_idx];
3748+
3749+
for (i = 0; i < pg_cnt; i++)
3750+
a->pages[a->pg_idx++] = alloc_page(GFP_KERNEL | __GFP_ACCOUNT);
3751+
3752+
return vmap(pg_start, pg_cnt, VM_MAP, PAGE_KERNEL);
3753+
}
3754+
3755+
/*
3756+
* Present the scattered pages containing decompressed data at a unified virtual
3757+
* address.
3758+
*/
3759+
static int decompress_mem_allocator_handover(struct decompress_mem_allocator *a,
3760+
struct mem_range_result *range)
3761+
{
3762+
unsigned long pg_array_sz = a->pg_idx * sizeof(struct page *);
3763+
3764+
range->pages = vmalloc(pg_array_sz);
3765+
if (!range->pages)
3766+
return -ENOMEM;
3767+
3768+
range->pg_cnt = a->pg_idx;
3769+
memcpy(range->pages, a->pages, pg_array_sz);
3770+
range->buf = vmap(range->pages, range->pg_cnt, VM_MAP, PAGE_KERNEL);
3771+
if (!range->buf) {
3772+
vfree(range->pages);
3773+
return -1;
3774+
}
3775+
/*
3776+
* Free the tracing pointer; The pages are freed when mem_range_result
3777+
* is released.
3778+
*/
3779+
vfree(a->pages);
3780+
a->pages = NULL;
3781+
3782+
/* vmap-ed */
3783+
range->alloc_type = TYPE_VMAP;
3784+
range->buf_sz = a->pg_idx << PAGE_SHIFT;
3785+
range->data_sz = range->buf_sz - a->chunk_size;
3786+
range->data_sz += a->chunk_cur - a->chunk_start;
3787+
3788+
return 0;
3789+
}
3790+
3791+
static int decompress_mem_allocator_init(
3792+
struct decompress_mem_allocator *allocator,
3793+
unsigned int chunk_size)
3794+
{
3795+
unsigned long sz = (MAX_UNCOMPRESSED_BUF_SIZE >> PAGE_SHIFT) * sizeof(struct page *);
3796+
3797+
allocator->pages = __vmalloc(sz, GFP_KERNEL | __GFP_ACCOUNT);
3798+
if (!allocator->pages)
3799+
return -ENOMEM;
3800+
3801+
allocator->pg_idx = 0;
3802+
allocator->chunk_start = NULL;
3803+
allocator->chunk_size = chunk_size;
3804+
allocator->chunk_cur = NULL;
3805+
return 0;
3806+
}
3807+
3808+
static void decompress_mem_allocator_fini(struct decompress_mem_allocator *allocator)
3809+
{
3810+
unsigned int i;
3811+
3812+
/* unmap the active chunk */
3813+
if (!!allocator->chunk_start)
3814+
vunmap(allocator->chunk_start);
3815+
if (!!allocator->pages) {
3816+
for (i = 0; i < allocator->pg_idx; i++)
3817+
__free_pages(allocator->pages[i], 0);
3818+
vfree(allocator->pages);
3819+
}
3820+
}
3821+
3822+
/*
3823+
* This is a callback for decompress_fn.
3824+
*
3825+
* It copies the partial decompressed content in [buf, buf + len) to dst. If the
3826+
* active chunk is not large enough, retire it and activate a new chunk to hold
3827+
* the remaining data.
3828+
*/
3829+
static long flush(void *buf, unsigned long len)
3830+
{
3831+
struct decompress_mem_allocator *a = &dcmpr_allocator;
3832+
long free, copied = 0;
3833+
3834+
/* The first time allocation */
3835+
if (unlikely(!a->chunk_start)) {
3836+
a->chunk_start = a->chunk_cur = vmap_decompressed_chunk();
3837+
if (unlikely(!a->chunk_start))
3838+
return -1;
3839+
}
3840+
3841+
free = a->chunk_start + a->chunk_size - a->chunk_cur;
3842+
BUG_ON(free < 0);
3843+
if (free < len) {
3844+
/*
3845+
* If the totoal size exceeds MAX_UNCOMPRESSED_BUF_SIZE,
3846+
* return -1 to indicate the decompress method that something
3847+
* is wrong
3848+
*/
3849+
if (unlikely((a->pg_idx >= MAX_UNCOMPRESSED_BUF_SIZE >> PAGE_SHIFT)))
3850+
return -1;
3851+
memcpy(a->chunk_cur, buf, free);
3852+
copied += free;
3853+
a->chunk_cur += free;
3854+
len -= free;
3855+
/*
3856+
* When retiring the active chunk, release its virtual address
3857+
* but do not release the contents in the pages.
3858+
*/
3859+
vunmap(a->chunk_start);
3860+
a->chunk_start = a->chunk_cur = vmap_decompressed_chunk();
3861+
if (unlikely(!a->chunk_start))
3862+
return -1;
3863+
}
3864+
memcpy(a->chunk_cur, buf, len);
3865+
copied += len;
3866+
a->chunk_cur += len;
3867+
return copied;
3868+
}
3869+
3870+
__bpf_kfunc struct mem_range_result *bpf_decompress(char *image_gz_payload, int image_gz_sz)
3871+
{
3872+
struct decompress_mem_allocator *a = &dcmpr_allocator;
3873+
decompress_fn decompressor;
3874+
struct mem_cgroup *memcg, *old_memcg;
3875+
struct mem_range_result *range;
3876+
const char *name;
3877+
char *input_buf;
3878+
int ret;
3879+
3880+
memcg = get_mem_cgroup_from_current();
3881+
old_memcg = set_active_memcg(memcg);
3882+
range = mem_range_result_alloc();
3883+
if (!range) {
3884+
pr_err("fail to allocate mem_range_result\n");
3885+
goto error;
3886+
}
3887+
3888+
input_buf = __vmalloc(image_gz_sz, GFP_KERNEL | __GFP_ACCOUNT);
3889+
if (!input_buf) {
3890+
kfree(range);
3891+
pr_err("fail to allocate input buffer\n");
3892+
goto error;
3893+
}
3894+
3895+
ret = copy_from_kernel_nofault(input_buf, image_gz_payload, image_gz_sz);
3896+
if (ret < 0) {
3897+
kfree(range);
3898+
vfree(input_buf);
3899+
pr_err("Error when copying from 0x%p, size:0x%x\n",
3900+
image_gz_payload, image_gz_sz);
3901+
goto error;
3902+
}
3903+
3904+
mutex_lock(&output_buf_mutex);
3905+
decompress_mem_allocator_init(a, CHUNK_SIZE);
3906+
decompressor = decompress_method(input_buf, image_gz_sz, &name);
3907+
if (!decompressor) {
3908+
kfree(range);
3909+
vfree(input_buf);
3910+
pr_err("Can not find decompress method\n");
3911+
goto error;
3912+
}
3913+
ret = decompressor(input_buf, image_gz_sz, NULL, flush,
3914+
NULL, NULL, NULL);
3915+
3916+
vfree(input_buf);
3917+
if (ret == 0) {
3918+
ret = decompress_mem_allocator_handover(a, range);
3919+
if (!!ret)
3920+
goto fail;
3921+
range->status = 0;
3922+
mem_cgroup_tryget(memcg);
3923+
range->memcg = memcg;
3924+
set_active_memcg(old_memcg);
3925+
}
3926+
fail:
3927+
decompress_mem_allocator_fini(a);
3928+
mutex_unlock(&output_buf_mutex);
3929+
if (!!ret) {
3930+
kfree(range);
3931+
range = NULL;
3932+
pr_err("Decompress error\n");
3933+
}
3934+
3935+
error:
3936+
set_active_memcg(old_memcg);
3937+
mem_cgroup_put(memcg);
3938+
return range;
3939+
}
3940+
#endif
3941+
37173942
__bpf_kfunc_end_defs();
37183943

37193944
BTF_KFUNCS_START(generic_btf_ids)
37203945
#ifdef CONFIG_CRASH_DUMP
37213946
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
37223947
#endif
37233948
#ifdef CONFIG_KEXEC_PE_IMAGE
3949+
BTF_ID_FLAGS(func, bpf_decompress, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_SLEEPABLE)
37243950
BTF_ID_FLAGS(func, bpf_mem_range_result_put, KF_RELEASE | KF_SLEEPABLE)
37253951
BTF_ID_FLAGS(func, bpf_copy_to_kernel, KF_TRUSTED_ARGS | KF_SLEEPABLE)
37263952
#endif

0 commit comments

Comments
 (0)