Skip to content

Commit 88a24c0

Browse files
Pingfan LiuKernel Patches Daemon
authored andcommitted
bpf: Introduce bpf_copy_to_kernel() to buffer the content from bpf-prog
In the security kexec_file_load case, the buffer which holds the kernel image should not be accessible from the userspace. Typically, BPF data flow occurs between user space and kernel space in either direction. However, kexec_file_load presents a unique case where user-originated data must be parsed and then forwarded to the kernel for subsequent parsing stages. This necessitates a mechanism to channel the intermediate data from the BPF program directly to the kernel. bpf_copy_to_kernel() is introduced to serve that purpose. Signed-off-by: Pingfan Liu <[email protected]> Cc: Alexei Starovoitov <[email protected]> Cc: Daniel Borkmann <[email protected]> Cc: John Fastabend <[email protected]> Cc: Andrii Nakryiko <[email protected]> Cc: Martin KaFai Lau <[email protected]> Cc: Eduard Zingerman <[email protected]> Cc: Song Liu <[email protected]> Cc: Yonghong Song <[email protected]> Cc: KP Singh <[email protected]> Cc: Stanislav Fomichev <[email protected]> Cc: Hao Luo <[email protected]> Cc: Jiri Olsa <[email protected]> To: [email protected]
1 parent d4bab98 commit 88a24c0

File tree

4 files changed

+264
-0
lines changed

4 files changed

+264
-0
lines changed

include/linux/bpf.h

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3709,4 +3709,46 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char *
37093709
const char **linep, int *nump);
37103710
struct bpf_prog *bpf_prog_find_from_stack(void);
37113711

3712+
/*
 * Records how the buffer attached to a struct mem_range_result was obtained,
 * which in turn selects how the release callback frees it:
 *   TYPE_KALLOC  - freed with kfree()
 *   TYPE_VMALLOC - freed with vfree()
 *   TYPE_VMAP    - unmapped with vunmap(), then each backing page and the
 *                  page array itself are freed
 */
enum alloc_type {
3713+
TYPE_KALLOC,
3714+
TYPE_VMALLOC,
3715+
TYPE_VMAP,
3716+
};
3717+
3718+
/*
 * Reference-counted descriptor of a kernel buffer handed to a carrier
 * listener.
 *
 * @ref:        refcount; initialised by mem_range_result_alloc() and dropped
 *              with mem_range_result_put(), which frees @buf and the
 *              descriptor on the final put.
 * @buf:        start of the buffer; the release callback skips the buffer
 *              teardown when this is NULL.
 * @buf_sz:     size of @buf in bytes (bpf_copy_to_kernel() sets it to the
 *              copied size).
 * @data_sz:    amount of valid data in @buf (bpf_copy_to_kernel() sets it
 *              equal to @buf_sz).
 * @alloc_type: how @buf was obtained; selects the free routine.
 * @pages:      backing page array, only used for TYPE_VMAP.
 * @pg_cnt:     number of entries in @pages.
 * @status:     set to 0 by bpf_copy_to_kernel(); other values are presumably
 *              defined by other producers - TODO confirm.
 * @memcg:      memory cgroup made active while the buffer is released; the
 *              release callback drops one reference on it.
 */
struct mem_range_result {
3719+
struct kref ref;
3720+
char *buf;
3721+
uint32_t buf_sz;
3722+
uint32_t data_sz;
3723+
/* kmalloc-ed, vmalloc-ed, or vmap-ed */
3724+
enum alloc_type alloc_type;
3725+
/* Valid if vmap-ed */
3726+
struct page **pages;
3727+
unsigned int pg_cnt;
3728+
int status;
3729+
struct mem_cgroup *memcg;
3730+
};
3731+
3732+
struct mem_range_result *mem_range_result_alloc(void);
3733+
void mem_range_result_get(struct mem_range_result *r);
3734+
void mem_range_result_put(struct mem_range_result *r);
3735+
3736+
__bpf_kfunc int bpf_mem_range_result_put(struct mem_range_result *result);
3737+
__bpf_kfunc int bpf_copy_to_kernel(const char *name, char *buf, int size);
3738+
3739+
typedef int (*resource_handler)(const char *name, struct mem_range_result *r);
3740+
3741+
/*
 * A named consumer of data pushed through bpf_copy_to_kernel().
 *
 * @node:       linkage in the listener hash table, keyed by a hash of @name.
 * @name:       NUL-terminated lookup key; must be unique among registered
 *              listeners (a duplicate registration fails with -EBUSY).
 * @handler:    called by bpf_copy_to_kernel() with the listener name and the
 *              freshly filled mem_range_result.
 * @alloc_type: how bpf_copy_to_kernel() allocates the buffer for this
 *              listener; registration rejects anything above TYPE_VMALLOC.
 */
struct carrier_listener {
3742+
struct hlist_node node;
3743+
char *name;
3744+
resource_handler handler;
3745+
/*
3746+
* bpf_copy_to_kernel() knows the size in advance, so vmap-ed is not
3747+
* supported.
3748+
*/
3749+
enum alloc_type alloc_type;
3750+
};
3751+
3752+
int register_carrier_listener(struct carrier_listener *listener);
3753+
int unregister_carrier_listener(char *str);
37123754
#endif /* _LINUX_BPF_H */

kernel/bpf/Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
5656
ifeq ($(CONFIG_DMA_SHARED_BUFFER),y)
5757
obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o
5858
endif
59+
ifeq ($(CONFIG_KEXEC_PE_IMAGE),y)
60+
obj-$(CONFIG_BPF_SYSCALL) += helpers_carrier.o
61+
endif
5962

6063
CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE)
6164
CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)

kernel/bpf/helpers.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3720,6 +3720,10 @@ BTF_KFUNCS_START(generic_btf_ids)
37203720
#ifdef CONFIG_CRASH_DUMP
37213721
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
37223722
#endif
3723+
#ifdef CONFIG_KEXEC_PE_IMAGE
3724+
BTF_ID_FLAGS(func, bpf_mem_range_result_put, KF_RELEASE | KF_SLEEPABLE)
3725+
BTF_ID_FLAGS(func, bpf_copy_to_kernel, KF_TRUSTED_ARGS | KF_SLEEPABLE)
3726+
#endif
37233727
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
37243728
BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
37253729
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)

kernel/bpf/helpers_carrier.c

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
2+
#include <linux/bpf.h>
3+
#include <linux/bpf-cgroup.h>
4+
#include <linux/cgroup.h>
5+
#include <linux/rcupdate.h>
6+
#include <linux/hashtable.h>
7+
#include <linux/jhash.h>
8+
#include <linux/mutex.h>
9+
#include <linux/kref.h>
10+
#include <linux/vmalloc.h>
11+
#include <linux/slab.h>
12+
13+
DEFINE_STATIC_SRCU(srcu);
14+
static DEFINE_MUTEX(carrier_listeners_mutex);
15+
static DEFINE_HASHTABLE(carrier_listeners, 8);
16+
17+
static struct carrier_listener *find_listener(const char *str)
18+
{
19+
struct carrier_listener *item;
20+
unsigned int hash = jhash(str, strlen(str), 0);
21+
22+
hash_for_each_possible_rcu(carrier_listeners, item, node, hash) {
23+
if (strcmp(item->name, str) == 0)
24+
return item;
25+
}
26+
return NULL;
27+
}
28+
29+
static void __mem_range_result_free(struct kref *kref)
30+
{
31+
struct mem_range_result *result = container_of(kref, struct mem_range_result, ref);
32+
struct mem_cgroup *memcg, *old_memcg;
33+
34+
/* vunmap() is blocking */
35+
might_sleep();
36+
memcg = result->memcg;
37+
old_memcg = set_active_memcg(memcg);
38+
if (likely(!!result->buf)) {
39+
switch (result->alloc_type) {
40+
case TYPE_KALLOC:
41+
kfree(result->buf);
42+
break;
43+
case TYPE_VMALLOC:
44+
vfree(result->buf);
45+
break;
46+
case TYPE_VMAP:
47+
vunmap(result->buf);
48+
for (unsigned int i = 0; i < result->pg_cnt; i++)
49+
__free_pages(result->pages[i], 0);
50+
vfree(result->pages);
51+
}
52+
}
53+
kfree(result);
54+
set_active_memcg(old_memcg);
55+
mem_cgroup_put(memcg);
56+
}
57+
58+
struct mem_range_result *mem_range_result_alloc(void)
59+
{
60+
struct mem_range_result *range;
61+
62+
range = kmalloc(sizeof(struct mem_range_result), GFP_KERNEL);
63+
if (!range)
64+
return NULL;
65+
kref_init(&range->ref);
66+
return range;
67+
}
68+
69+
void mem_range_result_get(struct mem_range_result *r)
70+
{
71+
if (!r)
72+
return;
73+
kref_get(&r->ref);
74+
}
75+
76+
void mem_range_result_put(struct mem_range_result *r)
77+
{
78+
might_sleep();
79+
if (!r)
80+
return;
81+
kref_put(&r->ref, __mem_range_result_free);
82+
}
83+
84+
__bpf_kfunc int bpf_mem_range_result_put(struct mem_range_result *result)
85+
{
86+
mem_range_result_put(result);
87+
return 0;
88+
}
89+
90+
/*
91+
* Cache the content in @buf into kernel
92+
*/
93+
__bpf_kfunc int bpf_copy_to_kernel(const char *name, char *buf, int size)
94+
{
95+
struct mem_range_result *range;
96+
struct mem_cgroup *memcg, *old_memcg;
97+
struct carrier_listener *item;
98+
resource_handler handler;
99+
enum alloc_type alloc_type;
100+
char *kbuf;
101+
int id, ret = 0;
102+
103+
/*
104+
* This lock ensures no use of item after free and there is no in-flight
105+
* handler
106+
*/
107+
id = srcu_read_lock(&srcu);
108+
item = find_listener(name);
109+
if (!item) {
110+
srcu_read_unlock(&srcu, id);
111+
return -EINVAL;
112+
}
113+
alloc_type = item->alloc_type;
114+
handler = item->handler;
115+
memcg = get_mem_cgroup_from_current();
116+
old_memcg = set_active_memcg(memcg);
117+
range = mem_range_result_alloc();
118+
if (!range) {
119+
pr_err("fail to allocate mem_range_result\n");
120+
ret = -ENOMEM;
121+
goto err;
122+
}
123+
124+
switch (alloc_type) {
125+
case TYPE_KALLOC:
126+
kbuf = kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);
127+
break;
128+
case TYPE_VMALLOC:
129+
kbuf = __vmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);
130+
break;
131+
default:
132+
kfree(range);
133+
ret = -EINVAL;
134+
goto err;
135+
}
136+
if (!kbuf) {
137+
kfree(range);
138+
ret = -ENOMEM;
139+
goto err;
140+
}
141+
ret = copy_from_kernel_nofault(kbuf, buf, size);
142+
if (unlikely(ret < 0)) {
143+
if (range->alloc_type == TYPE_KALLOC)
144+
kfree(kbuf);
145+
else
146+
vfree(kbuf);
147+
kfree(range);
148+
ret = -EINVAL;
149+
goto err;
150+
}
151+
range->buf = kbuf;
152+
range->buf_sz = size;
153+
range->data_sz = size;
154+
range->memcg = memcg;
155+
mem_cgroup_tryget(memcg);
156+
range->status = 0;
157+
range->alloc_type = alloc_type;
158+
/* We exit the lock after the handler finishes */
159+
ret = handler(name, range);
160+
srcu_read_unlock(&srcu, id);
161+
mem_range_result_put(range);
162+
err:
163+
if (ret != 0)
164+
srcu_read_unlock(&srcu, id);
165+
set_active_memcg(old_memcg);
166+
mem_cgroup_put(memcg);
167+
return ret;
168+
}
169+
170+
int register_carrier_listener(struct carrier_listener *listener)
171+
{
172+
unsigned int hash;
173+
int ret = 0;
174+
char *str = listener->name;
175+
176+
/* Not support vmap-ed */
177+
if (listener->alloc_type > TYPE_VMALLOC)
178+
return -EINVAL;
179+
if (!str)
180+
return -EINVAL;
181+
hash = jhash(str, strlen(str), 0);
182+
mutex_lock(&carrier_listeners_mutex);
183+
if (!find_listener(str))
184+
hash_add_rcu(carrier_listeners, &listener->node, hash);
185+
else
186+
ret = -EBUSY;
187+
mutex_unlock(&carrier_listeners_mutex);
188+
189+
return ret;
190+
}
191+
EXPORT_SYMBOL(register_carrier_listener);
192+
193+
int unregister_carrier_listener(char *str)
194+
{
195+
struct carrier_listener *item;
196+
int ret = 0;
197+
198+
mutex_lock(&carrier_listeners_mutex);
199+
item = find_listener(str);
200+
if (!!item) {
201+
hash_del_rcu(&item->node);
202+
/*
203+
* It also waits on in-flight handler. Refer to note on the read
204+
* side
205+
*/
206+
synchronize_srcu(&srcu);
207+
} else {
208+
ret = -EINVAL;
209+
}
210+
mutex_unlock(&carrier_listeners_mutex);
211+
212+
return ret;
213+
}
214+
EXPORT_SYMBOL(unregister_carrier_listener);
215+

0 commit comments

Comments
 (0)