Commit a5eb229

sidchintamaneni authored and Kernel Patches Daemon committed
bpf: runtime part of fast-path termination approach
Update the softlockup detection logic to detect stalls caused by BPF programs. When a softlockup is detected, bpf_die is queued on a workqueue on a CPU. With this implementation the termination handler is only triggered when CONFIG_SOFTLOCKUP_DETECTOR is enabled. Inside bpf_die, we use text_poke to stub out helpers/kfuncs. The current implementation handles termination of long-running bpf_loop iterators in both the inlined and non-inlined cases. The limitation of this implementation is that the termination handler needs at least one CPU to run on.

Signed-off-by: Raj Sahu <[email protected]>
Signed-off-by: Siddharth Chintamaneni <[email protected]>
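For context only (not part of this patch): a minimal sketch of the kind of program the fast-path termination targets, namely a bpf_loop() whose callback never asks to stop. Program and function names below are illustrative, not taken from this series; once the termination handler stubs the callback with bpf_loop_term_callback() (which returns 1), the loop exits on the next iteration.

/* Illustrative BPF program only -- not from this commit. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

static long spin_cb(__u64 index, void *ctx)
{
	return 0;	/* 0 == keep looping */
}

SEC("xdp")
int long_running_prog(struct xdp_md *xdp)
{
	/* spin through a large number of no-op iterations */
	bpf_loop(1 << 23, spin_cb, NULL, 0);
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";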
1 parent bea88bd commit a5eb229

File tree

arch/x86/net/bpf_jit_comp.c
include/linux/bpf.h
include/linux/filter.h
kernel/bpf/core.c
kernel/watchdog.c

5 files changed: +182 -1 lines changed

arch/x86/net/bpf_jit_comp.c

Lines changed: 132 additions & 0 deletions
@@ -2606,6 +2606,10 @@ st: if (is_imm8(insn->off))
 			if (arena_vm_start)
 				pop_r12(&prog);
 			}
+			/* emitting 5 byte nop for non-inline bpf_loop callback */
+			if (bpf_is_subprog(bpf_prog) && bpf_prog->aux->is_bpf_loop_cb_non_inline) {
+				emit_nops(&prog, X86_PATCH_SIZE);
+			}
 			EMIT1(0xC9);	/* leave */
 			emit_return(&prog, image + addrs[i - 1] + (prog - temp));
 			break;
@@ -3833,6 +3837,8 @@ bool bpf_jit_supports_private_stack(void)
 	return true;
 }
 
+
+
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
 {
 #if defined(CONFIG_UNWINDER_ORC)
@@ -3849,6 +3855,132 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
 #endif
 }
 
+void in_place_patch_bpf_prog(struct bpf_prog *prog)
+{
+	struct call_aux_states *call_states;
+	unsigned long new_target;
+	unsigned char *addr;
+	u8 ret_jmp_size = 1;
+	if (cpu_wants_rethunk()) {
+		ret_jmp_size = 5;
+	}
+	call_states = prog->term_states->patch_call_sites->call_states;
+	for (int i = 0; i < prog->term_states->patch_call_sites->call_sites_cnt; i++) {
+
+		new_target = (unsigned long) bpf_termination_null_func;
+		if (call_states[i].is_bpf_loop_cb_inline) {
+			new_target = (unsigned long) bpf_loop_term_callback;
+		}
+		char new_insn[5];
+
+		addr = (unsigned char *)prog->bpf_func + call_states->jit_call_idx;
+
+		unsigned long new_rel = (unsigned long)(new_target - (unsigned long)(addr + 5));
+		new_insn[0] = 0xE8;
+		new_insn[1] = (new_rel >> 0) & 0xFF;
+		new_insn[2] = (new_rel >> 8) & 0xFF;
+		new_insn[3] = (new_rel >> 16) & 0xFF;
+		new_insn[4] = (new_rel >> 24) & 0xFF;
+
+		smp_text_poke_batch_add(addr, new_insn, 5 /* call instruction len */, NULL);
+	}
+
+	if (prog->aux->is_bpf_loop_cb_non_inline) {
+
+		char new_insn[5] = { 0xB8, 0x01, 0x00, 0x00, 0x00 };
+		char old_insn[5] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
+		smp_text_poke_batch_add(prog->bpf_func + prog->jited_len -
+			(1 + ret_jmp_size) /* leave, jmp/ ret */ - 5 /* nop size */, new_insn, 5 /* mov eax, 1 */, old_insn);
+	}
+
+
+	/* flush all text poke calls */
+	smp_text_poke_batch_finish();
+}
+
+void bpf_die(struct bpf_prog *prog)
+{
+	u8 ret_jmp_size = 1;
+	if (cpu_wants_rethunk()) {
+		ret_jmp_size = 5;
+	}
+
+	/*
+	 * Replacing 5 byte nop in prologue with jmp instruction to ret
+	 */
+	unsigned long jmp_offset = prog->jited_len - (4 /* First endbr is 4 bytes */
+				+ 5 /* nop is 5 bytes */
+				+ ret_jmp_size /* 5 bytes of jmp return_thunk or 1 byte ret */);
+
+	char new_insn[5];
+	new_insn[0] = 0xE9;
+	new_insn[1] = (jmp_offset >> 0) & 0xFF;
+	new_insn[2] = (jmp_offset >> 8) & 0xFF;
+	new_insn[3] = (jmp_offset >> 16) & 0xFF;
+	new_insn[4] = (jmp_offset >> 24) & 0xFF;
+
+	smp_text_poke_batch_add(prog->bpf_func + 4, new_insn, 5, NULL);
+
+	if (prog->aux->func_cnt) {
+		for (int i = 0; i < prog->aux->func_cnt; i++) {
+			in_place_patch_bpf_prog(prog->aux->func[i]);
+		}
+	} else {
+		in_place_patch_bpf_prog(prog);
+	}
+
+}
+
+void bpf_prog_termination_deferred(struct work_struct *work)
+{
+	struct bpf_term_aux_states *term_states = container_of(work, struct bpf_term_aux_states,
+							       work);
+	struct bpf_prog *prog = term_states->prog;
+
+	bpf_die(prog);
+}
+
+static struct workqueue_struct *bpf_termination_wq;
+
+void bpf_softlockup(u32 dur_s)
+{
+	unsigned long addr;
+	struct unwind_state state;
+	struct bpf_prog *prog;
+
+	for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state);
+	     unwind_next_frame(&state)) {
+		addr = unwind_get_return_address(&state);
+		if (!addr)
+			break;
+
+		if (!is_bpf_text_address(addr))
+			continue;
+
+		rcu_read_lock();
+		prog = bpf_prog_ksym_find(addr);
+		rcu_read_unlock();
+		if (bpf_is_subprog(prog))
+			continue;
+
+		if (atomic_cmpxchg(&prog->term_states->bpf_die_in_progress, 0, 1))
+			break;
+
+		bpf_termination_wq = alloc_workqueue("bpf_termination_wq", WQ_UNBOUND, 1);
+		if (!bpf_termination_wq)
+			pr_err("Failed to alloc workqueue for bpf termination.\n");
+
+		queue_work(bpf_termination_wq, &prog->term_states->work);
+
+		/* Currently nested programs are not terminated together.
+		 * Removing this break will result in BPF trampolines being
+		 * identified as is_bpf_text_address resulting in NULL ptr
+		 * deref in next step.
+		 */
+		break;
+	}
+}
+
 void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
			       struct bpf_prog *new, struct bpf_prog *old)
 {

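The patching in in_place_patch_bpf_prog() and bpf_die() hand-assembles x86-64 rel32 call/jmp instructions: opcode 0xE8 (call) or 0xE9 (jmp) followed by a 32-bit displacement measured from the end of the 5-byte instruction. A standalone sketch of that encoding, with illustrative names not taken from the patch:

#include <stdint.h>
#include <string.h>

/*
 * Build a 5-byte "call rel32" (0xE8) or "jmp rel32" (0xE9) at insn_addr
 * that transfers control to target. The displacement is relative to the
 * address of the *next* instruction, hence the "+ 5".
 */
static void encode_rel32(uint8_t opcode, uint8_t insn[5],
			 uintptr_t insn_addr, uintptr_t target)
{
	uint32_t rel = (uint32_t)(target - (insn_addr + 5));

	insn[0] = opcode;
	memcpy(&insn[1], &rel, sizeof(rel));	/* x86 is little-endian */
}

This is the same arithmetic as new_rel = new_target - (addr + 5) above; the byte-by-byte shifts in the patch produce the identical little-endian layout.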
include/linux/bpf.h

Lines changed: 2 additions & 0 deletions
@@ -71,6 +71,7 @@ typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
 typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
 typedef unsigned int (*bpf_func_t)(const void *,
				    const struct bpf_insn *);
+
 struct bpf_iter_seq_info {
 	const struct seq_operations *seq_ops;
 	bpf_iter_init_seq_priv_t init_seq_private;
@@ -1600,6 +1601,7 @@ struct bpf_term_patch_call_sites {
 struct bpf_term_aux_states {
 	struct bpf_prog *prog;
 	struct work_struct work;
+	atomic_t bpf_die_in_progress;
 	struct bpf_term_patch_call_sites *patch_call_sites;
 };
 
include/linux/filter.h

Lines changed: 6 additions & 0 deletions
@@ -1123,6 +1123,8 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
+void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx);
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 #define __bpf_call_base_args \
	((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
@@ -1257,6 +1259,10 @@ bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
 
 void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
 void bpf_prog_pack_free(void *ptr, u32 size);
+void bpf_softlockup(u32 dur_s);
+void bpf_prog_termination_deferred(struct work_struct *work);
+void bpf_die(struct bpf_prog *prog);
+void in_place_patch_bpf_prog(struct bpf_prog *prog);
 
 static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
 {

kernel/bpf/core.c

Lines changed: 34 additions & 1 deletion
@@ -41,6 +41,7 @@
 #include <linux/execmem.h>
 
 #include <asm/barrier.h>
+#include <asm/unwind.h>
 #include <linux/unaligned.h>
 
 /* Registers */
@@ -95,6 +96,37 @@ enum page_size_enum {
 	__PAGE_SIZE = PAGE_SIZE
 };
 
+void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return NULL;
+}
+
+int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx)
+{
+	return 1;
+}
+
+
+void __weak in_place_patch_bpf_prog(struct bpf_prog *prog)
+{
+	return;
+}
+
+void __weak bpf_die(struct bpf_prog *prog)
+{
+	return;
+}
+
+void __weak bpf_prog_termination_deferred(struct work_struct *work)
+{
+	return;
+}
+
+void __weak bpf_softlockup(u32 dur_s)
+{
+	return;
+}
+
 struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
 {
 	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
@@ -134,11 +166,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
 	fp->jit_requested = ebpf_jit_enabled();
 	fp->blinding_requested = bpf_jit_blinding_enabled(fp);
 	fp->term_states = term_states;
+	atomic_set(&fp->term_states->bpf_die_in_progress, 0);
 	fp->term_states->patch_call_sites = patch_call_sites;
 	fp->term_states->patch_call_sites->call_sites_cnt = 0;
 	fp->term_states->patch_call_sites->call_states = NULL;
 	fp->term_states->prog = fp;
-
+	INIT_WORK(&fp->term_states->work, bpf_prog_termination_deferred);
 #ifdef CONFIG_CGROUP_BPF
 	aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
 #endif

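The __weak definitions above act as link-time defaults: generic code always has a symbol to call, and an architecture that implements fast-path termination (x86 in this series) overrides them with strong definitions. A minimal illustration of the weak/strong override pattern, using made-up names:

/* generic.c: default used when no architecture provides an implementation */
void __attribute__((weak)) arch_terminate_hook(void)
{
	/* intentionally a no-op */
}

/* arch.c: a strong definition here replaces the weak one at link time */
void arch_terminate_hook(void)
{
	/* architecture-specific termination work would go here */
}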
kernel/watchdog.c

Lines changed: 8 additions & 0 deletions
@@ -25,6 +25,7 @@
 #include <linux/stop_machine.h>
 #include <linux/sysctl.h>
 #include <linux/tick.h>
+#include <linux/filter.h>
 
 #include <linux/sched/clock.h>
 #include <linux/sched/debug.h>
@@ -700,6 +701,13 @@ static int is_softlockup(unsigned long touch_ts,
 	if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
 		scx_softlockup(now - touch_ts);
 
+	/*
+	 * Long running BPF programs can cause CPUs to stall.
+	 * So trigger fast-path termination to terminate such BPF programs.
+	 */
+	if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
+		bpf_softlockup(now - touch_ts);
+
 	/* Warn about unreasonable delays. */
 	if (time_after(now, period_ts + get_softlockup_thresh()))
 		return now - touch_ts;
