
Commit 3a6fa57

Author: Alexei Starovoitov

Merge branch 'timed-may_goto'
Kumar Kartikeya Dwivedi says:

====================
Timed may_goto

This series replaces the current implementation of cond_break, which uses
the may_goto instruction and counts 8 million iterations per stack frame,
with an implementation based on sampling time locally on the CPU. This is
done to permit a longer time for a given loop per program invocation. The
accounting is still done per stack frame, but the count is now used to
amortize the cost of the logic that samples and checks the time spent
since the start. This is needed for expressing more complicated algorithms
(spin locks, waiting loops, etc.) in BPF programs without false positive
expiration of the loop. For instance, the plan is to make use of this for
implementing spin locks for BPF arena [0].

For the following loop:

  for (int i = 0;; i++) {}

testing on a bare-metal Sapphire Rapids Intel server yields the following
table (taking an average of 25 runs).

+------------------------------+--------------+--------------+------------------+
| Loop type                    | Iterations   | Time (ms)    | Time/iter (ns)   |
+------------------------------+--------------+--------------+------------------+
| may_goto                     | 8388608      | 3            | 0.36             |
| timed_may_goto (count=65535) | 589674932    | 250          | 0.42             |
| bpf_for                      | 8388608      | 10           | 1.19             |
+------------------------------+--------------+--------------+------------------+

Here, count is used to amortize the time sampling and checking logic.
Obviously, this is the limit of an empty loop; depending on the complexity
of the loop body, the time spent in the loop can be longer. Cancellations
will address the task of imposing an upper bound on program runtime. For
now, the implementation only supports x86.

[0]: https://lore.kernel.org/bpf/[email protected]

Changelog:
----------
v1 -> v2
v1: https://lore.kernel.org/bpf/[email protected]

 * Address comments from Alexei
   * Use kernel comment style for new code.
   * Remove p->count == 0 check in bpf_check_timed_may_goto.
   * Add comments on AX as argument/retval calling convention.
   * Add comments describing how the counting logic works.
   * Use BPF_EMIT_CALL instead of open-coding instruction encoding.
   * Change "if ax != 1 goto pc+X" condition to "if ax != 0 goto pc+X".
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
Parents: 2941e21 + 2fb7618
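
For orientation, a minimal usage sketch (not part of this commit) of the pattern the series targets: an open-ended BPF loop bounded by cond_break, which expands to the may_goto instruction that the verifier rewrites below. It assumes the cond_break macro from the selftests' bpf_experimental.h; the program name and the "done" flag are illustrative only.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"	/* provides cond_break (selftests header) */

char _license[] SEC("license") = "GPL";

volatile bool done;	/* hypothetical flag flipped by another program or user space */

SEC("raw_tp")
int waiting_loop(void *ctx)
{
	for (int i = 0;; i++) {
		if (done)
			break;
		/*
		 * cond_break emits the may_goto insn; with timed may_goto the
		 * loop is cut off after roughly 250 ms of loop time per stack
		 * frame instead of after 8 million iterations.
		 */
		cond_break;
	}
	return 0;
}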

9 files changed, 226 insertions(+), 21 deletions(-)

arch/x86/net/Makefile

Lines changed: 1 addition & 1 deletion
@@ -6,5 +6,5 @@
 ifeq ($(CONFIG_X86_32),y)
 	obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
 else
-	obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o
+	obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o
 endif

arch/x86/net/bpf_jit_comp.c

Lines changed: 5 additions & 0 deletions
@@ -3791,3 +3791,8 @@ u64 bpf_arch_uaddress_limit(void)
 {
 	return 0;
 }
+
+bool bpf_jit_supports_timed_may_goto(void)
+{
+	return true;
+}

arch/x86/net/bpf_timed_may_goto.S

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/export.h>
+#include <linux/linkage.h>
+#include <asm/nospec-branch.h>
+
+	.code64
+	.section .text, "ax"
+
+SYM_FUNC_START(arch_bpf_timed_may_goto)
+	ANNOTATE_NOENDBR
+
+	/* Save r0-r5. */
+	pushq %rax
+	pushq %rdi
+	pushq %rsi
+	pushq %rdx
+	pushq %rcx
+	pushq %r8
+
+	/*
+	 * r10 passes us stack depth, load the pointer to count and timestamp as
+	 * first argument to the call below.
+	 */
+	leaq (%rbp, %r10, 1), %rdi
+
+	/* Emit call depth accounting for call below. */
+	CALL_DEPTH_ACCOUNT
+	call bpf_check_timed_may_goto
+
+	/* BPF_REG_AX=r10 will be stored into count, so move return value to it. */
+	movq %rax, %r10
+
+	/* Restore r5-r0. */
+	popq %r8
+	popq %rcx
+	popq %rdx
+	popq %rsi
+	popq %rdi
+	popq %rax
+
+	RET
+SYM_FUNC_END(arch_bpf_timed_may_goto)

include/linux/bpf.h

Lines changed: 1 addition & 0 deletions
@@ -1987,6 +1987,7 @@ struct bpf_array {
  */
 enum {
 	BPF_MAX_LOOPS = 8 * 1024 * 1024,
+	BPF_MAX_TIMED_LOOPS = 0xffff,
 };
 
 #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \

include/linux/filter.h

Lines changed: 8 additions & 0 deletions
@@ -669,6 +669,11 @@ struct bpf_prog_stats {
 	struct u64_stats_sync syncp;
 } __aligned(2 * sizeof(u64));
 
+struct bpf_timed_may_goto {
+	u64 count;
+	u64 timestamp;
+};
+
 struct sk_filter {
 	refcount_t refcnt;
 	struct rcu_head rcu;
@@ -1130,8 +1135,11 @@ bool bpf_jit_supports_ptr_xchg(void);
 bool bpf_jit_supports_arena(void);
 bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
 bool bpf_jit_supports_private_stack(void);
+bool bpf_jit_supports_timed_may_goto(void);
 u64 bpf_arch_uaddress_limit(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
+u64 arch_bpf_timed_may_goto(void);
+u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *);
 bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)

kernel/bpf/core.c

Lines changed: 26 additions & 0 deletions
@@ -3069,6 +3069,32 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
 {
 }
 
+bool __weak bpf_jit_supports_timed_may_goto(void)
+{
+	return false;
+}
+
+u64 __weak arch_bpf_timed_may_goto(void)
+{
+	return 0;
+}
+
+u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
+{
+	u64 time = ktime_get_mono_fast_ns();
+
+	/* Populate the timestamp for this stack frame, and refresh count. */
+	if (!p->timestamp) {
+		p->timestamp = time;
+		return BPF_MAX_TIMED_LOOPS;
+	}
+	/* Check if we've exhausted our time slice, and zero count. */
+	if (time - p->timestamp >= (NSEC_PER_SEC / 4))
+		return 0;
+	/* Refresh the count for the stack frame. */
+	return BPF_MAX_TIMED_LOOPS;
+}
+
 /* for configs without MMU or 32-bit */
 __weak const struct bpf_map_ops arena_map_ops;
 __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
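
A rough back-of-the-envelope check, not part of the patch, of how the constants above relate to the benchmark table in the commit message:

	time slice per stack frame  = NSEC_PER_SEC / 4    = 250 ms
	iterations between samples  = BPF_MAX_TIMED_LOOPS = 65535
	empty-loop iterations       ~ 589674932 (from the table)
	calls into the helper       ~ 589674932 / 65535   ~ 9000

So ktime_get_mono_fast_ns() is sampled roughly once every 65535 iterations (about every 28 us in the empty-loop case), which keeps the per-iteration cost (0.42 ns) close to the plain may_goto counter (0.36 ns).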

kernel/bpf/verifier.c

Lines changed: 61 additions & 8 deletions
@@ -21572,7 +21572,50 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto next_insn;
 		}
 
-		if (is_may_goto_insn(insn)) {
+		if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
+			int stack_off_cnt = -stack_depth - 16;
+
+			/*
+			 * Two 8 byte slots, depth-16 stores the count, and
+			 * depth-8 stores the start timestamp of the loop.
+			 *
+			 * The starting value of count is BPF_MAX_TIMED_LOOPS
+			 * (0xffff). Every iteration loads it and subs it by 1,
+			 * until the value becomes 0 in AX (thus, 1 in stack),
+			 * after which we call arch_bpf_timed_may_goto, which
+			 * either sets AX to 0xffff to keep looping, or to 0
+			 * upon timeout. AX is then stored into the stack. In
+			 * the next iteration, we either see 0 and break out, or
+			 * continue iterating until the next time value is 0
+			 * after subtraction, rinse and repeat.
+			 */
+			stack_depth_extra = 16;
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
+			if (insn->off >= 0)
+				insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
+			else
+				insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+			insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+			insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
+			/*
+			 * AX is used as an argument to pass in stack_off_cnt
+			 * (to add to r10/fp), and also as the return value of
+			 * the call to arch_bpf_timed_may_goto.
+			 */
+			insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
+			insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
+			insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
+			cnt = 7;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto next_insn;
+		} else if (is_may_goto_insn(insn)) {
 			int stack_off = -stack_depth - 8;
 
 			stack_depth_extra = 8;
@@ -22113,23 +22156,33 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 
 	env->prog->aux->stack_depth = subprogs[0].stack_depth;
 	for (i = 0; i < env->subprog_cnt; i++) {
+		int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
 		int subprog_start = subprogs[i].start;
 		int stack_slots = subprogs[i].stack_extra / 8;
+		int slots = delta, cnt = 0;
 
 		if (!stack_slots)
 			continue;
-		if (stack_slots > 1) {
+		/* We need two slots in case timed may_goto is supported. */
+		if (stack_slots > slots) {
 			verbose(env, "verifier bug: stack_slots supports may_goto only\n");
 			return -EFAULT;
 		}
 
-		/* Add ST insn to subprog prologue to init extra stack */
-		insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
-					 -subprogs[i].stack_depth, BPF_MAX_LOOPS);
+		stack_depth = subprogs[i].stack_depth;
+		if (bpf_jit_supports_timed_may_goto()) {
+			insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+						     BPF_MAX_TIMED_LOOPS);
+			insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
+		} else {
+			/* Add ST insn to subprog prologue to init extra stack */
+			insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+						     BPF_MAX_LOOPS);
+		}
 		/* Copy first actual insn to preserve it */
-		insn_buf[1] = env->prog->insnsi[subprog_start];
+		insn_buf[cnt++] = env->prog->insnsi[subprog_start];
 
-		new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
+		new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
 		if (!new_prog)
 			return -ENOMEM;
 		env->prog = prog = new_prog;
@@ -22139,7 +22192,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		 * to insn after BPF_ST that inits may_goto count.
 		 * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
 		 */
-		WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1));
+		WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
 	}
 
 	/* Since poke tab is now finalized, publish aux to tracker. */
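
For readability, an approximate C-level rendering of the seven-instruction expansion patched in above. This is a sketch only: AX stands for the BPF_REG_AX scratch register, "past_the_loop" stands for the jump target derived from insn->off, and the actual emitted code is the insn_buf sequence in the diff.

	u64 ax = *(u64 *)(fp + stack_off_cnt);	/* insn_buf[0]: load count */
	if (ax == 0)				/* insn_buf[1]: slice expired on an earlier check */
		goto past_the_loop;
	ax -= 1;				/* insn_buf[2] */
	if (ax == 0) {				/* insn_buf[3] skips the call while count is non-zero */
		/* count exhausted: sample time; returns 0xffff to continue or 0 on timeout */
		ax = arch_bpf_timed_may_goto();	/* insn_buf[4] + insn_buf[5]; AX carries stack_off_cnt in */
	}
	*(u64 *)(fp + stack_off_cnt) = ax;	/* insn_buf[6]: store refreshed count */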

tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c

Lines changed: 48 additions & 10 deletions
@@ -620,23 +620,61 @@ __naked void helper_call_does_not_prevent_bpf_fastcall(void)
 
 SEC("raw_tp")
 __arch_x86_64
+__log_level(4) __msg("stack depth 24")
+/* may_goto counter at -24 */
+__xlated("0: *(u64 *)(r10 -24) =")
+/* may_goto timestamp at -16 */
+__xlated("1: *(u64 *)(r10 -16) =")
+__xlated("2: r1 = 1")
+__xlated("...")
+__xlated("4: r0 = &(void __percpu *)(r0)")
+__xlated("...")
+/* may_goto expansion starts */
+__xlated("6: r11 = *(u64 *)(r10 -24)")
+__xlated("7: if r11 == 0x0 goto pc+6")
+__xlated("8: r11 -= 1")
+__xlated("9: if r11 != 0x0 goto pc+2")
+__xlated("10: r11 = -24")
+__xlated("11: call unknown")
+__xlated("12: *(u64 *)(r10 -24) = r11")
+/* may_goto expansion ends */
+__xlated("13: *(u64 *)(r10 -8) = r1")
+__xlated("14: exit")
+__success
+__naked void may_goto_interaction_x86_64(void)
+{
+	asm volatile (
+	"r1 = 1;"
+	"*(u64 *)(r10 - 16) = r1;"
+	"call %[bpf_get_smp_processor_id];"
+	"r1 = *(u64 *)(r10 - 16);"
+	".8byte %[may_goto];"
+	/* just touch some stack at -8 */
+	"*(u64 *)(r10 - 8) = r1;"
+	"exit;"
+	:
+	: __imm(bpf_get_smp_processor_id),
+	  __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0))
+	: __clobber_all);
+}
+
+SEC("raw_tp")
+__arch_arm64
 __log_level(4) __msg("stack depth 16")
 /* may_goto counter at -16 */
 __xlated("0: *(u64 *)(r10 -16) =")
 __xlated("1: r1 = 1")
-__xlated("...")
-__xlated("3: r0 = &(void __percpu *)(r0)")
-__xlated("...")
+__xlated("2: call bpf_get_smp_processor_id")
 /* may_goto expansion starts */
-__xlated("5: r11 = *(u64 *)(r10 -16)")
-__xlated("6: if r11 == 0x0 goto pc+3")
-__xlated("7: r11 -= 1")
-__xlated("8: *(u64 *)(r10 -16) = r11")
+__xlated("3: r11 = *(u64 *)(r10 -16)")
+__xlated("4: if r11 == 0x0 goto pc+3")
+__xlated("5: r11 -= 1")
+__xlated("6: *(u64 *)(r10 -16) = r11")
 /* may_goto expansion ends */
-__xlated("9: *(u64 *)(r10 -8) = r1")
-__xlated("10: exit")
+__xlated("7: *(u64 *)(r10 -8) = r1")
+__xlated("8: exit")
 __success
-__naked void may_goto_interaction(void)
+__naked void may_goto_interaction_arm64(void)
 {
 	asm volatile (
 	"r1 = 1;"

tools/testing/selftests/bpf/progs/verifier_may_goto_1.c

Lines changed: 32 additions & 2 deletions
@@ -69,8 +69,38 @@ __naked void may_goto_batch_1(void)
 }
 
 SEC("raw_tp")
-__description("may_goto batch with offsets 2/0")
+__description("may_goto batch with offsets 2/0 - x86_64")
 __arch_x86_64
+__xlated("0: *(u64 *)(r10 -16) = 65535")
+__xlated("1: *(u64 *)(r10 -8) = 0")
+__xlated("2: r11 = *(u64 *)(r10 -16)")
+__xlated("3: if r11 == 0x0 goto pc+6")
+__xlated("4: r11 -= 1")
+__xlated("5: if r11 != 0x0 goto pc+2")
+__xlated("6: r11 = -16")
+__xlated("7: call unknown")
+__xlated("8: *(u64 *)(r10 -16) = r11")
+__xlated("9: r0 = 1")
+__xlated("10: r0 = 2")
+__xlated("11: exit")
+__success
+__naked void may_goto_batch_2_x86_64(void)
+{
+	asm volatile (
+	".8byte %[may_goto1];"
+	".8byte %[may_goto3];"
+	"r0 = 1;"
+	"r0 = 2;"
+	"exit;"
+	:
+	: __imm_insn(may_goto1, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 2 /* offset */, 0)),
+	  __imm_insn(may_goto3, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0 /* offset */, 0))
+	: __clobber_all);
+}
+
+SEC("raw_tp")
+__description("may_goto batch with offsets 2/0 - arm64")
+__arch_arm64
 __xlated("0: *(u64 *)(r10 -8) = 8388608")
 __xlated("1: r11 = *(u64 *)(r10 -8)")
 __xlated("2: if r11 == 0x0 goto pc+3")
@@ -80,7 +110,7 @@ __xlated("5: r0 = 1")
 __xlated("6: r0 = 2")
 __xlated("7: exit")
 __success
-__naked void may_goto_batch_2(void)
+__naked void may_goto_batch_2_arm64(void)
 {
 	asm volatile (
 	".8byte %[may_goto1];"
