Skip to content

Commit 8b57a37

Browse files
anakryikogregkh
authored andcommitted
bpf: stop setting precise in current state
[ Upstream commit f63181b ] Setting reg->precise to true in current state is not necessary from correctness standpoint, but it does pessimise the whole precision (or rather "imprecision", because that's what we want to keep as much as possible) tracking. Why is somewhat subtle and my best attempt to explain this is recorded in an extensive comment for __mark_chain_precise() function. Some more careful thinking and code reading is probably required still to grok this completely, unfortunately. Whiteboarding and a bunch of extra handwaiving in person would be even more helpful, but is deemed impractical in Git commit. Next patch pushes this imprecision property even further, building on top of the insights described in this patch. End results are pretty nice, we get reduction in number of total instructions and states verified due to a better states reuse, as some of the states are now more generic and permissive due to less unnecessary precise=true requirements. SELFTESTS RESULTS ================= $ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results.csv ~/imprecise-early-results.csv | grep -v '+0' File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) --------------------------------------- ---------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- bpf_iter_ksym.bpf.linked1.o dump_ksym 347 285 -62 (-17.87%) 20 19 -1 (-5.00%) pyperf600_bpf_loop.bpf.linked1.o on_event 3678 3736 +58 (+1.58%) 276 285 +9 (+3.26%) setget_sockopt.bpf.linked1.o skops_sockopt 4038 3947 -91 (-2.25%) 347 343 -4 (-1.15%) test_l4lb.bpf.linked1.o balancer_ingress 4559 2611 -1948 (-42.73%) 118 105 -13 (-11.02%) test_l4lb_noinline.bpf.linked1.o balancer_ingress 6279 6268 -11 (-0.18%) 237 236 -1 (-0.42%) test_misc_tcp_hdr_options.bpf.linked1.o misc_estab 1307 1303 -4 (-0.31%) 100 99 -1 (-1.00%) test_sk_lookup.bpf.linked1.o ctx_narrow_access 456 447 -9 (-1.97%) 39 38 -1 (-2.56%) test_sysctl_loop1.bpf.linked1.o sysctl_tcp_mem 1389 1384 -5 (-0.36%) 26 25 -1 (-3.85%) test_tc_dtime.bpf.linked1.o egress_fwdns_prio101 518 485 -33 (-6.37%) 51 46 -5 (-9.80%) test_tc_dtime.bpf.linked1.o egress_host 519 468 -51 (-9.83%) 50 44 -6 (-12.00%) test_tc_dtime.bpf.linked1.o ingress_fwdns_prio101 842 1000 +158 (+18.76%) 73 88 +15 (+20.55%) xdp_synproxy_kern.bpf.linked1.o syncookie_tc 405757 373173 -32584 (-8.03%) 25735 22882 -2853 (-11.09%) xdp_synproxy_kern.bpf.linked1.o syncookie_xdp 479055 371590 -107465 (-22.43%) 29145 22207 -6938 (-23.81%) --------------------------------------- ---------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- Slight regression in test_tc_dtime.bpf.linked1.o/ingress_fwdns_prio101 is left for a follow up, there might be some more precision-related bugs in existing BPF verifier logic. CILIUM RESULTS ============== $ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results-cilium.csv ~/imprecise-early-results-cilium.csv | grep -v '+0' File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) ------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- ------------------- bpf_host.o cil_from_host 762 556 -206 (-27.03%) 43 37 -6 (-13.95%) bpf_host.o tail_handle_nat_fwd_ipv4 23541 23426 -115 (-0.49%) 1538 1537 -1 (-0.07%) bpf_host.o tail_nodeport_nat_egress_ipv4 33592 33566 -26 (-0.08%) 2163 2161 -2 (-0.09%) bpf_lxc.o tail_handle_nat_fwd_ipv4 23541 23426 -115 (-0.49%) 1538 1537 -1 (-0.07%) bpf_overlay.o tail_nodeport_nat_egress_ipv4 33581 33543 -38 (-0.11%) 2160 2157 -3 (-0.14%) bpf_xdp.o tail_handle_nat_fwd_ipv4 21659 20920 -739 (-3.41%) 1440 1376 -64 (-4.44%) bpf_xdp.o tail_handle_nat_fwd_ipv6 17084 17039 -45 (-0.26%) 907 905 -2 (-0.22%) bpf_xdp.o tail_lb_ipv4 73442 73430 -12 (-0.02%) 4370 4369 -1 (-0.02%) bpf_xdp.o tail_lb_ipv6 152114 151895 -219 (-0.14%) 6493 6479 -14 (-0.22%) bpf_xdp.o tail_nodeport_nat_egress_ipv4 17377 17200 -177 (-1.02%) 1125 1111 -14 (-1.24%) bpf_xdp.o tail_nodeport_nat_ingress_ipv6 6405 6397 -8 (-0.12%) 309 308 -1 (-0.32%) bpf_xdp.o tail_rev_nodeport_lb4 7126 6934 -192 (-2.69%) 414 402 -12 (-2.90%) bpf_xdp.o tail_rev_nodeport_lb6 18059 17905 -154 (-0.85%) 1105 1096 -9 (-0.81%) ------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- ------------------- Signed-off-by: Andrii Nakryiko <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Alexei Starovoitov <[email protected]> Signed-off-by: Eduard Zingerman <[email protected]> Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent 56675dd commit 8b57a37

File tree

1 file changed

+91
-12
lines changed

1 file changed

+91
-12
lines changed

kernel/bpf/verifier.c

Lines changed: 91 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2788,8 +2788,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
27882788

27892789
/* big hammer: mark all scalars precise in this path.
27902790
* pop_stack may still get !precise scalars.
2791+
* We also skip current state and go straight to first parent state,
2792+
* because precision markings in current non-checkpointed state are
2793+
* not needed. See why in the comment in __mark_chain_precision below.
27912794
*/
2792-
for (; st; st = st->parent)
2795+
for (st = st->parent; st; st = st->parent) {
27932796
for (i = 0; i <= st->curframe; i++) {
27942797
func = st->frame[i];
27952798
for (j = 0; j < BPF_REG_FP; j++) {
@@ -2807,8 +2810,88 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
28072810
reg->precise = true;
28082811
}
28092812
}
2813+
}
28102814
}
28112815

2816+
/*
2817+
* __mark_chain_precision() backtracks BPF program instruction sequence and
2818+
* chain of verifier states making sure that register *regno* (if regno >= 0)
2819+
* and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
2820+
* SCALARS, as well as any other registers and slots that contribute to
2821+
* a tracked state of given registers/stack slots, depending on specific BPF
2822+
* assembly instructions (see backtrack_insns() for exact instruction handling
2823+
* logic). This backtracking relies on recorded jmp_history and is able to
2824+
* traverse entire chain of parent states. This process ends only when all the
2825+
* necessary registers/slots and their transitive dependencies are marked as
2826+
* precise.
2827+
*
2828+
* One important and subtle aspect is that precise marks *do not matter* in
2829+
* the currently verified state (current state). It is important to understand
2830+
* why this is the case.
2831+
*
2832+
* First, note that current state is the state that is not yet "checkpointed",
2833+
* i.e., it is not yet put into env->explored_states, and it has no children
2834+
* states as well. It's ephemeral, and can end up either a) being discarded if
2835+
* compatible explored state is found at some point or BPF_EXIT instruction is
2836+
* reached or b) checkpointed and put into env->explored_states, branching out
2837+
* into one or more children states.
2838+
*
2839+
* In the former case, precise markings in current state are completely
2840+
* ignored by state comparison code (see regsafe() for details). Only
2841+
* checkpointed ("old") state precise markings are important, and if old
2842+
* state's register/slot is precise, regsafe() assumes current state's
2843+
* register/slot as precise and checks value ranges exactly and precisely. If
2844+
* states turn out to be compatible, current state's necessary precise
2845+
* markings and any required parent states' precise markings are enforced
2846+
* after the fact with propagate_precision() logic, after the fact. But it's
2847+
* important to realize that in this case, even after marking current state
2848+
* registers/slots as precise, we immediately discard current state. So what
2849+
* actually matters is any of the precise markings propagated into current
2850+
* state's parent states, which are always checkpointed (due to b) case above).
2851+
* As such, for scenario a) it doesn't matter if current state has precise
2852+
* markings set or not.
2853+
*
2854+
* Now, for the scenario b), checkpointing and forking into child(ren)
2855+
* state(s). Note that before current state gets to checkpointing step, any
2856+
* processed instruction always assumes precise SCALAR register/slot
2857+
* knowledge: if precise value or range is useful to prune jump branch, BPF
2858+
* verifier takes this opportunity enthusiastically. Similarly, when
2859+
* register's value is used to calculate offset or memory address, exact
2860+
* knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
2861+
* what we mentioned above about state comparison ignoring precise markings
2862+
* during state comparison, BPF verifier ignores and also assumes precise
2863+
* markings *at will* during instruction verification process. But as verifier
2864+
* assumes precision, it also propagates any precision dependencies across
2865+
* parent states, which are not yet finalized, so can be further restricted
2866+
* based on new knowledge gained from restrictions enforced by their children
2867+
* states. This is so that once those parent states are finalized, i.e., when
2868+
* they have no more active children state, state comparison logic in
2869+
* is_state_visited() would enforce strict and precise SCALAR ranges, if
2870+
* required for correctness.
2871+
*
2872+
* To build a bit more intuition, note also that once a state is checkpointed,
2873+
* the path we took to get to that state is not important. This is crucial
2874+
* property for state pruning. When state is checkpointed and finalized at
2875+
* some instruction index, it can be correctly and safely used to "short
2876+
* circuit" any *compatible* state that reaches exactly the same instruction
2877+
* index. I.e., if we jumped to that instruction from a completely different
2878+
* code path than original finalized state was derived from, it doesn't
2879+
* matter, current state can be discarded because from that instruction
2880+
* forward having a compatible state will ensure we will safely reach the
2881+
* exit. States describe preconditions for further exploration, but completely
2882+
* forget the history of how we got here.
2883+
*
2884+
* This also means that even if we needed precise SCALAR range to get to
2885+
* finalized state, but from that point forward *that same* SCALAR register is
2886+
* never used in a precise context (i.e., it's precise value is not needed for
2887+
* correctness), it's correct and safe to mark such register as "imprecise"
2888+
* (i.e., precise marking set to false). This is what we rely on when we do
2889+
* not set precise marking in current state. If no child state requires
2890+
* precision for any given SCALAR register, it's safe to dictate that it can
2891+
* be imprecise. If any child state does require this register to be precise,
2892+
* we'll mark it precise later retroactively during precise markings
2893+
* propagation from child state to parent states.
2894+
*/
28122895
static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
28132896
int spi)
28142897
{
@@ -2826,18 +2909,18 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
28262909
if (!env->bpf_capable)
28272910
return 0;
28282911

2912+
/* Do sanity checks against current state of register and/or stack
2913+
* slot, but don't set precise flag in current state, as precision
2914+
* tracking in the current state is unnecessary.
2915+
*/
28292916
func = st->frame[frame];
28302917
if (regno >= 0) {
28312918
reg = &func->regs[regno];
28322919
if (reg->type != SCALAR_VALUE) {
28332920
WARN_ONCE(1, "backtracing misuse");
28342921
return -EFAULT;
28352922
}
2836-
if (!reg->precise)
2837-
new_marks = true;
2838-
else
2839-
reg_mask = 0;
2840-
reg->precise = true;
2923+
new_marks = true;
28412924
}
28422925

28432926
while (spi >= 0) {
@@ -2850,11 +2933,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
28502933
stack_mask = 0;
28512934
break;
28522935
}
2853-
if (!reg->precise)
2854-
new_marks = true;
2855-
else
2856-
stack_mask = 0;
2857-
reg->precise = true;
2936+
new_marks = true;
28582937
break;
28592938
}
28602939

@@ -11668,7 +11747,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
1166811747
if (env->explore_alu_limits)
1166911748
return false;
1167011749
if (rcur->type == SCALAR_VALUE) {
11671-
if (!rold->precise && !rcur->precise)
11750+
if (!rold->precise)
1167211751
return true;
1167311752
/* new val must satisfy old val knowledge */
1167411753
return range_within(rold, rcur) &&

0 commit comments

Comments
 (0)