Skip to content

Commit 403f3e8

Browse files
author
Alexei Starovoitov
committed
Merge branch 'add-bpf_xdp_get_xfrm_state-kfunc'
Daniel Xu says: ==================== Add bpf_xdp_get_xfrm_state() kfunc This patchset adds two kfunc helpers, bpf_xdp_get_xfrm_state() and bpf_xdp_xfrm_state_release() that wrap xfrm_state_lookup() and xfrm_state_put(). The intent is to support software RSS (via XDP) for the ongoing/upcoming ipsec pcpu work [0]. Recent experiments performed on (hopefully) reproducible AWS testbeds indicate that single tunnel pcpu ipsec can reach line rate on 100G ENA nics. Note this patchset only tests/shows generic xfrm_state access. The "secret sauce" (if you can really even call it that) involves accessing a soon-to-be-upstreamed pcpu_num field in xfrm_state. Early example is available here [1]. [0]: https://datatracker.ietf.org/doc/draft-ietf-ipsecme-multi-sa-performance/03/ [1]: https://github.com/danobi/xdp-tools/blob/e89a1c617aba3b50d990f779357d6ce2863ecb27/xdp-bench/xdp_redirect_cpumap.bpf.c#L385-L406 Changes from v5: * Improve kfunc doc comments * Remove extraneous replay-window setting on selftest reverse path * Squash two kfunc commits into one * Rebase to bpf-next to pick up bitfield write patches * Remove testing of opts.error in selftest prog Changes from v4: * Fixup commit message for selftest * Set opts->error -ENOENT for !x * Revert single file xfrm + bpf Changes from v3: * Place all xfrm bpf integrations in xfrm_bpf.c * Avoid using nval as a temporary * Rebase to bpf-next * Remove extraneous __failure_unpriv annotation for verifier tests Changes from v2: * Fix/simplify BPF_CORE_WRITE_BITFIELD() algorithm * Added verifier tests for bitfield writes * Fix state leakage across test_tunnel subtests Changes from v1: * Move xfrm tunnel tests to test_progs * Fix writing to opts->error when opts is invalid * Use __bpf_kfunc_start_defs() * Remove unused vxlanhdr definition * Add and use BPF_CORE_WRITE_BITFIELD() macro * Make series bisect clean Changes from RFCv2: * Rebased to ipsec-next * Fix netns leak Changes from RFCv1: * Add Antony's commit tags * Add KF_ACQUIRE and 
KF_RELEASE semantics ==================== Reviewed-by: Eyal Birger <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Alexei Starovoitov <[email protected]>
2 parents 56925f3 + 2cd07b0 commit 403f3e8

File tree

8 files changed

+384
-155
lines changed

8 files changed

+384
-155
lines changed

include/net/xfrm.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2190,4 +2190,13 @@ static inline int register_xfrm_interface_bpf(void)
21902190

21912191
#endif
21922192

2193+
/*
 * register_xfrm_state_bpf() is only a real function when BTF debug info is
 * configured. Otherwise it is an inline stub that reports success, so callers
 * can invoke it unconditionally.
 */
#if !IS_ENABLED(CONFIG_DEBUG_INFO_BTF)
static inline int register_xfrm_state_bpf(void)
{
	return 0;
}
#else
int register_xfrm_state_bpf(void);
#endif
2201+
21932202
#endif /* _NET_XFRM_H */

net/xfrm/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o
2121
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
2222
obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
2323
obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
24+
obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o

net/xfrm/xfrm_policy.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4218,6 +4218,8 @@ void __init xfrm_init(void)
42184218
#ifdef CONFIG_XFRM_ESPINTCP
42194219
espintcp_init();
42204220
#endif
4221+
4222+
register_xfrm_state_bpf();
42214223
}
42224224

42234225
#ifdef CONFIG_AUDITSYSCALL

net/xfrm/xfrm_state_bpf.c

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
2+
/* Unstable XFRM state BPF helpers.
3+
*
4+
* Note that it is allowed to break compatibility for these functions since the
5+
* interface they are exposed through to BPF programs is explicitly unstable.
6+
*/
7+
8+
#include <linux/bpf.h>
9+
#include <linux/btf.h>
10+
#include <linux/btf_ids.h>
11+
#include <net/xdp.h>
12+
#include <net/xfrm.h>
13+
14+
/* bpf_xfrm_state_opts - Options for XFRM state lookup helpers
15+
*
16+
* Members:
17+
* @error - Out parameter, set for any errors encountered
18+
* Values:
19+
* -EINVAL - netns_id is less than -1
20+
* -EINVAL - opts__sz isn't BPF_XFRM_STATE_OPTS_SZ
21+
* -ENONET - No network namespace found for netns_id
22+
* -ENOENT - No xfrm_state found
23+
* @netns_id - Specify the network namespace for lookup
24+
* Values:
25+
* BPF_F_CURRENT_NETNS (-1)
26+
* Use namespace associated with ctx
27+
* [0, S32_MAX]
28+
* Network Namespace ID
29+
* @mark - XFRM mark to match on
30+
* @daddr - Destination address to match on
31+
* @spi - Security parameter index to match on
32+
* @proto - IP protocol to match on (eg. IPPROTO_ESP)
33+
* @family - Protocol family to match on (AF_INET/AF_INET6)
34+
*/
35+
struct bpf_xfrm_state_opts {
36+
s32 error;
37+
s32 netns_id;
38+
u32 mark;
39+
xfrm_address_t daddr;
40+
__be32 spi;
41+
u8 proto;
42+
u16 family;
43+
};
44+
45+
enum {
46+
BPF_XFRM_STATE_OPTS_SZ = sizeof(struct bpf_xfrm_state_opts),
47+
};
48+
49+
__bpf_kfunc_start_defs();
50+
51+
/* bpf_xdp_get_xfrm_state - Get XFRM state
52+
*
53+
* A `struct xfrm_state *`, if found, must be released with a corresponding
54+
* bpf_xdp_xfrm_state_release.
55+
*
56+
* Parameters:
57+
* @ctx - Pointer to ctx (xdp_md) in XDP program
58+
* Cannot be NULL
59+
* @opts - Options for lookup (documented above)
60+
* Cannot be NULL
61+
* @opts__sz - Length of the bpf_xfrm_state_opts structure
62+
* Must be BPF_XFRM_STATE_OPTS_SZ
63+
*/
64+
__bpf_kfunc struct xfrm_state *
65+
bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32 opts__sz)
66+
{
67+
struct xdp_buff *xdp = (struct xdp_buff *)ctx;
68+
struct net *net = dev_net(xdp->rxq->dev);
69+
struct xfrm_state *x;
70+
71+
if (!opts || opts__sz < sizeof(opts->error))
72+
return NULL;
73+
74+
if (opts__sz != BPF_XFRM_STATE_OPTS_SZ) {
75+
opts->error = -EINVAL;
76+
return NULL;
77+
}
78+
79+
if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) {
80+
opts->error = -EINVAL;
81+
return NULL;
82+
}
83+
84+
if (opts->netns_id >= 0) {
85+
net = get_net_ns_by_id(net, opts->netns_id);
86+
if (unlikely(!net)) {
87+
opts->error = -ENONET;
88+
return NULL;
89+
}
90+
}
91+
92+
x = xfrm_state_lookup(net, opts->mark, &opts->daddr, opts->spi,
93+
opts->proto, opts->family);
94+
95+
if (opts->netns_id >= 0)
96+
put_net(net);
97+
if (!x)
98+
opts->error = -ENOENT;
99+
100+
return x;
101+
}
102+
103+
/* bpf_xdp_xfrm_state_release - Release acquired xfrm_state object
104+
*
105+
* This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
106+
* the program if any references remain in the program in all of the explored
107+
* states.
108+
*
109+
* Parameters:
110+
* @x - Pointer to referenced xfrm_state object, obtained using
111+
* bpf_xdp_get_xfrm_state.
112+
*/
113+
__bpf_kfunc void bpf_xdp_xfrm_state_release(struct xfrm_state *x)
114+
{
115+
xfrm_state_put(x);
116+
}
117+
118+
__bpf_kfunc_end_defs();
119+
120+
BTF_SET8_START(xfrm_state_kfunc_set)
121+
BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE)
122+
BTF_ID_FLAGS(func, bpf_xdp_xfrm_state_release, KF_RELEASE)
123+
BTF_SET8_END(xfrm_state_kfunc_set)
124+
125+
static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = {
126+
.owner = THIS_MODULE,
127+
.set = &xfrm_state_kfunc_set,
128+
};
129+
130+
int __init register_xfrm_state_bpf(void)
131+
{
132+
return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
133+
&xfrm_state_xdp_kfunc_set);
134+
}

tools/testing/selftests/bpf/prog_tests/test_tunnel.c

Lines changed: 157 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
*/
5151

5252
#include <arpa/inet.h>
53+
#include <linux/if_link.h>
5354
#include <linux/if_tun.h>
5455
#include <linux/limits.h>
5556
#include <linux/sysctl.h>
@@ -92,6 +93,11 @@
9293
#define IPIP_TUNL_DEV0 "ipip00"
9394
#define IPIP_TUNL_DEV1 "ipip11"
9495

96+
#define XFRM_AUTH "0x1111111111111111111111111111111111111111"
97+
#define XFRM_ENC "0x22222222222222222222222222222222"
98+
#define XFRM_SPI_IN_TO_OUT 0x1
99+
#define XFRM_SPI_OUT_TO_IN 0x2
100+
95101
#define PING_ARGS "-i 0.01 -c 3 -w 10 -q"
96102

97103
static int config_device(void)
@@ -264,6 +270,92 @@ static void delete_ipip_tunnel(void)
264270
SYS_NOFAIL("ip fou del port 5555 2> /dev/null");
265271
}
266272

273+
static int add_xfrm_tunnel(void)
274+
{
275+
/* at_ns0 namespace
276+
* at_ns0 -> root
277+
*/
278+
SYS(fail,
279+
"ip netns exec at_ns0 "
280+
"ip xfrm state add src %s dst %s proto esp "
281+
"spi %d reqid 1 mode tunnel replay-window 42 "
282+
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
283+
IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
284+
SYS(fail,
285+
"ip netns exec at_ns0 "
286+
"ip xfrm policy add src %s/32 dst %s/32 dir out "
287+
"tmpl src %s dst %s proto esp reqid 1 "
288+
"mode tunnel",
289+
IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1, IP4_ADDR_VETH0, IP4_ADDR1_VETH1);
290+
291+
/* root -> at_ns0 */
292+
SYS(fail,
293+
"ip netns exec at_ns0 "
294+
"ip xfrm state add src %s dst %s proto esp "
295+
"spi %d reqid 2 mode tunnel "
296+
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
297+
IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
298+
SYS(fail,
299+
"ip netns exec at_ns0 "
300+
"ip xfrm policy add src %s/32 dst %s/32 dir in "
301+
"tmpl src %s dst %s proto esp reqid 2 "
302+
"mode tunnel",
303+
IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0, IP4_ADDR1_VETH1, IP4_ADDR_VETH0);
304+
305+
/* address & route */
306+
SYS(fail, "ip netns exec at_ns0 ip addr add dev veth0 %s/32",
307+
IP4_ADDR_TUNL_DEV0);
308+
SYS(fail, "ip netns exec at_ns0 ip route add %s dev veth0 via %s src %s",
309+
IP4_ADDR_TUNL_DEV1, IP4_ADDR1_VETH1, IP4_ADDR_TUNL_DEV0);
310+
311+
/* root namespace
312+
* at_ns0 -> root
313+
*/
314+
SYS(fail,
315+
"ip xfrm state add src %s dst %s proto esp "
316+
"spi %d reqid 1 mode tunnel replay-window 42 "
317+
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
318+
IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
319+
SYS(fail,
320+
"ip xfrm policy add src %s/32 dst %s/32 dir in "
321+
"tmpl src %s dst %s proto esp reqid 1 "
322+
"mode tunnel",
323+
IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1, IP4_ADDR_VETH0, IP4_ADDR1_VETH1);
324+
325+
/* root -> at_ns0 */
326+
SYS(fail,
327+
"ip xfrm state add src %s dst %s proto esp "
328+
"spi %d reqid 2 mode tunnel "
329+
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
330+
IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
331+
SYS(fail,
332+
"ip xfrm policy add src %s/32 dst %s/32 dir out "
333+
"tmpl src %s dst %s proto esp reqid 2 "
334+
"mode tunnel",
335+
IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0, IP4_ADDR1_VETH1, IP4_ADDR_VETH0);
336+
337+
/* address & route */
338+
SYS(fail, "ip addr add dev veth1 %s/32", IP4_ADDR_TUNL_DEV1);
339+
SYS(fail, "ip route add %s dev veth1 via %s src %s",
340+
IP4_ADDR_TUNL_DEV0, IP4_ADDR_VETH0, IP4_ADDR_TUNL_DEV1);
341+
342+
return 0;
343+
fail:
344+
return -1;
345+
}
346+
347+
static void delete_xfrm_tunnel(void)
348+
{
349+
SYS_NOFAIL("ip xfrm policy delete dir out src %s/32 dst %s/32 2> /dev/null",
350+
IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0);
351+
SYS_NOFAIL("ip xfrm policy delete dir in src %s/32 dst %s/32 2> /dev/null",
352+
IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1);
353+
SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d 2> /dev/null",
354+
IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT);
355+
SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d 2> /dev/null",
356+
IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN);
357+
}
358+
267359
static int test_ping(int family, const char *addr)
268360
{
269361
SYS(fail, "%s %s %s > /dev/null", ping_command(family), PING_ARGS, addr);
@@ -532,25 +624,85 @@ static void test_ipip_tunnel(enum ipip_encap encap)
532624
test_tunnel_kern__destroy(skel);
533625
}
534626

627+
static void test_xfrm_tunnel(void)
628+
{
629+
DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook,
630+
.attach_point = BPF_TC_INGRESS);
631+
LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
632+
struct test_tunnel_kern *skel = NULL;
633+
struct nstoken *nstoken;
634+
int xdp_prog_fd;
635+
int tc_prog_fd;
636+
int ifindex;
637+
int err;
638+
639+
err = add_xfrm_tunnel();
640+
if (!ASSERT_OK(err, "add_xfrm_tunnel"))
641+
return;
642+
643+
skel = test_tunnel_kern__open_and_load();
644+
if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load"))
645+
goto done;
646+
647+
ifindex = if_nametoindex("veth1");
648+
if (!ASSERT_NEQ(ifindex, 0, "veth1 ifindex"))
649+
goto done;
650+
651+
/* attach tc prog to tunnel dev */
652+
tc_hook.ifindex = ifindex;
653+
tc_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state);
654+
if (!ASSERT_GE(tc_prog_fd, 0, "bpf_program__fd"))
655+
goto done;
656+
if (attach_tc_prog(&tc_hook, tc_prog_fd, -1))
657+
goto done;
658+
659+
/* attach xdp prog to tunnel dev */
660+
xdp_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state_xdp);
661+
if (!ASSERT_GE(xdp_prog_fd, 0, "bpf_program__fd"))
662+
goto done;
663+
err = bpf_xdp_attach(ifindex, xdp_prog_fd, XDP_FLAGS_REPLACE, &opts);
664+
if (!ASSERT_OK(err, "bpf_xdp_attach"))
665+
goto done;
666+
667+
/* ping from at_ns0 namespace test */
668+
nstoken = open_netns("at_ns0");
669+
err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1);
670+
close_netns(nstoken);
671+
if (!ASSERT_OK(err, "test_ping"))
672+
goto done;
673+
674+
if (!ASSERT_EQ(skel->bss->xfrm_reqid, 1, "req_id"))
675+
goto done;
676+
if (!ASSERT_EQ(skel->bss->xfrm_spi, XFRM_SPI_IN_TO_OUT, "spi"))
677+
goto done;
678+
if (!ASSERT_EQ(skel->bss->xfrm_remote_ip, 0xac100164, "remote_ip"))
679+
goto done;
680+
if (!ASSERT_EQ(skel->bss->xfrm_replay_window, 42, "replay_window"))
681+
goto done;
682+
683+
done:
684+
delete_xfrm_tunnel();
685+
if (skel)
686+
test_tunnel_kern__destroy(skel);
687+
}
688+
535689
#define RUN_TEST(name, ...) \
536690
({ \
537691
if (test__start_subtest(#name)) { \
692+
config_device(); \
538693
test_ ## name(__VA_ARGS__); \
694+
cleanup(); \
539695
} \
540696
})
541697

542698
static void *test_tunnel_run_tests(void *arg)
543699
{
544-
cleanup();
545-
config_device();
546-
547700
RUN_TEST(vxlan_tunnel);
548701
RUN_TEST(ip6vxlan_tunnel);
549702
RUN_TEST(ipip_tunnel, NONE);
550703
RUN_TEST(ipip_tunnel, FOU);
551704
RUN_TEST(ipip_tunnel, GUE);
552-
553-
cleanup();
705+
RUN_TEST(xfrm_tunnel);
554706

555707
return NULL;
556708
}

tools/testing/selftests/bpf/progs/bpf_tracing_net.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#define IPV6_AUTOFLOWLABEL 70
2727

2828
#define TC_ACT_UNSPEC (-1)
29+
#define TC_ACT_OK 0
2930
#define TC_ACT_SHOT 2
3031

3132
#define SOL_TCP 6

0 commit comments

Comments
 (0)