Skip to content

Commit c56e67f

Browse files
committed
Florian Westphal says: ==================== netfilter patches for net First patch resolves a regression with vlan header matching, this was broken since 6.5 release. From myself. Second patch fixes an ancient problem with sctp connection tracking in case INIT_ACK packets are delayed. This comes with a selftest, both patches from Xin Long. Patch 4 extends the existing nftables audit selftest, from Phil Sutter. Patch 5, also from Phil, avoids a situation where nftables would emit an audit record twice. This was broken since 5.13 days. Patch 6, from myself, avoids spurious insertion failure if we encounter an overlapping but expired range during element insertion with the 'nft_set_rbtree' backend. This problem exists since 6.2. * tag 'nf-23-10-04' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf: netfilter: nf_tables: nft_set_rbtree: fix spurious insertion failure netfilter: nf_tables: Deduplicate nft_register_obj audit logs selftests: netfilter: Extend nft_audit.sh selftests: netfilter: test for sctp collision processing in nf_conntrack netfilter: handle the connecting collision properly in nf_conntrack_proto_sctp netfilter: nft_payload: rebuild vlan header on h_proto access ==================== Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
2 parents 513dbc1 + 0873882 commit c56e67f

File tree

9 files changed

+395
-62
lines changed

9 files changed

+395
-62
lines changed

include/linux/netfilter/nf_conntrack_sctp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ struct ip_ct_sctp {
99
enum sctp_conntrack state;
1010

1111
__be32 vtag[IP_CT_DIR_MAX];
12+
u8 init[IP_CT_DIR_MAX];
1213
u8 last_dir;
1314
u8 flags;
1415
};

net/netfilter/nf_conntrack_proto_sctp.c

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
112112
/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA},
113113
/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/
114114
/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */
115-
/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */
115+
/* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */
116116
/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL},
117117
/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
118118
/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
@@ -126,7 +126,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
126126
/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV},
127127
/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV},
128128
/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV},
129-
/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */
129+
/* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */
130130
/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV},
131131
/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV},
132132
/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
@@ -412,6 +412,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
412412
/* (D) vtag must be same as init_vtag as found in INIT_ACK */
413413
if (sh->vtag != ct->proto.sctp.vtag[dir])
414414
goto out_unlock;
415+
} else if (sch->type == SCTP_CID_COOKIE_ACK) {
416+
ct->proto.sctp.init[dir] = 0;
417+
ct->proto.sctp.init[!dir] = 0;
415418
} else if (sch->type == SCTP_CID_HEARTBEAT) {
416419
if (ct->proto.sctp.vtag[dir] == 0) {
417420
pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir);
@@ -461,16 +464,18 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
461464
}
462465

463466
/* If it is an INIT or an INIT ACK note down the vtag */
464-
if (sch->type == SCTP_CID_INIT ||
465-
sch->type == SCTP_CID_INIT_ACK) {
466-
struct sctp_inithdr _inithdr, *ih;
467+
if (sch->type == SCTP_CID_INIT) {
468+
struct sctp_inithdr _ih, *ih;
467469

468-
ih = skb_header_pointer(skb, offset + sizeof(_sch),
469-
sizeof(_inithdr), &_inithdr);
470-
if (ih == NULL)
470+
ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
471+
if (!ih)
471472
goto out_unlock;
472-
pr_debug("Setting vtag %x for dir %d\n",
473-
ih->init_tag, !dir);
473+
474+
if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir])
475+
ct->proto.sctp.init[!dir] = 0;
476+
ct->proto.sctp.init[dir] = 1;
477+
478+
pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
474479
ct->proto.sctp.vtag[!dir] = ih->init_tag;
475480

476481
/* don't renew timeout on init retransmit so
@@ -481,6 +486,24 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
481486
old_state == SCTP_CONNTRACK_CLOSED &&
482487
nf_ct_is_confirmed(ct))
483488
ignore = true;
489+
} else if (sch->type == SCTP_CID_INIT_ACK) {
490+
struct sctp_inithdr _ih, *ih;
491+
__be32 vtag;
492+
493+
ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
494+
if (!ih)
495+
goto out_unlock;
496+
497+
vtag = ct->proto.sctp.vtag[!dir];
498+
if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag)
499+
goto out_unlock;
500+
/* collision */
501+
if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] &&
502+
vtag != ih->init_tag)
503+
goto out_unlock;
504+
505+
pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
506+
ct->proto.sctp.vtag[!dir] = ih->init_tag;
484507
}
485508

486509
ct->proto.sctp.state = new_state;

net/netfilter/nf_tables_api.c

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7871,24 +7871,14 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
78717871
return nft_delobj(&ctx, obj);
78727872
}
78737873

7874-
void nft_obj_notify(struct net *net, const struct nft_table *table,
7875-
struct nft_object *obj, u32 portid, u32 seq, int event,
7876-
u16 flags, int family, int report, gfp_t gfp)
7874+
static void
7875+
__nft_obj_notify(struct net *net, const struct nft_table *table,
7876+
struct nft_object *obj, u32 portid, u32 seq, int event,
7877+
u16 flags, int family, int report, gfp_t gfp)
78777878
{
78787879
struct nftables_pernet *nft_net = nft_pernet(net);
78797880
struct sk_buff *skb;
78807881
int err;
7881-
char *buf = kasprintf(gfp, "%s:%u",
7882-
table->name, nft_net->base_seq);
7883-
7884-
audit_log_nfcfg(buf,
7885-
family,
7886-
obj->handle,
7887-
event == NFT_MSG_NEWOBJ ?
7888-
AUDIT_NFT_OP_OBJ_REGISTER :
7889-
AUDIT_NFT_OP_OBJ_UNREGISTER,
7890-
gfp);
7891-
kfree(buf);
78927882

78937883
if (!report &&
78947884
!nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
@@ -7911,13 +7901,35 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
79117901
err:
79127902
nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
79137903
}
7904+
7905+
void nft_obj_notify(struct net *net, const struct nft_table *table,
7906+
struct nft_object *obj, u32 portid, u32 seq, int event,
7907+
u16 flags, int family, int report, gfp_t gfp)
7908+
{
7909+
struct nftables_pernet *nft_net = nft_pernet(net);
7910+
char *buf = kasprintf(gfp, "%s:%u",
7911+
table->name, nft_net->base_seq);
7912+
7913+
audit_log_nfcfg(buf,
7914+
family,
7915+
obj->handle,
7916+
event == NFT_MSG_NEWOBJ ?
7917+
AUDIT_NFT_OP_OBJ_REGISTER :
7918+
AUDIT_NFT_OP_OBJ_UNREGISTER,
7919+
gfp);
7920+
kfree(buf);
7921+
7922+
__nft_obj_notify(net, table, obj, portid, seq, event,
7923+
flags, family, report, gfp);
7924+
}
79147925
EXPORT_SYMBOL_GPL(nft_obj_notify);
79157926

79167927
static void nf_tables_obj_notify(const struct nft_ctx *ctx,
79177928
struct nft_object *obj, int event)
79187929
{
7919-
nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event,
7920-
ctx->flags, ctx->family, ctx->report, GFP_KERNEL);
7930+
__nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid,
7931+
ctx->seq, event, ctx->flags, ctx->family,
7932+
ctx->report, GFP_KERNEL);
79217933
}
79227934

79237935
/*

net/netfilter/nft_payload.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,17 @@ int nft_payload_inner_offset(const struct nft_pktinfo *pkt)
154154
return pkt->inneroff;
155155
}
156156

157+
static bool nft_payload_need_vlan_copy(const struct nft_payload *priv)
158+
{
159+
unsigned int len = priv->offset + priv->len;
160+
161+
/* data past ether src/dst requested, copy needed */
162+
if (len > offsetof(struct ethhdr, h_proto))
163+
return true;
164+
165+
return false;
166+
}
167+
157168
void nft_payload_eval(const struct nft_expr *expr,
158169
struct nft_regs *regs,
159170
const struct nft_pktinfo *pkt)
@@ -172,7 +183,7 @@ void nft_payload_eval(const struct nft_expr *expr,
172183
goto err;
173184

174185
if (skb_vlan_tag_present(skb) &&
175-
priv->offset >= offsetof(struct ethhdr, h_proto)) {
186+
nft_payload_need_vlan_copy(priv)) {
176187
if (!nft_payload_copy_vlan(dest, skb,
177188
priv->offset, priv->len))
178189
goto err;

net/netfilter/nft_set_rbtree.c

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -233,10 +233,9 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
233233
rb_erase(&rbe->node, &priv->root);
234234
}
235235

236-
static int nft_rbtree_gc_elem(const struct nft_set *__set,
237-
struct nft_rbtree *priv,
238-
struct nft_rbtree_elem *rbe,
239-
u8 genmask)
236+
static const struct nft_rbtree_elem *
237+
nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
238+
struct nft_rbtree_elem *rbe, u8 genmask)
240239
{
241240
struct nft_set *set = (struct nft_set *)__set;
242241
struct rb_node *prev = rb_prev(&rbe->node);
@@ -246,7 +245,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
246245

247246
gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
248247
if (!gc)
249-
return -ENOMEM;
248+
return ERR_PTR(-ENOMEM);
250249

251250
/* search for end interval coming before this element.
252251
* end intervals don't carry a timeout extension, they
@@ -261,6 +260,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
261260
prev = rb_prev(prev);
262261
}
263262

263+
rbe_prev = NULL;
264264
if (prev) {
265265
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
266266
nft_rbtree_gc_remove(net, set, priv, rbe_prev);
@@ -272,21 +272,21 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
272272
*/
273273
gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
274274
if (WARN_ON_ONCE(!gc))
275-
return -ENOMEM;
275+
return ERR_PTR(-ENOMEM);
276276

277277
nft_trans_gc_elem_add(gc, rbe_prev);
278278
}
279279

280280
nft_rbtree_gc_remove(net, set, priv, rbe);
281281
gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
282282
if (WARN_ON_ONCE(!gc))
283-
return -ENOMEM;
283+
return ERR_PTR(-ENOMEM);
284284

285285
nft_trans_gc_elem_add(gc, rbe);
286286

287287
nft_trans_gc_queue_sync_done(gc);
288288

289-
return 0;
289+
return rbe_prev;
290290
}
291291

292292
static bool nft_rbtree_update_first(const struct nft_set *set,
@@ -314,7 +314,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
314314
struct nft_rbtree *priv = nft_set_priv(set);
315315
u8 cur_genmask = nft_genmask_cur(net);
316316
u8 genmask = nft_genmask_next(net);
317-
int d, err;
317+
int d;
318318

319319
/* Descend the tree to search for an existing element greater than the
320320
* key value to insert that is greater than the new element. This is the
@@ -363,9 +363,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
363363
*/
364364
if (nft_set_elem_expired(&rbe->ext) &&
365365
nft_set_elem_active(&rbe->ext, cur_genmask)) {
366-
err = nft_rbtree_gc_elem(set, priv, rbe, genmask);
367-
if (err < 0)
368-
return err;
366+
const struct nft_rbtree_elem *removed_end;
367+
368+
removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask);
369+
if (IS_ERR(removed_end))
370+
return PTR_ERR(removed_end);
371+
372+
if (removed_end == rbe_le || removed_end == rbe_ge)
373+
return -EAGAIN;
369374

370375
continue;
371376
}
@@ -486,11 +491,18 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
486491
struct nft_rbtree_elem *rbe = elem->priv;
487492
int err;
488493

489-
write_lock_bh(&priv->lock);
490-
write_seqcount_begin(&priv->count);
491-
err = __nft_rbtree_insert(net, set, rbe, ext);
492-
write_seqcount_end(&priv->count);
493-
write_unlock_bh(&priv->lock);
494+
do {
495+
if (fatal_signal_pending(current))
496+
return -EINTR;
497+
498+
cond_resched();
499+
500+
write_lock_bh(&priv->lock);
501+
write_seqcount_begin(&priv->count);
502+
err = __nft_rbtree_insert(net, set, rbe, ext);
503+
write_seqcount_end(&priv->count);
504+
write_unlock_bh(&priv->lock);
505+
} while (err == -EAGAIN);
494506

495507
return err;
496508
}

tools/testing/selftests/netfilter/Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,14 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
66
nft_concat_range.sh nft_conntrack_helper.sh \
77
nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
88
ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \
9-
conntrack_vrf.sh nft_synproxy.sh rpath.sh nft_audit.sh
9+
conntrack_vrf.sh nft_synproxy.sh rpath.sh nft_audit.sh \
10+
conntrack_sctp_collision.sh
1011

1112
HOSTPKG_CONFIG := pkg-config
1213

1314
CFLAGS += $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null)
1415
LDLIBS += $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl)
1516

16-
TEST_GEN_FILES = nf-queue connect_close audit_logread
17+
TEST_GEN_FILES = nf-queue connect_close audit_logread sctp_collision
1718

1819
include ../lib.mk
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#!/bin/bash
2+
# SPDX-License-Identifier: GPL-2.0
3+
#
4+
# Testing For SCTP COLLISION SCENARIO as Below:
5+
#
6+
# 14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359]
7+
# 14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187]
8+
# 14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359]
9+
# 14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO]
10+
# 14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK]
11+
# 14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970]
12+
#
13+
# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS
14+
15+
CLIENT_NS=$(mktemp -u client-XXXXXXXX)
16+
CLIENT_IP="198.51.200.1"
17+
CLIENT_PORT=1234
18+
19+
SERVER_NS=$(mktemp -u server-XXXXXXXX)
20+
SERVER_IP="198.51.100.1"
21+
SERVER_PORT=1234
22+
23+
ROUTER_NS=$(mktemp -u router-XXXXXXXX)
24+
CLIENT_GW="198.51.200.2"
25+
SERVER_GW="198.51.100.2"
26+
27+
# setup the topo
28+
setup() {
29+
ip net add $CLIENT_NS
30+
ip net add $SERVER_NS
31+
ip net add $ROUTER_NS
32+
ip -n $SERVER_NS link add link0 type veth peer name link1 netns $ROUTER_NS
33+
ip -n $CLIENT_NS link add link3 type veth peer name link2 netns $ROUTER_NS
34+
35+
ip -n $SERVER_NS link set link0 up
36+
ip -n $SERVER_NS addr add $SERVER_IP/24 dev link0
37+
ip -n $SERVER_NS route add $CLIENT_IP dev link0 via $SERVER_GW
38+
39+
ip -n $ROUTER_NS link set link1 up
40+
ip -n $ROUTER_NS link set link2 up
41+
ip -n $ROUTER_NS addr add $SERVER_GW/24 dev link1
42+
ip -n $ROUTER_NS addr add $CLIENT_GW/24 dev link2
43+
ip net exec $ROUTER_NS sysctl -wq net.ipv4.ip_forward=1
44+
45+
ip -n $CLIENT_NS link set link3 up
46+
ip -n $CLIENT_NS addr add $CLIENT_IP/24 dev link3
47+
ip -n $CLIENT_NS route add $SERVER_IP dev link3 via $CLIENT_GW
48+
49+
# simulate the delay on OVS upcall by setting up a delay for INIT_ACK with
50+
# tc on $SERVER_NS side
51+
tc -n $SERVER_NS qdisc add dev link0 root handle 1: htb
52+
tc -n $SERVER_NS class add dev link0 parent 1: classid 1:1 htb rate 100mbit
53+
tc -n $SERVER_NS filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \
54+
0xff match u8 2 0xff at 32 flowid 1:1
55+
tc -n $SERVER_NS qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms
56+
57+
# simulate the ctstate check on OVS nf_conntrack
58+
ip net exec $ROUTER_NS iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP
59+
ip net exec $ROUTER_NS iptables -A INPUT -p sctp -j DROP
60+
61+
# use a smaller number for assoc's max_retrans to reproduce the issue
62+
modprobe sctp
63+
ip net exec $CLIENT_NS sysctl -wq net.sctp.association_max_retrans=3
64+
}
65+
66+
cleanup() {
67+
ip net exec $CLIENT_NS pkill sctp_collision 2>&1 >/dev/null
68+
ip net exec $SERVER_NS pkill sctp_collision 2>&1 >/dev/null
69+
ip net del "$CLIENT_NS"
70+
ip net del "$SERVER_NS"
71+
ip net del "$ROUTER_NS"
72+
}
73+
74+
do_test() {
75+
ip net exec $SERVER_NS ./sctp_collision server \
76+
$SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT &
77+
ip net exec $CLIENT_NS ./sctp_collision client \
78+
$CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT
79+
}
80+
81+
# NOTE: one way to work around the issue is set a smaller hb_interval
82+
# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500
83+
84+
# run the test case
85+
trap cleanup EXIT
86+
setup && \
87+
echo "Test for SCTP Collision in nf_conntrack:" && \
88+
do_test && echo "PASS!"
89+
exit $?

0 commit comments

Comments
 (0)