Skip to content

Commit 2ac24d6

Browse files
committed
Merge branch 'Support-PMTU-discovery-with-bridged-UDP-tunnels'
Stefano Brivio says:

====================
Support PMTU discovery with bridged UDP tunnels

Currently, PMTU discovery for UDP tunnels only works if packets are routed to the encapsulating interfaces, not bridged.

This results from the fact that we generally don't have valid routes to the senders we can use to relay ICMP and ICMPv6 errors, and makes PMTU discovery completely non-functional for VXLAN and GENEVE ports of both regular bridges and Open vSwitch instances.

If the sender is local, and packets are forwarded to the port by a regular bridge, all it takes is to generate a corresponding route exception on the encapsulating device. The bridge then finds the route exception carrying the PMTU value estimate as it forwards frames, and relays ICMP messages back to the socket of the local sender. Patch 1/6 fixes this case.

If the sender resides on another node, we actually need to reply to IP and IPv6 packets ourselves and send these ICMP or ICMPv6 errors back, using the same encapsulating device. Patch 2/6, based on an original idea by Florian Westphal, adds the needed functionality, while patches 3/6 and 4/6 add matching support for VXLAN and GENEVE.

Finally, 5/6 and 6/6 introduce selftests for all combinations of inner and outer IP versions, covering both VXLAN and GENEVE, with both regular bridges and Open vSwitch instances.

v2: Add helper to check for any bridge port, skip oif check for PMTU routes for bridge ports only, split IPv4 and IPv6 helpers and functions (all suggested by David Ahern)
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents cabf06e + 7b53682 commit 2ac24d6

File tree

9 files changed

+690
-29
lines changed

9 files changed

+690
-29
lines changed

drivers/net/bareudp.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,7 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
308308
return PTR_ERR(rt);
309309

310310
skb_tunnel_check_pmtu(skb, &rt->dst,
311-
BAREUDP_IPV4_HLEN + info->options_len);
311+
BAREUDP_IPV4_HLEN + info->options_len, false);
312312

313313
sport = udp_flow_src_port(bareudp->net, skb,
314314
bareudp->sport_min, USHRT_MAX,
@@ -369,7 +369,8 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
369369
if (IS_ERR(dst))
370370
return PTR_ERR(dst);
371371

372-
skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len);
372+
skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len,
373+
false);
373374

374375
sport = udp_flow_src_port(bareudp->net, skb,
375376
bareudp->sport_min, USHRT_MAX,

drivers/net/geneve.c

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -893,8 +893,31 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
893893
if (IS_ERR(rt))
894894
return PTR_ERR(rt);
895895

896-
skb_tunnel_check_pmtu(skb, &rt->dst,
897-
GENEVE_IPV4_HLEN + info->options_len);
896+
err = skb_tunnel_check_pmtu(skb, &rt->dst,
897+
GENEVE_IPV4_HLEN + info->options_len,
898+
netif_is_any_bridge_port(dev));
899+
if (err < 0) {
900+
dst_release(&rt->dst);
901+
return err;
902+
} else if (err) {
903+
struct ip_tunnel_info *info;
904+
905+
info = skb_tunnel_info(skb);
906+
if (info) {
907+
info->key.u.ipv4.dst = fl4.saddr;
908+
info->key.u.ipv4.src = fl4.daddr;
909+
}
910+
911+
if (!pskb_may_pull(skb, ETH_HLEN)) {
912+
dst_release(&rt->dst);
913+
return -EINVAL;
914+
}
915+
916+
skb->protocol = eth_type_trans(skb, geneve->dev);
917+
netif_rx(skb);
918+
dst_release(&rt->dst);
919+
return -EMSGSIZE;
920+
}
898921

899922
sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
900923
if (geneve->cfg.collect_md) {
@@ -955,7 +978,30 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
955978
if (IS_ERR(dst))
956979
return PTR_ERR(dst);
957980

958-
skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len);
981+
err = skb_tunnel_check_pmtu(skb, dst,
982+
GENEVE_IPV6_HLEN + info->options_len,
983+
netif_is_any_bridge_port(dev));
984+
if (err < 0) {
985+
dst_release(dst);
986+
return err;
987+
} else if (err) {
988+
struct ip_tunnel_info *info = skb_tunnel_info(skb);
989+
990+
if (info) {
991+
info->key.u.ipv6.dst = fl6.saddr;
992+
info->key.u.ipv6.src = fl6.daddr;
993+
}
994+
995+
if (!pskb_may_pull(skb, ETH_HLEN)) {
996+
dst_release(dst);
997+
return -EINVAL;
998+
}
999+
1000+
skb->protocol = eth_type_trans(skb, geneve->dev);
1001+
netif_rx(skb);
1002+
dst_release(dst);
1003+
return -EMSGSIZE;
1004+
}
9591005

9601006
sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
9611007
if (geneve->cfg.collect_md) {
@@ -1012,7 +1058,8 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
10121058
if (likely(!err))
10131059
return NETDEV_TX_OK;
10141060

1015-
dev_kfree_skb(skb);
1061+
if (err != -EMSGSIZE)
1062+
dev_kfree_skb(skb);
10161063

10171064
if (err == -ELOOP)
10181065
dev->stats.collisions++;

drivers/net/vxlan.c

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2500,7 +2500,8 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
25002500

25012501
/* Bypass encapsulation if the destination is local */
25022502
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
2503-
struct vxlan_dev *dst_vxlan, __be32 vni)
2503+
struct vxlan_dev *dst_vxlan, __be32 vni,
2504+
bool snoop)
25042505
{
25052506
struct pcpu_sw_netstats *tx_stats, *rx_stats;
25062507
union vxlan_addr loopback;
@@ -2532,7 +2533,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
25322533
goto drop;
25332534
}
25342535

2535-
if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
2536+
if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop)
25362537
vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);
25372538

25382539
u64_stats_update_begin(&tx_stats->syncp);
@@ -2581,7 +2582,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
25812582

25822583
return -ENOENT;
25832584
}
2584-
vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni);
2585+
vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true);
25852586
return 1;
25862587
}
25872588

@@ -2617,7 +2618,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
26172618
if (vxlan_addr_any(dst)) {
26182619
if (did_rsc) {
26192620
/* short-circuited back to local bridge */
2620-
vxlan_encap_bypass(skb, vxlan, vxlan, default_vni);
2621+
vxlan_encap_bypass(skb, vxlan, vxlan,
2622+
default_vni, true);
26212623
return;
26222624
}
26232625
goto drop;
@@ -2720,7 +2722,23 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
27202722
}
27212723

27222724
ndst = &rt->dst;
2723-
skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
2725+
err = skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM,
2726+
netif_is_any_bridge_port(dev));
2727+
if (err < 0) {
2728+
goto tx_error;
2729+
} else if (err) {
2730+
if (info) {
2731+
struct in_addr src, dst;
2732+
2733+
src = remote_ip.sin.sin_addr;
2734+
dst = local_ip.sin.sin_addr;
2735+
info->key.u.ipv4.src = src.s_addr;
2736+
info->key.u.ipv4.dst = dst.s_addr;
2737+
}
2738+
vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
2739+
dst_release(ndst);
2740+
goto out_unlock;
2741+
}
27242742

27252743
tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
27262744
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
@@ -2760,7 +2778,24 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
27602778
goto out_unlock;
27612779
}
27622780

2763-
skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM);
2781+
err = skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM,
2782+
netif_is_any_bridge_port(dev));
2783+
if (err < 0) {
2784+
goto tx_error;
2785+
} else if (err) {
2786+
if (info) {
2787+
struct in6_addr src, dst;
2788+
2789+
src = remote_ip.sin6.sin6_addr;
2790+
dst = local_ip.sin6.sin6_addr;
2791+
info->key.u.ipv6.src = src;
2792+
info->key.u.ipv6.dst = dst;
2793+
}
2794+
2795+
vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
2796+
dst_release(ndst);
2797+
goto out_unlock;
2798+
}
27642799

27652800
tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
27662801
ttl = ttl ? : ip6_dst_hoplimit(ndst);

include/linux/netdevice.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4840,6 +4840,11 @@ static inline bool netif_is_ovs_port(const struct net_device *dev)
48404840
return dev->priv_flags & IFF_OVS_DATAPATH;
48414841
}
48424842

4843+
static inline bool netif_is_any_bridge_port(const struct net_device *dev)
4844+
{
4845+
return netif_is_bridge_port(dev) || netif_is_ovs_port(dev);
4846+
}
4847+
48434848
static inline bool netif_is_team_master(const struct net_device *dev)
48444849
{
48454850
return dev->priv_flags & IFF_TEAM;

include/net/dst.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -535,14 +535,4 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
535535
dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
536536
}
537537

538-
static inline void skb_tunnel_check_pmtu(struct sk_buff *skb,
539-
struct dst_entry *encap_dst,
540-
int headroom)
541-
{
542-
u32 encap_mtu = dst_mtu(encap_dst);
543-
544-
if (skb->len > encap_mtu - headroom)
545-
skb_dst_update_pmtu_no_confirm(skb, encap_mtu - headroom);
546-
}
547-
548538
#endif /* _NET_DST_H */

include/net/ip_tunnels.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
420420
u8 tos, u8 ttl, __be16 df, bool xnet);
421421
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
422422
gfp_t flags);
423+
int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
424+
int headroom, bool reply);
423425

424426
int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask);
425427

0 commit comments

Comments
 (0)