Skip to content

Commit d8a3070

Browse files
committed
Merge branch 'dst-hint-multipath'
Sriram Yagnaraman says: ==================== Avoid TCP resets when using ECMP for load-balancing between multiple servers. All packets in the same flow (L3/L4 depending on multipath hash policy) should be directed to the same target, but after [0]/[1] we see stray packets directed towards other targets. This, for instance, causes RST to be sent on TCP connections. The first two patches solve the problem by ignoring route hints for destinations that are part of multipath group, by using new SKB flags for IPv4 and IPv6. The third patch is a selftest that tests the scenario. Thanks to Ido, for reviewing and suggesting a way forward in [2] and also suggesting how to write a selftest for this. v4->v5: - Fixed review comments from Ido v3->v4: - Remove single path test - Rebase to latest v2->v3: - Add NULL check for skb in fib6_select_path (Ido Schimmel) - Use fib_tests.sh for selftest instead of the forwarding suite (Ido Schimmel) v1->v2: - Update to commit messages describing the solution (Ido Schimmel) - Use perf stat to count fib table lookups in selftest (Ido Schimmel) ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents 2ea3528 + 8ae9efb commit d8a3070

File tree

7 files changed

+164
-3
lines changed

7 files changed

+164
-3
lines changed

include/linux/ipv6.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ struct inet6_skb_parm {
147147
#define IP6SKB_JUMBOGRAM 128
148148
#define IP6SKB_SEG6 256
149149
#define IP6SKB_FAKEJUMBO 512
150+
#define IP6SKB_MULTIPATH 1024
150151
};
151152

152153
#if defined(CONFIG_NET_L3_MASTER_DEV)

include/net/ip.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ struct inet_skb_parm {
5757
#define IPSKB_FRAG_PMTU BIT(6)
5858
#define IPSKB_L3SLAVE BIT(7)
5959
#define IPSKB_NOPOLICY BIT(8)
60+
#define IPSKB_MULTIPATH BIT(9)
6061

6162
u16 frag_max_size;
6263
};

net/ipv4/ip_input.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,8 @@ static void ip_sublist_rcv_finish(struct list_head *head)
584584
static struct sk_buff *ip_extract_route_hint(const struct net *net,
585585
struct sk_buff *skb, int rt_type)
586586
{
587-
if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST)
587+
if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST ||
588+
IPCB(skb)->flags & IPSKB_MULTIPATH)
588589
return NULL;
589590

590591
return skb;

net/ipv4/route.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2144,6 +2144,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
21442144
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
21452145

21462146
fib_select_multipath(res, h);
2147+
IPCB(skb)->flags |= IPSKB_MULTIPATH;
21472148
}
21482149
#endif
21492150

net/ipv6/ip6_input.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,8 @@ static bool ip6_can_use_hint(const struct sk_buff *skb,
9999
static struct sk_buff *ip6_extract_route_hint(const struct net *net,
100100
struct sk_buff *skb)
101101
{
102-
if (fib6_routes_require_src(net) || fib6_has_custom_rules(net))
102+
if (fib6_routes_require_src(net) || fib6_has_custom_rules(net) ||
103+
IP6CB(skb)->flags & IP6SKB_MULTIPATH)
103104
return NULL;
104105

105106
return skb;

net/ipv6/route.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,9 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
423423
if (match->nh && have_oif_match && res->nh)
424424
return;
425425

426+
if (skb)
427+
IP6CB(skb)->flags |= IP6SKB_MULTIPATH;
428+
426429
/* We might have already computed the hash for ICMPv6 errors. In such
427430
* case it will always be non-zero. Otherwise now is the time to do it.
428431
*/

tools/testing/selftests/net/fib_tests.sh

Lines changed: 154 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ ksft_skip=4
1212
TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \
1313
ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics \
1414
ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \
15-
ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test"
15+
ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \
16+
ipv4_mpath_list ipv6_mpath_list"
1617

1718
VERBOSE=0
1819
PAUSE_ON_FAIL=no
@@ -2352,6 +2353,156 @@ ipv4_bcast_neigh_test()
23522353
cleanup
23532354
}
23542355

2356+
mpath_dep_check()
2357+
{
2358+
if [ ! -x "$(command -v mausezahn)" ]; then
2359+
echo "mausezahn command not found. Skipping test"
2360+
return 1
2361+
fi
2362+
2363+
if [ ! -x "$(command -v jq)" ]; then
2364+
echo "jq command not found. Skipping test"
2365+
return 1
2366+
fi
2367+
2368+
if [ ! -x "$(command -v bc)" ]; then
2369+
echo "bc command not found. Skipping test"
2370+
return 1
2371+
fi
2372+
2373+
if [ ! -x "$(command -v perf)" ]; then
2374+
echo "perf command not found. Skipping test"
2375+
return 1
2376+
fi
2377+
2378+
perf list fib:* | grep -q fib_table_lookup
2379+
if [ $? -ne 0 ]; then
2380+
echo "IPv4 FIB tracepoint not found. Skipping test"
2381+
return 1
2382+
fi
2383+
2384+
perf list fib6:* | grep -q fib6_table_lookup
2385+
if [ $? -ne 0 ]; then
2386+
echo "IPv6 FIB tracepoint not found. Skipping test"
2387+
return 1
2388+
fi
2389+
2390+
return 0
2391+
}
2392+
2393+
link_stats_get()
2394+
{
2395+
local ns=$1; shift
2396+
local dev=$1; shift
2397+
local dir=$1; shift
2398+
local stat=$1; shift
2399+
2400+
ip -n $ns -j -s link show dev $dev \
2401+
| jq '.[]["stats64"]["'$dir'"]["'$stat'"]'
2402+
}
2403+
2404+
list_rcv_eval()
2405+
{
2406+
local file=$1; shift
2407+
local expected=$1; shift
2408+
2409+
local count=$(tail -n 1 $file | jq '.["counter-value"] | tonumber | floor')
2410+
local ratio=$(echo "scale=2; $count / $expected" | bc -l)
2411+
local res=$(echo "$ratio >= 0.95" | bc)
2412+
[[ $res -eq 1 ]]
2413+
log_test $? 0 "Multipath route hit ratio ($ratio)"
2414+
}
2415+
2416+
ipv4_mpath_list_test()
2417+
{
2418+
echo
2419+
echo "IPv4 multipath list receive tests"
2420+
2421+
mpath_dep_check || return 1
2422+
2423+
route_setup
2424+
2425+
set -e
2426+
run_cmd "ip netns exec ns1 ethtool -K veth1 tcp-segmentation-offload off"
2427+
2428+
run_cmd "ip netns exec ns2 bash -c \"echo 20000 > /sys/class/net/veth2/gro_flush_timeout\""
2429+
run_cmd "ip netns exec ns2 bash -c \"echo 1 > /sys/class/net/veth2/napi_defer_hard_irqs\""
2430+
run_cmd "ip netns exec ns2 ethtool -K veth2 generic-receive-offload on"
2431+
run_cmd "ip -n ns2 link add name nh1 up type dummy"
2432+
run_cmd "ip -n ns2 link add name nh2 up type dummy"
2433+
run_cmd "ip -n ns2 address add 172.16.201.1/24 dev nh1"
2434+
run_cmd "ip -n ns2 address add 172.16.202.1/24 dev nh2"
2435+
run_cmd "ip -n ns2 neigh add 172.16.201.2 lladdr 00:11:22:33:44:55 nud perm dev nh1"
2436+
run_cmd "ip -n ns2 neigh add 172.16.202.2 lladdr 00:aa:bb:cc:dd:ee nud perm dev nh2"
2437+
run_cmd "ip -n ns2 route add 203.0.113.0/24
2438+
nexthop via 172.16.201.2 nexthop via 172.16.202.2"
2439+
run_cmd "ip netns exec ns2 sysctl -qw net.ipv4.fib_multipath_hash_policy=1"
2440+
set +e
2441+
2442+
local dmac=$(ip -n ns2 -j link show dev veth2 | jq -r '.[]["address"]')
2443+
local tmp_file=$(mktemp)
2444+
local cmd="ip netns exec ns1 mausezahn veth1 -a own -b $dmac
2445+
-A 172.16.101.1 -B 203.0.113.1 -t udp 'sp=12345,dp=0-65535' -q"
2446+
2447+
# Packets forwarded in a list using a multipath route must not reuse a
2448+
# cached result so that a flow always hits the same nexthop. In other
2449+
# words, the FIB lookup tracepoint needs to be triggered for every
2450+
# packet.
2451+
local t0_rx_pkts=$(link_stats_get ns2 veth2 rx packets)
2452+
run_cmd "perf stat -e fib:fib_table_lookup --filter 'err == 0' -j -o $tmp_file -- $cmd"
2453+
local t1_rx_pkts=$(link_stats_get ns2 veth2 rx packets)
2454+
local diff=$(echo $t1_rx_pkts - $t0_rx_pkts | bc -l)
2455+
list_rcv_eval $tmp_file $diff
2456+
2457+
rm $tmp_file
2458+
route_cleanup
2459+
}
2460+
2461+
ipv6_mpath_list_test()
2462+
{
2463+
echo
2464+
echo "IPv6 multipath list receive tests"
2465+
2466+
mpath_dep_check || return 1
2467+
2468+
route_setup
2469+
2470+
set -e
2471+
run_cmd "ip netns exec ns1 ethtool -K veth1 tcp-segmentation-offload off"
2472+
2473+
run_cmd "ip netns exec ns2 bash -c \"echo 20000 > /sys/class/net/veth2/gro_flush_timeout\""
2474+
run_cmd "ip netns exec ns2 bash -c \"echo 1 > /sys/class/net/veth2/napi_defer_hard_irqs\""
2475+
run_cmd "ip netns exec ns2 ethtool -K veth2 generic-receive-offload on"
2476+
run_cmd "ip -n ns2 link add name nh1 up type dummy"
2477+
run_cmd "ip -n ns2 link add name nh2 up type dummy"
2478+
run_cmd "ip -n ns2 -6 address add 2001:db8:201::1/64 dev nh1"
2479+
run_cmd "ip -n ns2 -6 address add 2001:db8:202::1/64 dev nh2"
2480+
run_cmd "ip -n ns2 -6 neigh add 2001:db8:201::2 lladdr 00:11:22:33:44:55 nud perm dev nh1"
2481+
run_cmd "ip -n ns2 -6 neigh add 2001:db8:202::2 lladdr 00:aa:bb:cc:dd:ee nud perm dev nh2"
2482+
run_cmd "ip -n ns2 -6 route add 2001:db8:301::/64
2483+
nexthop via 2001:db8:201::2 nexthop via 2001:db8:202::2"
2484+
run_cmd "ip netns exec ns2 sysctl -qw net.ipv6.fib_multipath_hash_policy=1"
2485+
set +e
2486+
2487+
local dmac=$(ip -n ns2 -j link show dev veth2 | jq -r '.[]["address"]')
2488+
local tmp_file=$(mktemp)
2489+
local cmd="ip netns exec ns1 mausezahn -6 veth1 -a own -b $dmac
2490+
-A 2001:db8:101::1 -B 2001:db8:301::1 -t udp 'sp=12345,dp=0-65535' -q"
2491+
2492+
# Packets forwarded in a list using a multipath route must not reuse a
2493+
# cached result so that a flow always hits the same nexthop. In other
2494+
# words, the FIB lookup tracepoint needs to be triggered for every
2495+
# packet.
2496+
local t0_rx_pkts=$(link_stats_get ns2 veth2 rx packets)
2497+
run_cmd "perf stat -e fib6:fib6_table_lookup --filter 'err == 0' -j -o $tmp_file -- $cmd"
2498+
local t1_rx_pkts=$(link_stats_get ns2 veth2 rx packets)
2499+
local diff=$(echo $t1_rx_pkts - $t0_rx_pkts | bc -l)
2500+
list_rcv_eval $tmp_file $diff
2501+
2502+
rm $tmp_file
2503+
route_cleanup
2504+
}
2505+
23552506
################################################################################
23562507
# usage
23572508

@@ -2433,6 +2584,8 @@ do
24332584
ipv6_mangle) ipv6_mangle_test;;
24342585
ipv4_bcast_neigh) ipv4_bcast_neigh_test;;
24352586
fib6_gc_test|ipv6_gc) fib6_gc_test;;
2587+
ipv4_mpath_list) ipv4_mpath_list_test;;
2588+
ipv6_mpath_list) ipv6_mpath_list_test;;
24362589

24372590
help) echo "Test names: $TESTS"; exit 0;;
24382591
esac

0 commit comments

Comments
 (0)