
Commit b650bf0

edumazet authored and kuba-moo committed
udp: remove busylock and add per NUMA queues
busylock was protecting UDP sockets against packet floods,
but unfortunately was not protecting the host itself.

Under stress, many cpus could spin while acquiring the busylock,
and the NIC had to drop packets. Or packets would be dropped
in the cpu backlog if RPS/RFS were in place.

This patch replaces the busylock with intermediate
lockless queues (one queue per NUMA node).

This means that fewer cpus have to acquire
the UDP receive queue lock.

Most of the cpus can either:

- immediately drop the packet.
- or queue it in their NUMA aware lockless queue.

Then one of the cpus is chosen to process this lockless queue in a batch.

The batch only contains packets that were cooked on the same NUMA node,
thus with very limited latency impact.

Tested:

DDOS targeting a victim UDP socket, on a platform with 6 NUMA nodes
(Intel(R) Xeon(R) 6985P-C)

Before:

nstat -n ; sleep 1 ; nstat | grep Udp
Udp6InDatagrams        1004179       0.0
Udp6InErrors           3117          0.0
Udp6RcvbufErrors       3117          0.0

After:

nstat -n ; sleep 1 ; nstat | grep Udp
Udp6InDatagrams        1116633       0.0
Udp6InErrors           14197275      0.0
Udp6RcvbufErrors       14197275      0.0

We can see this host can now process 14.2 M more packets per second
while under attack, and the victim socket can receive 11 % more packets.

I used a small bpftrace program measuring time (in us) spent in
__udp_enqueue_schedule_skb().

Before:

@udp_enqueue_us[398]:
[0]             24901 |@@@                                                 |
[1]             63512 |@@@@@@@@@                                           |
[2, 4)         344827 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4, 8)         244673 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                |
[8, 16)         54022 |@@@@@@@@                                            |
[16, 32)       222134 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                   |
[32, 64)       232042 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                  |
[64, 128)        4219 |                                                    |
[128, 256)        188 |                                                    |

After:

@udp_enqueue_us[398]:
[0]           5608855 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1]           1111277 |@@@@@@@@@@                                          |
[2, 4)         501439 |@@@@                                                |
[4, 8)         102921 |                                                    |
[8, 16)         29895 |                                                    |
[16, 32)        43500 |                                                    |
[32, 64)        31552 |                                                    |
[64, 128)         979 |                                                    |
[128, 256)         13 |                                                    |

Note that the remaining bottleneck for this platform is in udp_drops_inc()
because we limited struct numa_drop_counters to only two nodes so far.

Signed-off-by: Eric Dumazet <[email protected]>
Acked-by: Paolo Abeni <[email protected]>
Reviewed-by: Willem de Bruijn <[email protected]>
Reviewed-by: Kuniyuki Iwashima <[email protected]>
Link: https://patch.msgid.link/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
1 parent df15267 commit b650bf0
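
The mechanism the commit message describes rests on one llist contract: llist_add() reports whether the list was empty, so most producers return right after a single lockless push, and only the producer that found the per-node queue empty goes on to take the receive-queue lock and drain the whole batch. Below is a minimal userspace C sketch of that pattern, with C11 atomics standing in for the kernel's llist_add(), llist_del_all() and llist_reverse_order(); prod_queue, pq_push() and drain_batch() are illustrative names, not taken from the patch.

/*
 * Minimal sketch (not from the patch): the "first producer drains the
 * batch" pattern, with C11 atomics standing in for the kernel llist API.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
        struct node *next;
        int payload;
};

struct prod_queue {
        _Atomic(struct node *) head;    /* lockless LIFO, like llist_head */
};

/* Push one node; return true if the list was empty, meaning this producer
 * is now responsible for draining the whole batch (llist_add() contract).
 */
static bool pq_push(struct prod_queue *q, struct node *n)
{
        struct node *old = atomic_load_explicit(&q->head, memory_order_relaxed);

        do {
                n->next = old;
        } while (!atomic_compare_exchange_weak_explicit(&q->head, &old, n,
                                                        memory_order_release,
                                                        memory_order_relaxed));
        return old == NULL;
}

/* Grab the whole batch at once (cf. llist_del_all()), restore arrival order
 * by reversing the LIFO chain (cf. llist_reverse_order()), then process it.
 */
static void drain_batch(struct prod_queue *q)
{
        struct node *batch = atomic_exchange_explicit(&q->head,
                                                      (struct node *)NULL,
                                                      memory_order_acquire);
        struct node *rev = NULL;

        while (batch) {
                struct node *next = batch->next;

                batch->next = rev;
                rev = batch;
                batch = next;
        }
        for (; rev; rev = rev->next)
                printf("processing payload %d\n", rev->payload);
}

int main(void)
{
        struct prod_queue q = { .head = NULL };
        struct node a = { .payload = 1 }, b = { .payload = 2 };
        bool first;

        first = pq_push(&q, &a);        /* true: the queue was empty */
        pq_push(&q, &b);                /* false: someone else will drain */
        if (first)
                drain_batch(&q);        /* prints both payloads, in order */
        return 0;
}

This is the property that lets most CPUs in the new enqueue path pay only one atomic operation per packet, while at most one producer per NUMA node is in line for the receive-queue spinlock at any time.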

File tree

4 files changed, +91 -51 lines changed

include/linux/udp.h

Lines changed: 8 additions & 1 deletion

@@ -44,6 +44,12 @@ enum {
 	UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */
 };
 
+/* per NUMA structure for lockless producer usage. */
+struct udp_prod_queue {
+	struct llist_head ll_root ____cacheline_aligned_in_smp;
+	atomic_t rmem_alloc;
+};
+
 struct udp_sock {
 	/* inet_sock has to be the first member */
 	struct inet_sock inet;
@@ -90,6 +96,8 @@ struct udp_sock {
 					struct sk_buff *skb,
 					int nhoff);
 
+	struct udp_prod_queue *udp_prod_queue;
+
 	/* udp_recvmsg try to use this before splicing sk_receive_queue */
 	struct sk_buff_head reader_queue ____cacheline_aligned_in_smp;
 
@@ -109,7 +117,6 @@ struct udp_sock {
 	 */
 	struct hlist_node tunnel_list;
 	struct numa_drop_counters drop_counters;
-	spinlock_t busylock ____cacheline_aligned_in_smp;
 };
 
 #define udp_test_bit(nr, sk) \

include/net/udp.h

Lines changed: 9 additions & 2 deletions

@@ -284,16 +284,23 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *));
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 				  netdev_features_t features, bool is_ipv6);
 
-static inline void udp_lib_init_sock(struct sock *sk)
+static inline int udp_lib_init_sock(struct sock *sk)
 {
 	struct udp_sock *up = udp_sk(sk);
 
 	sk->sk_drop_counters = &up->drop_counters;
-	spin_lock_init(&up->busylock);
 	skb_queue_head_init(&up->reader_queue);
 	INIT_HLIST_NODE(&up->tunnel_list);
 	up->forward_threshold = sk->sk_rcvbuf >> 2;
 	set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
+
+	up->udp_prod_queue = kcalloc(nr_node_ids, sizeof(*up->udp_prod_queue),
+				     GFP_KERNEL);
+	if (!up->udp_prod_queue)
+		return -ENOMEM;
+	for (int i = 0; i < nr_node_ids; i++)
+		init_llist_head(&up->udp_prod_queue[i].ll_root);
+	return 0;
 }
 
 static inline void udp_drops_inc(struct sock *sk)
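
Because udp_lib_init_sock() now allocates one struct udp_prod_queue per possible NUMA node, it can fail and therefore starts returning int (the -ENOMEM path above). A minimal userspace sketch of that layout follows, under stated assumptions: CACHELINE, node_count and current_node() are stand-ins for ____cacheline_aligned_in_smp, nr_node_ids and numa_node_id(), and struct prod_slot merely mirrors the shape of struct udp_prod_queue.

/*
 * Sketch (not from the patch): one zeroed, cache-line-aligned producer slot
 * per NUMA node, allocated as a single array and indexed by the producer's
 * node, as udp_lib_init_sock() does with kcalloc()/init_llist_head().
 */
#include <stdalign.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdlib.h>

#define CACHELINE 64                    /* assumed cache line size */

struct prod_slot {
        alignas(CACHELINE) _Atomic(void *) head; /* per-node lockless list */
        atomic_int rmem_alloc;          /* bytes parked in this slot */
};                                      /* sizeof() pads to a CACHELINE multiple */

static unsigned int node_count = 2;     /* assumed 2-node machine (nr_node_ids) */

static unsigned int current_node(void)  /* stub for numa_node_id() */
{
        return 0;
}

static struct prod_slot *alloc_slots(void)
{
        size_t sz = node_count * sizeof(struct prod_slot);
        struct prod_slot *slots = aligned_alloc(CACHELINE, sz);

        if (!slots)
                return NULL;            /* caller reports -ENOMEM */
        for (unsigned int i = 0; i < node_count; i++) {
                atomic_init(&slots[i].head, NULL);
                atomic_init(&slots[i].rmem_alloc, 0);
        }
        return slots;
}

int main(void)
{
        struct prod_slot *slots = alloc_slots();

        if (!slots)
                return 1;
        /* A producer only ever touches its own node's slot. */
        atomic_fetch_add(&slots[current_node()].rmem_alloc, 1500);
        free(slots);
        return 0;
}

Keeping each slot on its own cache line means producers running on different nodes never write to the same line, which is what the ____cacheline_aligned_in_smp annotation on ll_root buys once the slots are packed into one array.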

net/ipv4/udp.c

Lines changed: 71 additions & 46 deletions

@@ -1685,25 +1685,6 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
 	udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
 }
 
-/* Idea of busylocks is to let producers grab an extra spinlock
- * to relieve pressure on the receive_queue spinlock shared by consumer.
- * Under flood, this means that only one producer can be in line
- * trying to acquire the receive_queue spinlock.
- */
-static spinlock_t *busylock_acquire(struct sock *sk)
-{
-	spinlock_t *busy = &udp_sk(sk)->busylock;
-
-	spin_lock(busy);
-	return busy;
-}
-
-static void busylock_release(spinlock_t *busy)
-{
-	if (busy)
-		spin_unlock(busy);
-}
-
 static int udp_rmem_schedule(struct sock *sk, int size)
 {
 	int delta;
@@ -1718,14 +1699,24 @@ static int udp_rmem_schedule(struct sock *sk, int size)
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct sk_buff_head *list = &sk->sk_receive_queue;
+	struct udp_prod_queue *udp_prod_queue;
+	struct sk_buff *next, *to_drop = NULL;
+	struct llist_node *ll_list;
 	unsigned int rmem, rcvbuf;
-	spinlock_t *busy = NULL;
 	int size, err = -ENOMEM;
+	int total_size = 0;
+	int q_size = 0;
+	int dropcount;
+	int nb = 0;
 
 	rmem = atomic_read(&sk->sk_rmem_alloc);
 	rcvbuf = READ_ONCE(sk->sk_rcvbuf);
 	size = skb->truesize;
 
+	udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()];
+
+	rmem += atomic_read(&udp_prod_queue->rmem_alloc);
+
 	/* Immediately drop when the receive queue is full.
 	 * Cast to unsigned int performs the boundary check for INT_MAX.
 	 */
@@ -1747,45 +1738,77 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	if (rmem > (rcvbuf >> 1)) {
 		skb_condense(skb);
 		size = skb->truesize;
-		rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
-		if (rmem > rcvbuf)
-			goto uncharge_drop;
-		busy = busylock_acquire(sk);
-	} else {
-		atomic_add(size, &sk->sk_rmem_alloc);
 	}
 
 	udp_set_dev_scratch(skb);
 
+	atomic_add(size, &udp_prod_queue->rmem_alloc);
+
+	if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root))
+		return 0;
+
+	dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0;
+
 	spin_lock(&list->lock);
-	err = udp_rmem_schedule(sk, size);
-	if (err) {
-		spin_unlock(&list->lock);
-		goto uncharge_drop;
-	}
 
-	sk_forward_alloc_add(sk, -size);
+	ll_list = llist_del_all(&udp_prod_queue->ll_root);
 
-	/* no need to setup a destructor, we will explicitly release the
-	 * forward allocated memory on dequeue
-	 */
-	sock_skb_set_dropcount(sk, skb);
+	ll_list = llist_reverse_order(ll_list);
+
+	llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+		size = udp_skb_truesize(skb);
+		total_size += size;
+		err = udp_rmem_schedule(sk, size);
+		if (unlikely(err)) {
+			/* Free the skbs outside of locked section. */
+			skb->next = to_drop;
+			to_drop = skb;
+			continue;
+		}
+
+		q_size += size;
+		sk_forward_alloc_add(sk, -size);
+
+		/* no need to setup a destructor, we will explicitly release the
+		 * forward allocated memory on dequeue
+		 */
+		SOCK_SKB_CB(skb)->dropcount = dropcount;
+		nb++;
+		__skb_queue_tail(list, skb);
+	}
+
+	atomic_add(q_size, &sk->sk_rmem_alloc);
 
-	__skb_queue_tail(list, skb);
 	spin_unlock(&list->lock);
 
-	if (!sock_flag(sk, SOCK_DEAD))
-		INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk);
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		/* Multiple threads might be blocked in recvmsg(),
+		 * using prepare_to_wait_exclusive().
+		 */
+		while (nb) {
+			INDIRECT_CALL_1(sk->sk_data_ready,
+					sock_def_readable, sk);
+			nb--;
+		}
+	}
+
+	if (unlikely(to_drop)) {
+		for (nb = 0; to_drop != NULL; nb++) {
+			skb = to_drop;
+			to_drop = skb->next;
+			skb_mark_not_on_list(skb);
+			/* TODO: update SNMP values. */
+			sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM);
+		}
+		numa_drop_add(&udp_sk(sk)->drop_counters, nb);
+	}
 
-	busylock_release(busy);
-	return 0;
+	atomic_sub(total_size, &udp_prod_queue->rmem_alloc);
 
-uncharge_drop:
-	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+	return 0;
 
 drop:
 	udp_drops_inc(sk);
-	busylock_release(busy);
 	return err;
 }
 EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb);
@@ -1803,6 +1826,7 @@ void udp_destruct_common(struct sock *sk)
 		kfree_skb(skb);
 	}
 	udp_rmem_release(sk, total, 0, true);
+	kfree(up->udp_prod_queue);
 }
 EXPORT_IPV6_MOD_GPL(udp_destruct_common);
 
@@ -1814,10 +1838,11 @@ static void udp_destruct_sock(struct sock *sk)
 
 int udp_init_sock(struct sock *sk)
 {
-	udp_lib_init_sock(sk);
+	int res = udp_lib_init_sock(sk);
+
 	sk->sk_destruct = udp_destruct_sock;
 	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
-	return 0;
+	return res;
 }
 
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
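
One detail of the rewritten __udp_enqueue_schedule_skb() above is easy to miss: skbs rejected by udp_rmem_schedule() are only chained onto the local to_drop list while the receive-queue lock is held, and the actual sk_skb_reason_drop() calls run after the unlock, keeping the critical section short. A small pthread-based sketch of the same discipline follows; item, charge_item() and queue_item() are illustrative names, not from the patch.

/*
 * Sketch (not from the patch): defer freeing of rejected entries until the
 * queue lock is released, as the new enqueue path does with its to_drop chain.
 */
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

struct item {
        struct item *next;
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for udp_rmem_schedule(): 0 on success, -ENOMEM when over budget. */
static int charge_item(struct item *it)
{
        (void)it;
        return rand() & 1 ? 0 : -ENOMEM;
}

/* Stand-in for __skb_queue_tail(): ownership passes to the reader side. */
static void queue_item(struct item *it)
{
        (void)it;
}

static void process_batch(struct item *batch)
{
        struct item *it, *next, *to_drop = NULL;

        pthread_mutex_lock(&queue_lock);
        for (it = batch; it; it = next) {
                next = it->next;
                if (charge_item(it)) {
                        /* Rejected: only chain it, never free under the lock. */
                        it->next = to_drop;
                        to_drop = it;
                        continue;
                }
                queue_item(it);
        }
        pthread_mutex_unlock(&queue_lock);

        /* Expensive cleanup runs outside the critical section. */
        for (it = to_drop; it; it = next) {
                next = it->next;
                free(it);
        }
}

int main(void)
{
        struct item *head = NULL;

        for (int i = 0; i < 8; i++) {   /* build a small batch of heap items */
                struct item *it = malloc(sizeof(*it));

                if (!it)
                        break;
                it->next = head;
                head = it;
        }
        process_batch(head);
        return 0;
}

Doing the frees outside the lock matters under flood, because the reader side contends on the very same receive-queue lock that the batch processing holds.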

net/ipv6/udp.c

Lines changed: 3 additions & 2 deletions

@@ -67,10 +67,11 @@ static void udpv6_destruct_sock(struct sock *sk)
 
 int udpv6_init_sock(struct sock *sk)
 {
-	udp_lib_init_sock(sk);
+	int res = udp_lib_init_sock(sk);
+
 	sk->sk_destruct = udpv6_destruct_sock;
 	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
-	return 0;
+	return res;
 }
 
 INDIRECT_CALLABLE_SCOPE
