
Commit c18b0f5

Author: Paolo Abeni (committed)

Merge branch 'net-lockless-skb_attempt_defer_free'

Eric Dumazet says:

====================
net: lockless skb_attempt_defer_free()

Platforms with many CPUs and a relatively slow interconnect show significant
spinlock contention in skb_attempt_defer_free().

This series refactors that infrastructure to be NUMA-aware and lockless.

Tested on various platforms, including AMD Zen 2/3/4 and Intel Granite Rapids,
showing significant cost reductions under network stress (more than 20 Mpps).
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Paolo Abeni <[email protected]>
2 parents 2c0592b + 5628f3f commit c18b0f5
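
For reference, the core pattern introduced by the series replaces a spinlock-protected per-CPU sk_buff list with one lock-free llist per (CPU, NUMA node) pair: producers on any CPU push with llist_add() and bump an atomic counter, while the owning CPU drains its own buckets with llist_del_all(). Below is a minimal sketch of that pattern; defer_slot, defer_push and defer_pop_all are illustrative names, not identifiers from the patch.

/* Sketch only: the per-(CPU, NUMA node) lock-free deferral pattern used by
 * this series, with simplified names. Not the exact kernel code.
 */
#include <linux/atomic.h>
#include <linux/cache.h>
#include <linux/llist.h>
#include <linux/percpu.h>
#include <linux/topology.h>

struct defer_slot {
	struct llist_head	list;	/* lock-free singly linked list */
	atomic_long_t		count;	/* approximate queue depth */
} ____cacheline_aligned_in_smp;

/* One array of nr_node_ids slots per CPU, allocated with __alloc_percpu(). */
static struct defer_slot __percpu *defer_slots;

/* Producer (any CPU): queue @n for @cpu, bucketed by the caller's NUMA node.
 * Returns the approximate depth so callers can enforce a limit.
 */
static long defer_push(struct llist_node *n, unsigned int cpu)
{
	struct defer_slot *slot;
	long depth;

	slot = per_cpu_ptr(defer_slots, cpu) + numa_node_id();
	depth = atomic_long_inc_return(&slot->count);
	llist_add(n, &slot->list);	/* lockless; any number of producers */
	return depth;
}

/* Consumer (the owning CPU): grab everything queued for @node in one shot. */
static struct llist_node *defer_pop_all(int node)
{
	struct defer_slot *slot = this_cpu_ptr(defer_slots) + node;

	if (llist_empty(&slot->list))
		return NULL;
	atomic_long_set(&slot->count, 0);
	return llist_del_all(&slot->list);
}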

File tree

5 files changed, 43 insertions(+), 37 deletions(-)

5 files changed

+43
-37
lines changed

include/linux/netdevice.h

Lines changed: 1 addition & 5 deletions
@@ -3536,11 +3536,7 @@ struct softnet_data {
 
 	struct numa_drop_counters drop_counters;
 
-	/* Another possibly contended cache line */
-	spinlock_t		defer_lock ____cacheline_aligned_in_smp;
-	int			defer_count;
-	int			defer_ipi_scheduled;
-	struct sk_buff		*defer_list;
+	int			defer_ipi_scheduled ____cacheline_aligned_in_smp;
 	call_single_data_t	defer_csd;
 };

include/net/hotdata.h

Lines changed: 7 additions & 0 deletions
@@ -2,10 +2,16 @@
 #ifndef _NET_HOTDATA_H
 #define _NET_HOTDATA_H
 
+#include <linux/llist.h>
 #include <linux/types.h>
 #include <linux/netdevice.h>
 #include <net/protocol.h>
 
+struct skb_defer_node {
+	struct llist_head	defer_list;
+	atomic_long_t		defer_count;
+} ____cacheline_aligned_in_smp;
+
 /* Read mostly data used in network fast paths. */
 struct net_hotdata {
 #if IS_ENABLED(CONFIG_INET)
@@ -30,6 +36,7 @@ struct net_hotdata {
 	struct rps_sock_flow_table __rcu *rps_sock_flow_table;
 	u32			rps_cpu_mask;
 #endif
+	struct skb_defer_node __percpu	*skb_defer_nodes;
 	int			gro_normal_batch;
 	int			netdev_budget;
 	int			netdev_budget_usecs;
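
The new skb_defer_nodes array is allocated per CPU with one skb_defer_node per NUMA node (see the net_dev_init() hunk in net/core/dev.c below), so the slot for a given (cpu, node) pair is reached by offsetting the per-CPU base pointer by the node id. A hedged sketch of that allocation and addressing follows; defer_nodes_alloc() and defer_slot_of() are illustrative helper names, not functions added by the patch.

/* Sketch only: allocation and addressing of the per-CPU, per-node slots.
 * Mirrors the net_dev_init() and skb_attempt_defer_free() hunks below.
 */
#include <linux/errno.h>
#include <linux/nodemask.h>
#include <linux/percpu.h>
#include <net/hotdata.h>

static int __init defer_nodes_alloc(void)
{
	/* nr_node_ids consecutive struct skb_defer_node slots per CPU */
	net_hotdata.skb_defer_nodes =
		__alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids,
			       __alignof__(struct skb_defer_node));
	return net_hotdata.skb_defer_nodes ? 0 : -ENOMEM;
}

static struct skb_defer_node *defer_slot_of(unsigned int cpu, int node)
{
	/* per-CPU base for @cpu, then index by NUMA node */
	return per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + node;
}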

net/core/dev.c

Lines changed: 23 additions & 18 deletions
@@ -5180,8 +5180,9 @@ static void napi_schedule_rps(struct softnet_data *sd)
 		__napi_schedule_irqoff(&mysd->backlog);
 }
 
-void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
+void kick_defer_list_purge(unsigned int cpu)
 {
+	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
 	unsigned long flags;
 
 	if (use_backlog_threads()) {
@@ -6715,24 +6716,24 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 }
 EXPORT_SYMBOL(napi_complete_done);
 
-static void skb_defer_free_flush(struct softnet_data *sd)
+static void skb_defer_free_flush(void)
 {
+	struct llist_node *free_list;
 	struct sk_buff *skb, *next;
+	struct skb_defer_node *sdn;
+	int node;
 
-	/* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
-	if (!READ_ONCE(sd->defer_list))
-		return;
+	for_each_node(node) {
+		sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node;
 
-	spin_lock(&sd->defer_lock);
-	skb = sd->defer_list;
-	sd->defer_list = NULL;
-	sd->defer_count = 0;
-	spin_unlock(&sd->defer_lock);
+		if (llist_empty(&sdn->defer_list))
+			continue;
+		atomic_long_set(&sdn->defer_count, 0);
+		free_list = llist_del_all(&sdn->defer_list);
 
-	while (skb != NULL) {
-		next = skb->next;
-		napi_consume_skb(skb, 1);
-		skb = next;
+		llist_for_each_entry_safe(skb, next, free_list, ll_node) {
+			napi_consume_skb(skb, 1);
+		}
 	}
 }
 
@@ -6860,7 +6861,7 @@ static void __napi_busy_loop(unsigned int napi_id,
 		if (work > 0)
 			__NET_ADD_STATS(dev_net(napi->dev),
 					LINUX_MIB_BUSYPOLLRXPACKETS, work);
-		skb_defer_free_flush(this_cpu_ptr(&softnet_data));
+		skb_defer_free_flush();
 		bpf_net_ctx_clear(bpf_net_ctx);
 		local_bh_enable();
 
@@ -7719,7 +7720,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
 			local_irq_disable();
 			net_rps_action_and_irq_enable(sd);
 		}
-		skb_defer_free_flush(sd);
+		skb_defer_free_flush();
 		bpf_net_ctx_clear(bpf_net_ctx);
 		local_bh_enable();
 
@@ -7761,7 +7762,7 @@ static __latent_entropy void net_rx_action(void)
 	for (;;) {
 		struct napi_struct *n;
 
-		skb_defer_free_flush(sd);
+		skb_defer_free_flush();
 
 		if (list_empty(&list)) {
 			if (list_empty(&repoll)) {
@@ -12995,7 +12996,6 @@ static int __init net_dev_init(void)
 		sd->cpu = i;
 #endif
 		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
-		spin_lock_init(&sd->defer_lock);
 
 		gro_init(&sd->backlog.gro);
 		sd->backlog.poll = process_backlog;
@@ -13005,6 +13005,11 @@ static int __init net_dev_init(void)
 		if (net_page_pool_create(i))
 			goto out;
 	}
+	net_hotdata.skb_defer_nodes =
+		__alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids,
+			       __alignof__(struct skb_defer_node));
+	if (!net_hotdata.skb_defer_nodes)
+		goto out;
 	if (use_backlog_threads())
 		smpboot_register_percpu_thread(&backlog_threads);
 

net/core/dev.h

Lines changed: 1 addition & 1 deletion
@@ -357,7 +357,7 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi)
 	WARN_ON(READ_ONCE(napi->list_owner) != -1);
 }
 
-void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
+void kick_defer_list_purge(unsigned int cpu);
 
 #define XMIT_RECURSION_LIMIT	8
 

net/core/skbuff.c

Lines changed: 11 additions & 13 deletions
@@ -7185,8 +7185,9 @@ static void kfree_skb_napi_cache(struct sk_buff *skb)
  */
 void skb_attempt_defer_free(struct sk_buff *skb)
 {
+	struct skb_defer_node *sdn;
+	unsigned long defer_count;
 	int cpu = skb->alloc_cpu;
-	struct softnet_data *sd;
 	unsigned int defer_max;
 	bool kick;
 
@@ -7200,27 +7201,24 @@ nodefer: kfree_skb_napi_cache(skb);
 	DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
 	DEBUG_NET_WARN_ON_ONCE(skb->destructor);
 
-	sd = &per_cpu(softnet_data, cpu);
+	sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id();
+
 	defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
-	if (READ_ONCE(sd->defer_count) >= defer_max)
+	defer_count = atomic_long_inc_return(&sdn->defer_count);
+
+	if (defer_count >= defer_max)
 		goto nodefer;
 
-	spin_lock_bh(&sd->defer_lock);
-	/* Send an IPI every time queue reaches half capacity. */
-	kick = sd->defer_count == (defer_max >> 1);
-	/* Paired with the READ_ONCE() few lines above */
-	WRITE_ONCE(sd->defer_count, sd->defer_count + 1);
+	llist_add(&skb->ll_node, &sdn->defer_list);
 
-	skb->next = sd->defer_list;
-	/* Paired with READ_ONCE() in skb_defer_free_flush() */
-	WRITE_ONCE(sd->defer_list, skb);
-	spin_unlock_bh(&sd->defer_lock);
+	/* Send an IPI every time queue reaches half capacity. */
+	kick = (defer_count - 1) == (defer_max >> 1);
 
 	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
 	 * if we are unlucky enough (this seems very unlikely).
 	 */
 	if (unlikely(kick))
-		kick_defer_list_purge(sd, cpu);
+		kick_defer_list_purge(cpu);
 }
 
 static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
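
One detail worth noting in the skb_attempt_defer_free() hunk above: atomic_long_inc_return() yields the post-increment value, so the pre-increment depth is defer_count - 1. Comparing that against defer_max >> 1 preserves the old locked behaviour of sending the purge IPI exactly once, when the queue first reaches half capacity. A small illustrative helper, assuming the same inputs (defer_should_kick is not a kernel function):

/* Illustration: the lockless kick test is equivalent to the old one, which
 * compared the pre-increment defer_count against defer_max / 2 under the lock.
 */
#include <linux/types.h>

static bool defer_should_kick(long post_inc_count, unsigned int defer_max)
{
	return (post_inc_count - 1) == (defer_max >> 1);
}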
