
Commit 2df75cc

Merge branch 'net-optimize-tx-throughput-and-efficiency'
Eric Dumazet says:

====================
net: optimize TX throughput and efficiency

In this series, I replace the busylock spinlock we have in
__dev_queue_xmit() with a lockless list (llist), to reduce spinlock
contention to the minimum. The idea is that only one cpu might spin on
the qdisc spinlock, while the others simply add their skb to the llist.

After this series, we get a 300 % (4x) improvement on heavy TX
workloads: twice the number of packets per second, for half the cpu
cycles.
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
2 parents 01b6aca + 100dfa7 commit 2df75cc

7 files changed: +111 -104 lines
include/linux/netdevice_xmit.h

Lines changed: 8 additions & 1 deletion

@@ -2,14 +2,21 @@
 #ifndef _LINUX_NETDEVICE_XMIT_H
 #define _LINUX_NETDEVICE_XMIT_H
 
+#if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
+#define MIRRED_NEST_LIMIT 4
+#endif
+
+struct net_device;
+
 struct netdev_xmit {
 	u16 recursion;
 	u8 more;
 #ifdef CONFIG_NET_EGRESS
 	u8 skip_txqueue;
 #endif
 #if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
-	u8 sched_mirred_nest;
+	u8 sched_mirred_nest;
+	struct net_device *sched_mirred_dev[MIRRED_NEST_LIMIT];
 #endif
 #if IS_ENABLED(CONFIG_NF_DUP_NETDEV)
 	u8 nf_dup_skb_recursion;

include/net/sch_generic.h

Lines changed: 10 additions & 13 deletions

@@ -41,13 +41,6 @@ enum qdisc_state_t {
 	__QDISC_STATE_DRAINING,
 };
 
-enum qdisc_state2_t {
-	/* Only for !TCQ_F_NOLOCK qdisc. Never access it directly.
-	 * Use qdisc_run_begin/end() or qdisc_is_running() instead.
-	 */
-	__QDISC_STATE2_RUNNING,
-};
-
 #define QDISC_STATE_MISSED	BIT(__QDISC_STATE_MISSED)
 #define QDISC_STATE_DRAINING	BIT(__QDISC_STATE_DRAINING)
 
@@ -117,13 +110,14 @@ struct Qdisc {
 	struct qdisc_skb_head q;
 	struct gnet_stats_basic_sync bstats;
 	struct gnet_stats_queue qstats;
-	int owner;
+	bool running; /* must be written under qdisc spinlock */
 	unsigned long state;
-	unsigned long state2; /* must be written under qdisc spinlock */
 	struct Qdisc *next_sched;
 	struct sk_buff_head skb_bad_txq;
 
-	spinlock_t busylock ____cacheline_aligned_in_smp;
+	atomic_long_t defer_count ____cacheline_aligned_in_smp;
+	struct llist_head defer_list;
+
 	spinlock_t seqlock;
 
 	struct rcu_head rcu;
@@ -168,7 +162,7 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
 {
 	if (qdisc->flags & TCQ_F_NOLOCK)
 		return spin_is_locked(&qdisc->seqlock);
-	return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+	return READ_ONCE(qdisc->running);
 }
 
 static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc)
@@ -211,7 +205,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 		 */
 		return spin_trylock(&qdisc->seqlock);
 	}
-	return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+	if (READ_ONCE(qdisc->running))
+		return false;
+	WRITE_ONCE(qdisc->running, true);
+	return true;
 }
 
 static inline void qdisc_run_end(struct Qdisc *qdisc)
@@ -229,7 +226,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
 				      &qdisc->state)))
 			__netif_schedule(qdisc);
 	} else {
-		__clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+		WRITE_ONCE(qdisc->running, false);
 	}
 }
 
net/core/dev.c

Lines changed: 56 additions & 41 deletions

@@ -4125,9 +4125,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct net_device *dev,
 				 struct netdev_queue *txq)
 {
+	struct sk_buff *next, *to_free = NULL;
 	spinlock_t *root_lock = qdisc_lock(q);
-	struct sk_buff *to_free = NULL;
-	bool contended;
+	struct llist_node *ll_list, *first_n;
+	unsigned long defer_count = 0;
 	int rc;
 
 	qdisc_calculate_pkt_len(skb, q);
@@ -4167,67 +4168,81 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		return rc;
 	}
 
-	if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
-		return NET_XMIT_DROP;
-	}
-	/*
-	 * Heuristic to force contended enqueues to serialize on a
-	 * separate lock before trying to get qdisc main lock.
-	 * This permits qdisc->running owner to get the lock more
-	 * often and dequeue packets faster.
-	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
-	 * and then other tasks will only enqueue packets. The packets will be
-	 * sent after the qdisc owner is scheduled again. To prevent this
-	 * scenario the task always serialize on the lock.
+	/* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
+	 * In the try_cmpxchg() loop, we want to increment q->defer_count
+	 * at most once to limit the number of skbs in defer_list.
+	 * We perform the defer_count increment only if the list is not empty,
+	 * because some arches have slow atomic_long_inc_return().
+	 */
+	first_n = READ_ONCE(q->defer_list.first);
+	do {
+		if (first_n && !defer_count) {
+			defer_count = atomic_long_inc_return(&q->defer_count);
+			if (unlikely(defer_count > q->limit)) {
+				kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+				return NET_XMIT_DROP;
+			}
+		}
+		skb->ll_node.next = first_n;
+	} while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node));
+
+	/* If defer_list was not empty, we know the cpu which queued
+	 * the first skb will process the whole list for us.
 	 */
-	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
-	if (unlikely(contended))
-		spin_lock(&q->busylock);
+	if (first_n)
+		return NET_XMIT_SUCCESS;
 
 	spin_lock(root_lock);
+
+	ll_list = llist_del_all(&q->defer_list);
+	/* There is a small race because we clear defer_count not atomically
+	 * with the prior llist_del_all(). This means defer_list could grow
+	 * over q->limit.
+	 */
+	atomic_long_set(&q->defer_count, 0);
+
+	ll_list = llist_reverse_order(ll_list);
+
 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
-		__qdisc_drop(skb, &to_free);
+		llist_for_each_entry_safe(skb, next, ll_list, ll_node)
+			__qdisc_drop(skb, &to_free);
 		rc = NET_XMIT_DROP;
-	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
-		   qdisc_run_begin(q)) {
+		goto unlock;
+	}
+	if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+	    !llist_next(ll_list) && qdisc_run_begin(q)) {
 		/*
 		 * This is a work-conserving queue; there are no old skbs
 		 * waiting to be sent out; and the qdisc is not running -
 		 * xmit the skb directly.
 		 */
 
+		DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list,
							  struct sk_buff,
							  ll_node));
 		qdisc_bstats_update(q, skb);
-
-		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
-			if (unlikely(contended)) {
-				spin_unlock(&q->busylock);
-				contended = false;
-			}
+		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
 			__qdisc_run(q);
-		}
-
 		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
-		WRITE_ONCE(q->owner, smp_processor_id());
-		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
-		WRITE_ONCE(q->owner, -1);
-		if (qdisc_run_begin(q)) {
-			if (unlikely(contended)) {
-				spin_unlock(&q->busylock);
-				contended = false;
-			}
-			__qdisc_run(q);
-			qdisc_run_end(q);
+		int count = 0;
+
+		llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+			prefetch(next);
+			skb_mark_not_on_list(skb);
+			rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+			count++;
 		}
+		qdisc_run(q);
+		if (count != 1)
+			rc = NET_XMIT_SUCCESS;
 	}
+unlock:
 	spin_unlock(root_lock);
 	if (unlikely(to_free))
 		kfree_skb_list_reason(to_free,
 				      tcf_get_drop_reason(to_free));
-	if (unlikely(contended))
-		spin_unlock(&q->busylock);
 	return rc;
 }
 
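
The core of this dev.c change is the bounded, open-coded llist_add(): each CPU pushes its skb with a try_cmpxchg() loop and bumps q->defer_count at most once, dropping the packet when the deferred backlog exceeds q->limit; only the CPU that found defer_list empty goes on to take the root lock and enqueue the whole batch. Below is a user-space analog of just that bounded push, using C11 atomics (a sketch under those assumptions, not the kernel code):

#include <stdatomic.h>
#include <stddef.h>

struct node { struct node *next; };

struct defer_queue {
	_Atomic(struct node *) first;
	atomic_long defer_count;
	long limit;
};

/* Returns 1 if we pushed onto an empty list (caller must drain),
 * 0 if another thread will drain for us, -1 if dropped over the limit.
 */
static int defer_push(struct defer_queue *q, struct node *n)
{
	struct node *first = atomic_load(&q->first);
	long count = 0;

	do {
		/* Count this push at most once, and only when the list is
		 * not empty, mirroring the defer_count handling above.
		 */
		if (first && !count) {
			count = atomic_fetch_add(&q->defer_count, 1) + 1;
			if (count > q->limit)
				return -1;	/* drop */
		}
		n->next = first;
	} while (!atomic_compare_exchange_weak(&q->first, &first, n));

	return first ? 0 : 1;
}

The drain side then does the equivalent of llist_del_all() plus llist_reverse_order() under the lock and resets defer_count, accepting the small race the in-diff comment describes.
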

net/core/skbuff.c

Lines changed: 10 additions & 1 deletion

@@ -1136,7 +1136,16 @@ void skb_release_head_state(struct sk_buff *skb)
 	skb_dst_drop(skb);
 	if (skb->destructor) {
 		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
-		skb->destructor(skb);
+#ifdef CONFIG_INET
+		INDIRECT_CALL_3(skb->destructor,
+				tcp_wfree, __sock_wfree, sock_wfree,
+				skb);
+#else
+		INDIRECT_CALL_1(skb->destructor,
+				sock_wfree,
+				skb);
+
+#endif
 	}
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	nf_conntrack_put(skb_nfct(skb));
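
This skbuff.c hunk avoids an indirect call (costly under retpolines) for the common destructors by comparing the function pointer against the likely targets (tcp_wfree, __sock_wfree, sock_wfree) and calling them directly. A self-contained toy showing the idea behind the INDIRECT_CALL_n() wrappers (illustrative names, not kernel code):

#include <stdio.h>

struct buf { int id; };

static void free_a(struct buf *b) { printf("free_a %d\n", b->id); }
static void free_b(struct buf *b) { printf("free_b %d\n", b->id); }

static void release(struct buf *b, void (*destructor)(struct buf *))
{
	/* The INDIRECT_CALL_n() idea by hand: direct calls for the
	 * expected targets, indirect call only as a fallback.
	 */
	if (destructor == free_a)
		free_a(b);
	else if (destructor == free_b)
		free_b(b);
	else
		destructor(b);
}

int main(void)
{
	struct buf b = { .id = 1 };

	release(&b, free_a);
	release(&b, free_b);
	return 0;
}
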

net/sched/act_mirred.c

Lines changed: 23 additions & 39 deletions

@@ -29,31 +29,6 @@
 static LIST_HEAD(mirred_list);
 static DEFINE_SPINLOCK(mirred_list_lock);
 
-#define MIRRED_NEST_LIMIT 4
-
-#ifndef CONFIG_PREEMPT_RT
-static u8 tcf_mirred_nest_level_inc_return(void)
-{
-	return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest);
-}
-
-static void tcf_mirred_nest_level_dec(void)
-{
-	__this_cpu_dec(softnet_data.xmit.sched_mirred_nest);
-}
-
-#else
-static u8 tcf_mirred_nest_level_inc_return(void)
-{
-	return current->net_xmit.sched_mirred_nest++;
-}
-
-static void tcf_mirred_nest_level_dec(void)
-{
-	current->net_xmit.sched_mirred_nest--;
-}
-#endif
-
 static bool tcf_mirred_is_act_redirect(int action)
 {
 	return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
@@ -439,44 +414,53 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
 {
 	struct tcf_mirred *m = to_mirred(a);
 	int retval = READ_ONCE(m->tcf_action);
-	unsigned int nest_level;
+	struct netdev_xmit *xmit;
 	bool m_mac_header_xmit;
 	struct net_device *dev;
-	int m_eaction;
+	int i, m_eaction;
 	u32 blockid;
 
-	nest_level = tcf_mirred_nest_level_inc_return();
-	if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
+#ifdef CONFIG_PREEMPT_RT
+	xmit = &current->net_xmit;
+#else
+	xmit = this_cpu_ptr(&softnet_data.xmit);
+#endif
+	if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) {
 		net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
 				     netdev_name(skb->dev));
-		retval = TC_ACT_SHOT;
-		goto dec_nest_level;
+		return TC_ACT_SHOT;
 	}
 
 	tcf_lastuse_update(&m->tcf_tm);
 	tcf_action_update_bstats(&m->common, skb);
 
 	blockid = READ_ONCE(m->tcfm_blockid);
-	if (blockid) {
-		retval = tcf_blockcast(skb, m, blockid, res, retval);
-		goto dec_nest_level;
-	}
+	if (blockid)
+		return tcf_blockcast(skb, m, blockid, res, retval);
 
 	dev = rcu_dereference_bh(m->tcfm_dev);
 	if (unlikely(!dev)) {
 		pr_notice_once("tc mirred: target device is gone\n");
 		tcf_action_inc_overlimit_qstats(&m->common);
-		goto dec_nest_level;
+		return retval;
 	}
+	for (i = 0; i < xmit->sched_mirred_nest; i++) {
+		if (xmit->sched_mirred_dev[i] != dev)
+			continue;
+		pr_notice_once("tc mirred: loop on device %s\n",
			       netdev_name(dev));
+		tcf_action_inc_overlimit_qstats(&m->common);
+		return retval;
+	}
+
+	xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev;
 
 	m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
 	m_eaction = READ_ONCE(m->tcfm_eaction);
 
 	retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction,
 				   retval);
-
-dec_nest_level:
-	tcf_mirred_nest_level_dec();
+	xmit->sched_mirred_nest--;
 
 	return retval;
 }

net/sched/sch_generic.c

Lines changed: 0 additions & 7 deletions

@@ -666,7 +666,6 @@ struct Qdisc noop_qdisc = {
 	.ops		=	&noop_qdisc_ops,
 	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
 	.dev_queue	=	&noop_netdev_queue,
-	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
 	.gso_skb = {
 		.next = (struct sk_buff *)&noop_qdisc.gso_skb,
 		.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
@@ -679,7 +678,6 @@ struct Qdisc noop_qdisc = {
 		.qlen = 0,
 		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
 	},
-	.owner = -1,
 };
 EXPORT_SYMBOL(noop_qdisc);
 
@@ -971,10 +969,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 		}
 	}
 
-	spin_lock_init(&sch->busylock);
-	lockdep_set_class(&sch->busylock,
-			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
 	/* seqlock has the same scope of busylock, for NOLOCK qdisc */
 	spin_lock_init(&sch->seqlock);
 	lockdep_set_class(&sch->seqlock,
@@ -985,7 +979,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;
 	sch->dev_queue = dev_queue;
-	sch->owner = -1;
 	netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL);
 	refcount_set(&sch->refcnt, 1);
 

tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt

Lines changed: 4 additions & 2 deletions

@@ -23,14 +23,16 @@
 
 // install a qdisc dropping all packets
 +0 `tc qdisc delete dev tun0 root 2>/dev/null ; tc qdisc add dev tun0 root pfifo limit 0`
+
 +0 write(4, ..., 24) = 24
 // When qdisc is congested we retry every 500ms
 // (TCP_RESOURCE_PROBE_INTERVAL) and therefore
 // we retry 6 times before hitting 3s timeout.
 // First verify that the connection is alive:
-+3.250 write(4, ..., 24) = 24
++3 write(4, ..., 24) = 24
+
 // Now verify that shortly after that the socket is dead:
-+.100 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out)
++1 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out)
 
 +0 %{ assert tcpi_probes == 6, tcpi_probes; \
    assert tcpi_backoff == 0, tcpi_backoff }%
