Commit 22f578f
6.19 Revert "net/sched: Fix mirred deadlock on device recursion" & net: dev_queue_xmit() llist adoption & net: add a fast path in __netif_schedule()
This reverts commits 0f022d3
and 44180fe.
A prior patch in this series implemented loop detection
in act_mirred, so we can remove q->owner to save some cycles
in the fast path.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Victor Nogueira <victor@mojatatu.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
net: dev_queue_xmit() llist adoption
Remove the busylock spinlock and use a lockless list (llist)
to reduce spinlock contention to a minimum.
The idea is that only one cpu might spin on the qdisc spinlock,
while the others simply add their skb to the llist.
After this patch, we get a 300 % improvement on heavy TX workloads:
- sending twice the number of packets per second,
- while consuming 50 % fewer cycles.
Note that this also allows us, in the future, to submit batches
to the various qdisc->enqueue() methods.
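As an illustration of the pattern (a minimal sketch, not the upstream code: the defer_list member in struct Qdisc and the reuse of skb->ll_node are assumptions; the llist primitives themselves are the existing kernel API), the transmit path conceptually becomes:

#include <linux/llist.h>
#include <linux/skbuff.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>

/* Sketch only: assumes a hypothetical 'struct llist_head defer_list'
 * member in struct Qdisc, replacing the old busylock.
 */
static int sketch_dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                               spinlock_t *root_lock)
{
        struct sk_buff *to_free = NULL;
        struct llist_node *ll_list;
        struct sk_buff *next;

        /* Publish the skb on the lockless list.  llist_add() returns
         * true only for the cpu that found the list empty: that cpu
         * becomes responsible for taking the qdisc lock and draining
         * everything; the other cpus return immediately.
         */
        if (!llist_add(&skb->ll_node, &q->defer_list))
                return NET_XMIT_SUCCESS;

        spin_lock(root_lock);

        /* Grab all deferred skbs at once and restore submission order. */
        ll_list = llist_del_all(&q->defer_list);
        ll_list = llist_reverse_order(ll_list);

        llist_for_each_entry_safe(skb, next, ll_list, ll_node)
                q->enqueue(skb, q, &to_free);

        qdisc_run(q);
        spin_unlock(root_lock);

        if (unlikely(to_free))
                kfree_skb_list(to_free);
        return NET_XMIT_SUCCESS;
}

Only the cpu that turned the list from empty to non-empty takes the qdisc spinlock; every other producer performs a single lockless cmpxchg in llist_add(), which is where the contention reduction comes from.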
Tested:
- Dual Intel(R) Xeon(R) 6985P-C (480 hyper threads).
- 100Gbit NIC, 30 TX queues with FQ packet scheduler.
- echo 64 >/sys/kernel/slab/skbuff_small_head/cpu_partial (avoid contention in mm)
- 240 concurrent "netperf -t UDP_STREAM -- -m 120 -n"
Before:
16 Mpps (41 Mpps if each thread is pinned to a different cpu)
vmstat 2 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
r b swpd free buff cache si so bi bo in cs us sy id wa st
243 0 0 2368988672 51036 1100852 0 0 146 1 242 60 0 9 91 0 0
244 0 0 2368988672 51036 1100852 0 0 536 10 487745 14718 0 52 48 0 0
244 0 0 2368988672 51036 1100852 0 0 512 0 503067 46033 0 52 48 0 0
244 0 0 2368988672 51036 1100852 0 0 512 0 494807 12107 0 52 48 0 0
244 0 0 2368988672 51036 1100852 0 0 702 26 492845 10110 0 52 48 0 0
Lock contention (1 second sample taken on 8 cores)
perf lock record -C0-7 sleep 1; perf lock contention
contended total wait max wait avg wait type caller
442111 6.79 s 162.47 ms 15.35 us spinlock dev_hard_start_xmit+0xcd
5961 9.57 ms 8.12 us 1.60 us spinlock __dev_queue_xmit+0x3a0
244 560.63 us 7.63 us 2.30 us spinlock do_softirq+0x5b
13 25.09 us 3.21 us 1.93 us spinlock net_tx_action+0xf8
If netperf threads are pinned, spinlock stress is very high.
perf lock record -C0-7 sleep 1; perf lock contention
contended total wait max wait avg wait type caller
964508 7.10 s 147.25 ms 7.36 us spinlock dev_hard_start_xmit+0xcd
201 268.05 us 4.65 us 1.33 us spinlock __dev_queue_xmit+0x3a0
12 26.05 us 3.84 us 2.17 us spinlock do_softirq+0x5b
@__dev_queue_xmit_ns:
[256, 512) 21 | |
[512, 1K) 631 | |
[1K, 2K) 27328 |@ |
[2K, 4K) 265392 |@@@@@@@@@@@@@@@@ |
[4K, 8K) 417543 |@@@@@@@@@@@@@@@@@@@@@@@@@@ |
[8K, 16K) 826292 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[16K, 32K) 733822 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
[32K, 64K) 19055 |@ |
[64K, 128K) 17240 |@ |
[128K, 256K) 25633 |@ |
[256K, 512K) 4 | |
After:
29 Mpps (57 Mpps if each thread is pinned to a different cpu)
vmstat 2 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
r b swpd free buff cache si so bi bo in cs us sy id wa st
78 0 0 2369573632 32896 1350988 0 0 22 0 331 254 0 8 92 0 0
75 0 0 2369573632 32896 1350988 0 0 22 50 425713 280199 0 23 76 0 0
104 0 0 2369573632 32896 1350988 0 0 290 0 430238 298247 0 23 76 0 0
86 0 0 2369573632 32896 1350988 0 0 132 0 428019 291865 0 24 76 0 0
90 0 0 2369573632 32896 1350988 0 0 502 0 422498 278672 0 23 76 0 0
perf lock record -C0-7 sleep 1; perf lock contention
contended total wait max wait avg wait type caller
2524 116.15 ms 486.61 us 46.02 us spinlock __dev_queue_xmit+0x55b
5821 107.18 ms 371.67 us 18.41 us spinlock dev_hard_start_xmit+0xcd
2377 9.73 ms 35.86 us 4.09 us spinlock ___slab_alloc+0x4e0
923 5.74 ms 20.91 us 6.22 us spinlock ___slab_alloc+0x5c9
121 3.42 ms 193.05 us 28.24 us spinlock net_tx_action+0xf8
6 564.33 us 167.60 us 94.05 us spinlock do_softirq+0x5b
If netperf threads are pinned (~54 Mpps)
perf lock record -C0-7 sleep 1; perf lock contention
32907 316.98 ms 195.98 us 9.63 us spinlock dev_hard_start_xmit+0xcd
4507 61.83 ms 212.73 us 13.72 us spinlock __dev_queue_xmit+0x554
2781 23.53 ms 40.03 us 8.46 us spinlock ___slab_alloc+0x5c9
3554 18.94 ms 34.69 us 5.33 us spinlock ___slab_alloc+0x4e0
233 9.09 ms 215.70 us 38.99 us spinlock do_softirq+0x5b
153 930.66 us 48.67 us 6.08 us spinlock net_tx_action+0xfd
84 331.10 us 14.22 us 3.94 us spinlock ___slab_alloc+0x5c9
140 323.71 us 9.94 us 2.31 us spinlock ___slab_alloc+0x4e0
@__dev_queue_xmit_ns:
[128, 256) 1539830 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
[256, 512) 2299558 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[512, 1K) 483936 |@@@@@@@@@@ |
[1K, 2K) 265345 |@@@@@@ |
[2K, 4K) 145463 |@@@ |
[4K, 8K) 54571 |@ |
[8K, 16K) 10270 | |
[16K, 32K) 9385 | |
[32K, 64K) 7749 | |
[64K, 128K) 26799 | |
[128K, 256K) 2665 | |
[256K, 512K) 665 | |
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-7-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
net: add a fast path in __netif_schedule()
Cpus serving NIC interrupts, and specifically TX completions, often
also end up restarting a busy qdisc (because the qdisc was stopped by BQL
or by the driver's own flow control).
When they call netdev_tx_completed_queue() or netif_tx_wake_queue(),
they call __netif_schedule() so that the queue can be run
later from net_tx_action() (raising NET_TX_SOFTIRQ).
Quite often, by the time the cpu reaches net_tx_action(), another cpu
has grabbed the qdisc spinlock from __dev_xmit_skb(), and we spend too much
time spinning on this lock.
We can detect in __netif_schedule() whether a cpu is already at a specific
point in __dev_xmit_skb() where we have the guarantee that the queue will
be run.
This patch gives a 13 % increase in throughput on an IDPF NIC (200Gbit),
32 TX queues, sending UDP packets of 120 bytes.
This also helps __qdisc_run() avoid forcing a NET_TX_SOFTIRQ
if another thread is waiting in __dev_xmit_skb().
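Conceptually (a minimal sketch: the qdisc_will_run condition is a hypothetical stand-in for the actual test the patch performs; the slow path mirrors the existing __netif_schedule()):

/* Sketch of the __netif_schedule() fast path.  'qdisc_will_run' is a
 * hypothetical field standing for "a cpu already passed the point in
 * __dev_xmit_skb() that guarantees the qdisc will be run"; the exact
 * upstream mechanism may differ.
 */
static void sketch_netif_schedule(struct Qdisc *q)
{
        /* Fast path: someone is already committed to running this
         * qdisc, no need to raise NET_TX_SOFTIRQ at all.
         */
        if (READ_ONCE(q->qdisc_will_run))
                return;

        /* Slow path: defer to net_tx_action() via NET_TX_SOFTIRQ. */
        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
                __netif_reschedule(q);
}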
Before:
sar -n DEV 5 5|grep eth1|grep Average
Average: eth1 1496.44 52191462.56 210.00 13369396.90 0.00 0.00 0.00 54.76
After:
sar -n DEV 5 5|grep eth1|grep Average
Average: eth1 1457.88 59363099.96 205.08 15206384.35 0.00 0.00 0.00 62.29
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251017145334.3016097-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
net: allow busy connected flows to switch tx queues
This is a followup of commit 726e9e8 ("tcp: refine
skb->ooo_okay setting") and of a prior commit in this series
("net: control skb->ooo_okay from skb_set_owner_w()").
skb->ooo_okay might never be set for bulk flows that always
have at least one skb in a qdisc queue or NIC queue,
especially if TX completion is delayed because of a stressed cpu.
The so-called "strange attractors" have caused many performance
issues (see for instance 9b462d0 ("tcp: TCP Small Queues
and strange attractors")); we need to do better.
We have tried very hard to avoid reorders because TCP was
not dealing with them nicely a decade ago.
Use the new net.core.txq_reselection_ms sysctl to let
flows follow XPS and select a more efficient queue.
After this patch, we no longer have to make sure threads
are pinned to cpus; they can now be migrated without
adding too much spinlock/qdisc/TX completion pressure.
The TX completion part was problematic, because it added false sharing
on various socket fields, and also false sharing and spinlock
contention in mm layers. Calling skb_orphan() from ndo_start_xmit()
is unfortunately not an option.
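To illustrate the idea (a minimal sketch under stated assumptions: sysctl_txq_reselection, holding the sysctl value already converted from ms to jiffies, is a hypothetical name, and the exact placement of this test in the real patch may differ; sk_tx_queue_mapping and sk_tx_queue_mapping_jiffies are the fields named in this changelog):

/* Sketch of the reselection check in the sk_tx_queue_get() path. */
static int sketch_sk_tx_queue_get(const struct sock *sk)
{
        unsigned long delay = READ_ONCE(sysctl_txq_reselection); /* hypothetical */
        int txq = READ_ONCE(sk->sk_tx_queue_mapping);

        if (txq == NO_QUEUE_MAPPING)
                return -1;

        /* If the cached mapping is older than txq_reselection_ms,
         * report "no mapping" so that the next transmit re-runs the
         * XPS queue selection.
         */
        if (delay &&
            time_after(jiffies,
                       READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + delay))
                return -1;

        return txq;
}

Returning -1 here makes netdev_pick_tx() run XPS again, so the flow can follow its thread to the new cpu even if skb->ooo_okay was never set.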
Note for later:
1) Move sk->sk_tx_queue_mapping closer to sk_tx_queue_mapping_jiffies
for better cache locality.
2) Study whether 9b462d0 ("tcp: TCP Small Queues
and strange attractors") could be revised.
Tested:
Used a host with 32 TX queues, shared by groups of 8 cores.
XPS setup:
echo ff >/sys/class/net/eth1/queues/tx-0/xps_cpus
echo ff00 >/sys/class/net/eth1/queues/tx-1/xps_cpus
echo ff0000 >/sys/class/net/eth1/queues/tx-2/xps_cpus
echo ff000000 >/sys/class/net/eth1/queues/tx-3/xps_cpus
echo ff,00000000 >/sys/class/net/eth1/queues/tx-4/xps_cpus
echo ff00,00000000 >/sys/class/net/eth1/queues/tx-5/xps_cpus
echo ff0000,00000000 >/sys/class/net/eth1/queues/tx-6/xps_cpus
echo ff000000,00000000 >/sys/class/net/eth1/queues/tx-7/xps_cpus
...
Launched a tcp_stream with 15 threads and 1000 flows, initially affined to cores 0-15:
taskset -c 0-15 tcp_stream -T15 -F1000 -l1000 -c -H target_host
Checked that only queues 0 and 1 are used, as instructed by XPS:
tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
backlog 123489410b 1890p
backlog 69809026b 1064p
backlog 52401054b 805p
Then force each thread to run on cpus 1,9,17,25,33,41,49,57,65,73,81,89,97,105,113,121:
C=1;PID=`pidof tcp_stream`;for P in `ls /proc/$PID/task`; do taskset -pc $C $P; C=$(($C + 8));done
Set txq_reselection_ms to 1000
echo 1000 > /proc/sys/net/core/txq_reselection_ms
Check that the flows have migrated nicely:
tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
backlog 130508314b 1916p
backlog 8584380b 126p
backlog 8584380b 126p
backlog 8379990b 123p
backlog 8584380b 126p
backlog 8487484b 125p
backlog 8584380b 126p
backlog 8448120b 124p
backlog 8584380b 126p
backlog 8720640b 128p
backlog 8856900b 130p
backlog 8584380b 126p
backlog 8652510b 127p
backlog 8448120b 124p
backlog 8516250b 125p
backlog 7834950b 115p
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251013152234.842065-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
3 files changed, 93 insertions(+), 49 deletions(-)