@@ -4125,9 +4125,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct net_device *dev,
 				 struct netdev_queue *txq)
 {
+	struct sk_buff *next, *to_free = NULL;
 	spinlock_t *root_lock = qdisc_lock(q);
-	struct sk_buff *to_free = NULL;
-	bool contended;
+	struct llist_node *ll_list, *first_n;
+	unsigned long defer_count = 0;
 	int rc;
 
 	qdisc_calculate_pkt_len(skb, q);
@@ -4167,67 +4168,81 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		return rc;
 	}
 
-	if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
-		return NET_XMIT_DROP;
-	}
-	/*
-	 * Heuristic to force contended enqueues to serialize on a
-	 * separate lock before trying to get qdisc main lock.
-	 * This permits qdisc->running owner to get the lock more
-	 * often and dequeue packets faster.
-	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
-	 * and then other tasks will only enqueue packets. The packets will be
-	 * sent after the qdisc owner is scheduled again. To prevent this
-	 * scenario the task always serialize on the lock.
+	/* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
+	 * In the try_cmpxchg() loop, we want to increment q->defer_count
+	 * at most once to limit the number of skbs in defer_list.
+	 * We perform the defer_count increment only if the list is not empty,
+	 * because some arches have slow atomic_long_inc_return().
+	 */
+	first_n = READ_ONCE(q->defer_list.first);
+	do {
+		if (first_n && !defer_count) {
+			defer_count = atomic_long_inc_return(&q->defer_count);
+			if (unlikely(defer_count > q->limit)) {
+				kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+				return NET_XMIT_DROP;
+			}
+		}
+		skb->ll_node.next = first_n;
+	} while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node));
+
+	/* If defer_list was not empty, we know the cpu which queued
+	 * the first skb will process the whole list for us.
 	 */
-	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
-	if (unlikely(contended))
-		spin_lock(&q->busylock);
+	if (first_n)
+		return NET_XMIT_SUCCESS;
 
 	spin_lock(root_lock);
+
+	ll_list = llist_del_all(&q->defer_list);
+	/* There is a small race because we clear defer_count not atomically
+	 * with the prior llist_del_all(). This means defer_list could grow
+	 * over q->limit.
+	 */
+	atomic_long_set(&q->defer_count, 0);
+
+	ll_list = llist_reverse_order(ll_list);
+
 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
-		__qdisc_drop(skb, &to_free);
+		llist_for_each_entry_safe(skb, next, ll_list, ll_node)
+			__qdisc_drop(skb, &to_free);
 		rc = NET_XMIT_DROP;
-	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
-		   qdisc_run_begin(q)) {
+		goto unlock;
+	}
+	if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+	    !llist_next(ll_list) && qdisc_run_begin(q)) {
 		/*
 		 * This is a work-conserving queue; there are no old skbs
 		 * waiting to be sent out; and the qdisc is not running -
 		 * xmit the skb directly.
 		 */
 
+		DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list,
+							  struct sk_buff,
+							  ll_node));
 		qdisc_bstats_update(q, skb);
-
-		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
-			if (unlikely(contended)) {
-				spin_unlock(&q->busylock);
-				contended = false;
-			}
+		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
 			__qdisc_run(q);
-		}
-
 		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
-		WRITE_ONCE(q->owner, smp_processor_id());
-		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
-		WRITE_ONCE(q->owner, -1);
-		if (qdisc_run_begin(q)) {
-			if (unlikely(contended)) {
-				spin_unlock(&q->busylock);
-				contended = false;
-			}
-			__qdisc_run(q);
-			qdisc_run_end(q);
+		int count = 0;
+
+		llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+			prefetch(next);
+			skb_mark_not_on_list(skb);
+			rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+			count++;
 		}
+		qdisc_run(q);
+		if (count != 1)
+			rc = NET_XMIT_SUCCESS;
 	}
+unlock:
 	spin_unlock(root_lock);
 	if (unlikely(to_free))
 		kfree_skb_list_reason(to_free,
 				      tcf_get_drop_reason(to_free));
-	if (unlikely(contended))
-		spin_unlock(&q->busylock);
 	return rc;
 }
 
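Note on the pattern used above: producers push skbs onto a lock-free singly linked list (q->defer_list) with a bounded counter (q->defer_count), and only the producer that found the list empty takes the qdisc root lock and drains the whole batch, reversing it to restore FIFO order. Below is a minimal, self-contained user-space sketch of that scheme using C11 atomics. It is an illustration only, not kernel code: the names batch_list, batch_push and batch_del_all are invented for the example, the memory ordering is simplified, and the qdisc locking and per-skb handling of the real drain path are omitted.

/* Stand-alone illustration (not kernel code): lock-free batch enqueue.
 * Producers push nodes onto an intrusive lock-free stack with a bounded
 * counter; the producer that finds the stack empty becomes responsible
 * for draining the whole batch, reversing it to recover FIFO order.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int payload;
};

struct batch_list {
	_Atomic(struct node *) first;
	atomic_long count;
	long limit;
};

/* Push one node. Returns false if the batch is over its limit (caller
 * drops the node). Sets *was_empty when this push turned the list from
 * empty to non-empty, i.e. this caller must drain the batch later.
 * Mirrors the open-coded llist_add() + defer_count logic in the patch:
 * the counter is bumped at most once, and only when the list is not empty.
 */
static bool batch_push(struct batch_list *b, struct node *n, bool *was_empty)
{
	struct node *first = atomic_load_explicit(&b->first, memory_order_relaxed);
	long counted = 0;

	do {
		if (first && !counted) {
			counted = atomic_fetch_add(&b->count, 1) + 1;
			if (counted > b->limit)
				return false;		/* queue full */
		}
		n->next = first;
	} while (!atomic_compare_exchange_weak_explicit(&b->first, &first, n,
							memory_order_release,
							memory_order_relaxed));
	*was_empty = (first == NULL);
	return true;
}

/* Drain everything in one atomic exchange, reset the counter (the same
 * benign race as in the patch: the list can briefly exceed the limit),
 * then reverse the LIFO chain so the caller sees FIFO order.
 */
static struct node *batch_del_all(struct batch_list *b)
{
	struct node *list = atomic_exchange(&b->first, NULL);
	struct node *rev = NULL;

	atomic_store(&b->count, 0);
	while (list) {
		struct node *next = list->next;

		list->next = rev;
		rev = list;
		list = next;
	}
	return rev;
}

int main(void)
{
	struct batch_list b = { .first = NULL, .count = 0, .limit = 128 };
	struct node nodes[3] = { { .payload = 1 }, { .payload = 2 }, { .payload = 3 } };
	bool was_empty = false;

	for (int i = 0; i < 3; i++)
		if (batch_push(&b, &nodes[i], &was_empty) && was_empty)
			printf("node %d saw an empty list and will drain\n",
			       nodes[i].payload);

	for (struct node *n = batch_del_all(&b); n; n = n->next)
		printf("%d\n", n->payload);	/* prints 1, 2, 3 */
	return 0;
}

The design choice this sketch mirrors: cross-CPU contention is paid once per enqueue with a single cmpxchg on the list head, while one CPU takes the expensive lock and processes the accumulated batch, instead of every sender serializing on busylock and then on the qdisc root lock as in the removed code.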