Skip to content

Commit 84f8ca1

Browse files
Andrew Gallatin
authored and committed
iflib: add a simple transmit routine
While mp_ring can provide amazing scalability in scenarios where the number of cores exceeds the number of NIC tx rings, it can also lead to greatly reduced performance in simpler, high packet rate scenarios due to extra CPU cycles and cache misses stemming from its complexity. This change implements a simple if_transmit routine, selected at driver load. This routine does not queue anything, and uses a simple queue selection and ends up being far more cache friendly. In testing on a 400GbE NIC in an AMD 7502P EPYC server, this simple tx routine is roughly 2.5 times as fast as mp_ring (8Gbs -> 20Gb/s). and 5x as fast as mp_ring with tx_abdicate=1 (4Gbs -> 20Gb/s) for a simple in-kernel packet generator, which is closed source currently. It also shows a 50% speedup for a simple netperf -tUDP_STREAM test (5Gb/s -> 8Gbs). This change is mostly a noop, as it not enabled by default. The one exception is the change to iflib_encap() to immediately reclaim completed tx descriptors, and only failing the transmit and scheduling a later reclaim if iflib_completed_tx_reclaim() didn't free enough descriptors. Reviewed by: kbowling, sumit.saxena_broadcom.com, vmaffione Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D51905
1 parent fe2418f commit 84f8ca1

File tree

2 files changed

+93
-13
lines changed

2 files changed

+93
-13
lines changed

share/man/man4/iflib.4

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.Dd September 27, 2018
1+
.Dd August 20, 2025
22
.Dt IFLIB 4
33
.Os
44
.Sh NAME
@@ -64,6 +64,18 @@ If this is zero or not set, an RX and TX queue pair will be assigned to each
6464
core.
6565
When set to a non-zero value, TX queues are assigned to cores following the
6666
last RX queue.
67+
.It Va simple_tx
68+
When set to one, iflib uses a simple transmit routine with no queuing at all.
69+
By default, iflib uses a highly optimized, lockless, transmit queue called
70+
mp_ring.
71+
This performs well when there are more CPU cores than NIC
72+
queues and prevents lock contention for transmit resources.
73+
Unfortunately, mp_ring incurs unneeded overheads on workloads where
74+
resource contention is not a problem (well behaved applications on
75+
systems where there are as many NIC queues as CPU cores).
76+
Note that when this is enabled, the tx_abdicate sysctl is no longer
77+
applicable and is ignored.
78+
Defaults to zero.
6779
.El
6880
.Pp
6981
These

sys/net/iflib.c

Lines changed: 80 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ struct iflib_ctx;
142142
static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
143143
static void iflib_timer(void *arg);
144144
static void iflib_tqg_detach(if_ctx_t ctx);
145+
static int iflib_simple_transmit(if_t ifp, struct mbuf *m);
145146

146147
typedef struct iflib_filter_info {
147148
driver_filter_t *ifi_filter;
@@ -198,6 +199,7 @@ struct iflib_ctx {
198199
uint8_t ifc_sysctl_use_logical_cores;
199200
uint16_t ifc_sysctl_extra_msix_vectors;
200201
bool ifc_cpus_are_physical_cores;
202+
bool ifc_sysctl_simple_tx;
201203

202204
qidx_t ifc_sysctl_ntxds[8];
203205
qidx_t ifc_sysctl_nrxds[8];
@@ -725,6 +727,7 @@ static void iflib_free_intr_mem(if_ctx_t ctx);
725727
#ifndef __NO_STRICT_ALIGNMENT
726728
static struct mbuf *iflib_fixup_rx(struct mbuf *m);
727729
#endif
730+
static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh);
728731

729732
static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
730733
SLIST_HEAD_INITIALIZER(cpu_offsets);
@@ -2624,8 +2627,10 @@ iflib_stop(if_ctx_t ctx)
26242627
#endif /* DEV_NETMAP */
26252628
CALLOUT_UNLOCK(txq);
26262629

2627-
/* clean any enqueued buffers */
2628-
iflib_ifmp_purge(txq);
2630+
if (!ctx->ifc_sysctl_simple_tx) {
2631+
/* clean any enqueued buffers */
2632+
iflib_ifmp_purge(txq);
2633+
}
26292634
/* Free any existing tx buffers. */
26302635
for (j = 0; j < txq->ift_size; j++) {
26312636
iflib_txsd_free(ctx, txq, j);
@@ -3635,13 +3640,16 @@ iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
36353640
* cxgb
36363641
*/
36373642
if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
3638-
txq->ift_no_desc_avail++;
3639-
bus_dmamap_unload(buf_tag, map);
3640-
DBG_COUNTER_INC(encap_txq_avail_fail);
3641-
DBG_COUNTER_INC(encap_txd_encap_fail);
3642-
if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
3643-
GROUPTASK_ENQUEUE(&txq->ift_task);
3644-
return (ENOBUFS);
3643+
(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
3644+
if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
3645+
txq->ift_no_desc_avail++;
3646+
bus_dmamap_unload(buf_tag, map);
3647+
DBG_COUNTER_INC(encap_txq_avail_fail);
3648+
DBG_COUNTER_INC(encap_txd_encap_fail);
3649+
if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
3650+
GROUPTASK_ENQUEUE(&txq->ift_task);
3651+
return (ENOBUFS);
3652+
}
36453653
}
36463654
/*
36473655
* On Intel cards we can greatly reduce the number of TX interrupts
@@ -4014,6 +4022,12 @@ _task_fn_tx(void *context)
40144022
netmap_tx_irq(ifp, txq->ift_id))
40154023
goto skip_ifmp;
40164024
#endif
4025+
if (ctx->ifc_sysctl_simple_tx) {
4026+
mtx_lock(&txq->ift_mtx);
4027+
(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
4028+
mtx_unlock(&txq->ift_mtx);
4029+
goto skip_ifmp;
4030+
}
40174031
#ifdef ALTQ
40184032
if (if_altq_is_enabled(ifp))
40194033
iflib_altq_if_start(ifp);
@@ -4027,9 +4041,8 @@ _task_fn_tx(void *context)
40274041
*/
40284042
if (abdicate)
40294043
ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
4030-
#ifdef DEV_NETMAP
4044+
40314045
skip_ifmp:
4032-
#endif
40334046
if (ctx->ifc_flags & IFC_LEGACY)
40344047
IFDI_INTR_ENABLE(ctx);
40354048
else
@@ -5131,7 +5144,14 @@ iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ct
51315144

51325145
scctx = &ctx->ifc_softc_ctx;
51335146
ifp = ctx->ifc_ifp;
5134-
5147+
if (ctx->ifc_sysctl_simple_tx) {
5148+
#ifndef ALTQ
5149+
if_settransmitfn(ifp, iflib_simple_transmit);
5150+
device_printf(dev, "using simple if_transmit\n");
5151+
#else
5152+
device_printf(dev, "ALTQ prevents using simple if_transmit\n");
5153+
#endif
5154+
}
51355155
iflib_reset_qvalues(ctx);
51365156
IFNET_WLOCK();
51375157
CTX_LOCK(ctx);
@@ -6766,6 +6786,9 @@ iflib_add_device_sysctl_pre(if_ctx_t ctx)
67666786
SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
67676787
CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, "driver version");
67686788

6789+
SYSCTL_ADD_BOOL(ctx_list, oid_list, OID_AUTO, "simple_tx",
6790+
CTLFLAG_RDTUN, &ctx->ifc_sysctl_simple_tx, 0,
6791+
"use simple tx ring");
67696792
SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
67706793
CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
67716794
"# of txqs to use, 0 => use default #");
@@ -7088,3 +7111,48 @@ iflib_debugnet_poll(if_t ifp, int count)
70887111
return (0);
70897112
}
70907113
#endif /* DEBUGNET */
7114+
7115+
7116+
static inline iflib_txq_t
7117+
iflib_simple_select_queue(if_ctx_t ctx, struct mbuf *m)
7118+
{
7119+
int qidx;
7120+
7121+
if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m))
7122+
qidx = QIDX(ctx, m);
7123+
else
7124+
qidx = NTXQSETS(ctx) + FIRST_QSET(ctx) - 1;
7125+
return (&ctx->ifc_txqs[qidx]);
7126+
}
7127+
7128+
static int
7129+
iflib_simple_transmit(if_t ifp, struct mbuf *m)
7130+
{
7131+
if_ctx_t ctx;
7132+
iflib_txq_t txq;
7133+
int error;
7134+
int bytes_sent = 0, pkt_sent = 0, mcast_sent = 0;
7135+
7136+
7137+
ctx = if_getsoftc(ifp);
7138+
if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
7139+
IFF_DRV_RUNNING)
7140+
return (EBUSY);
7141+
txq = iflib_simple_select_queue(ctx, m);
7142+
mtx_lock(&txq->ift_mtx);
7143+
error = iflib_encap(txq, &m);
7144+
if (error == 0) {
7145+
pkt_sent++;
7146+
bytes_sent += m->m_pkthdr.len;
7147+
mcast_sent += !!(m->m_flags & M_MCAST);
7148+
(void)iflib_txd_db_check(txq, true);
7149+
}
7150+
(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
7151+
mtx_unlock(&txq->ift_mtx);
7152+
if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
7153+
if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
7154+
if (mcast_sent)
7155+
if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
7156+
7157+
return (error);
7158+
}

0 commit comments

Comments
 (0)