Skip to content

Commit 9f87b79

Browse files
committed
stm32/eth: Implement zero-copy of lwIP pbufs for TX path.
This option (currently only enabled for the N6) allows the TX path to hold on to a pbuf reference while the DMA accesses the pbuf's memory directly, instead of copying the entire pbuf data into the internal buffers. This is necessary to achieve gigabit speeds on the N6, although actually reaching that speed also requires the higher-up parts of the stack to be efficient.

Signed-off-by: Damien George <[email protected]>
1 parent f5a65b3 commit 9f87b79

File tree

1 file changed

+154
-24
lines changed

1 file changed

+154
-24
lines changed

ports/stm32/eth.c

Lines changed: 154 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
#define TX_DESCR_3_FD_Pos (29)
6464
#define TX_DESCR_3_LD_Pos (28)
6565
#define TX_DESCR_3_CIC_Pos (16)
66+
#define TX_DESCR_2_IOC_Pos (31)
6667
#define TX_DESCR_2_B1L_Pos (0)
6768
#define TX_DESCR_2_B1L_Msk (0x3fff << TX_DESCR_2_B1L_Pos)
6869
#elif defined(STM32H7)
@@ -111,8 +112,17 @@
111112
#define RX_BUF_SIZE (1528) // includes 4-byte CRC at end
112113
#define TX_BUF_SIZE (1528)
113114

115+
#if defined(MICROPY_HW_ETH_RMII_REF_CLK)
116+
// RMII in use.
114117
#define RX_BUF_NUM (5)
115118
#define TX_BUF_NUM (5)
119+
#define USE_PBUF_REF_FOR_TX (0)
120+
#else
121+
// RGMII in use, so increase number of buffers and use pbuf zero copy if possible.
122+
#define RX_BUF_NUM (16)
123+
#define TX_BUF_NUM (16)
124+
#define USE_PBUF_REF_FOR_TX (1)
125+
#endif
116126

117127
#if defined(STM32N6)
118128
// The N6 has two DMA channels, so use one for RX and one for TX.
@@ -132,7 +142,9 @@ typedef struct _eth_dma_t {
132142
eth_dma_rx_descr_t rx_descr[RX_BUF_NUM];
133143
eth_dma_tx_descr_t tx_descr[TX_BUF_NUM];
134144
uint8_t rx_buf[RX_BUF_NUM * RX_BUF_SIZE] __attribute__((aligned(8)));
145+
#if !USE_PBUF_REF_FOR_TX
135146
uint8_t tx_buf[TX_BUF_NUM * TX_BUF_SIZE] __attribute__((aligned(8)));
147+
#endif
136148
#if !defined(STM32H5) && !defined(STM32N6)
137149
// Make sure the size of this struct is 16k, for the MPU.
138150
uint8_t padding[16 * 1024
@@ -156,6 +168,11 @@ typedef struct _eth_t {
156168
// to go in a special RAM section, or have MPU settings applied.
157169
static eth_dma_t eth_dma MICROPY_HW_ETH_DMA_ATTRIBUTE;
158170

171+
#if USE_PBUF_REF_FOR_TX
172+
// This array holds lwIP pbufs that are currently in use by the DMA.
173+
static struct pbuf *eth_dma_pbuf[TX_BUF_NUM];
174+
#endif
175+
159176
// These variables index the buffers in eth_dma and are not shared with DMA.
160177
static size_t eth_dma_rx_descr_idx;
161178
static size_t eth_dma_tx_descr_idx;
@@ -521,6 +538,16 @@ static int eth_mac_init(eth_t *self) {
521538
ETH_DMACxIER_NIE // enable normal interrupts
522539
| ETH_DMACxIER_RIE // enable RX interrupt
523540
;
541+
#if USE_PBUF_REF_FOR_TX
542+
#if RX_DMA_CH == TX_DMA_CH
543+
ETH->DMA_CH[TX_DMA_CH].DMACIER |= ETH_DMACxIER_TIE; // enable TX interrupt
544+
#else
545+
ETH->DMA_CH[TX_DMA_CH].DMACIER =
546+
ETH_DMACxIER_NIE // enable normal interrupts
547+
| ETH_DMACxIER_TIE // enable TX interrupt
548+
;
549+
#endif
550+
#endif
524551
#else
525552
ETH->DMAIER =
526553
ETH_DMAIER_NISE // enable normal interrupts
@@ -565,7 +592,7 @@ static int eth_mac_init(eth_t *self) {
565592
#if defined(STM32H5) || defined(STM32H7) || defined(STM32N6)
566593
eth_dma.tx_descr[i].tdes0 = 0;
567594
eth_dma.tx_descr[i].tdes1 = 0;
568-
eth_dma.tx_descr[i].tdes2 = TX_BUF_SIZE & TX_DESCR_2_B1L_Msk;
595+
eth_dma.tx_descr[i].tdes2 = 0;
569596
eth_dma.tx_descr[i].tdes3 = 0;
570597
#else
571598
eth_dma.tx_descr[i].tdes0 = 1 << TX_DESCR_0_TCH_Pos;
@@ -590,6 +617,11 @@ static int eth_mac_init(eth_t *self) {
590617
ETH->DMATDLAR = (uint32_t)&eth_dma.tx_descr[0];
591618
#endif
592619
eth_dma_tx_descr_idx = 0;
620+
#if USE_PBUF_REF_FOR_TX
621+
for (int i = 0; i < TX_BUF_NUM; ++i) {
622+
eth_dma_pbuf[i] = NULL;
623+
}
624+
#endif
593625

594626
// Configure DMA
595627
#if defined(STM32H5) || defined(STM32H7)
@@ -728,7 +760,9 @@ static void eth_mac_deinit(eth_t *self) {
728760
#endif
729761
}
730762

731-
static int eth_tx_buf_get(size_t len, uint8_t **buf) {
763+
#if !USE_PBUF_REF_FOR_TX
764+
765+
int eth_tx_buf_get(size_t len, uint8_t **buf) {
732766
if (len > TX_BUF_SIZE) {
733767
return -MP_EINVAL;
734768
}
@@ -767,28 +801,51 @@ static int eth_tx_buf_get(size_t len, uint8_t **buf) {
767801
return 0;
768802
}
769803

770-
static int eth_tx_buf_send(void) {
771-
// Get TX descriptor and move to next one
772-
eth_dma_tx_descr_t *tx_descr = &eth_dma.tx_descr[eth_dma_tx_descr_idx];
773-
eth_dma_tx_descr_idx = (eth_dma_tx_descr_idx + 1) % TX_BUF_NUM;
804+
#else
774805

775-
// Schedule to send next outgoing frame
776-
#if defined(STM32H5) || defined(STM32H7) || defined(STM32N6)
777-
tx_descr->tdes3 =
778-
1 << TX_DESCR_3_OWN_Pos // owned by DMA
779-
| 1 << TX_DESCR_3_LD_Pos // last segment
780-
| 1 << TX_DESCR_3_FD_Pos // first segment
781-
| 3 << TX_DESCR_3_CIC_Pos // enable all checksums inserted by hardware
782-
;
783-
#else
784-
tx_descr->tdes0 =
785-
1 << TX_DESCR_0_OWN_Pos // owned by DMA
786-
| 1 << TX_DESCR_0_LS_Pos // last segment
787-
| 1 << TX_DESCR_0_FS_Pos // first segment
788-
| 3 << TX_DESCR_0_CIC_Pos // enable all checksums inserted by hardware
789-
| 1 << TX_DESCR_0_TCH_Pos // TX descriptor is chained
790-
;
791-
#endif
806+
int eth_tx_buf_get_ref(size_t len, uint8_t *buf, unsigned int idx) {
807+
// Wait for DMA to release the current TX descriptor (if it has it).
808+
eth_dma_tx_descr_t *tx_descr = &eth_dma.tx_descr[(eth_dma_tx_descr_idx + idx) % TX_BUF_NUM];
809+
uint32_t t0 = mp_hal_ticks_ms();
810+
while (tx_descr->tdes3 & (1 << TX_DESCR_3_OWN_Pos)) {
811+
if (mp_hal_ticks_ms() - t0 > 1000) {
812+
return -MP_ETIMEDOUT;
813+
}
814+
}
815+
816+
MP_HAL_CLEAN_DCACHE(buf, len);
817+
tx_descr->tdes2 = (len & TX_DESCR_2_B1L_Msk) | (1 << TX_DESCR_2_IOC_Pos);
818+
tx_descr->tdes0 = (uint32_t)buf;
819+
820+
return 0;
821+
}
822+
823+
#endif
824+
825+
static int eth_tx_buf_send(unsigned int num_segments) {
826+
for (unsigned int segment = 0; segment < num_segments; ++segment) {
827+
// Get TX descriptor and move to next one
828+
eth_dma_tx_descr_t *tx_descr = &eth_dma.tx_descr[eth_dma_tx_descr_idx];
829+
eth_dma_tx_descr_idx = (eth_dma_tx_descr_idx + 1) % TX_BUF_NUM;
830+
831+
// Schedule to send next outgoing frame
832+
#if defined(STM32H5) || defined(STM32H7) || defined(STM32N6)
833+
tx_descr->tdes3 =
834+
1 << TX_DESCR_3_OWN_Pos // owned by DMA
835+
| (segment == num_segments - 1) << TX_DESCR_3_LD_Pos // last segment
836+
| (segment == 0) << TX_DESCR_3_FD_Pos // first segment
837+
| 3 << TX_DESCR_3_CIC_Pos // enable all checksums inserted by hardware
838+
;
839+
#else
840+
tx_descr->tdes0 =
841+
1 << TX_DESCR_0_OWN_Pos // owned by DMA
842+
| (segment == num_segments - 1) << TX_DESCR_0_LS_Pos // last segment
843+
| (segment == 0) << TX_DESCR_0_FS_Pos // first segment
844+
| 3 << TX_DESCR_0_CIC_Pos // enable all checksums inserted by hardware
845+
| 1 << TX_DESCR_0_TCH_Pos // TX descriptor is chained
846+
;
847+
#endif
848+
}
792849

793850
// Notify ETH DMA that there is a new TX descriptor for sending
794851
__DMB();
@@ -902,6 +959,28 @@ void ETH_IRQHandler(void) {
902959
eth_dma_rx_free();
903960
}
904961
}
962+
963+
#if USE_PBUF_REF_FOR_TX
964+
#if RX_DMA_CH != TX_DMA_CH
965+
sr = ETH->DMA_CH[TX_DMA_CH].DMACSR;
966+
ETH->DMA_CH[TX_DMA_CH].DMACSR = ETH_DMACxSR_NIS;
967+
#endif
968+
uint32_t tx_interrupt = sr & ETH_DMACxSR_TI;
969+
if (tx_interrupt) {
970+
ETH->DMA_CH[TX_DMA_CH].DMACSR = ETH_DMACxSR_TI;
971+
for (int i = 0; i < TX_BUF_NUM; ++i) {
972+
eth_dma_tx_descr_t *tx_descr = &eth_dma.tx_descr[i];
973+
if (!(tx_descr->tdes3 & (1 << TX_DESCR_3_OWN_Pos))) {
974+
// DMA does not own it
975+
if (eth_dma_pbuf[i] != NULL) {
976+
// release pbuf
977+
pbuf_free(eth_dma_pbuf[i]);
978+
eth_dma_pbuf[i] = NULL;
979+
}
980+
}
981+
}
982+
}
983+
#endif
905984
}
906985

907986
/*******************************************************************************/
@@ -938,13 +1017,64 @@ static err_t eth_netif_output(struct netif *netif, struct pbuf *p) {
9381017
LINK_STATS_INC(link.xmit);
9391018
eth_trace(netif->state, (size_t)-1, p, NETUTILS_TRACE_IS_TX | NETUTILS_TRACE_NEWLINE);
9401019

1020+
#if USE_PBUF_REF_FOR_TX
1021+
1022+
// Work out how many segments the pbuf has, and if it needs a copy made.
1023+
bool made_pbuf_copy = false;
1024+
unsigned int num_segments = 0;
1025+
for (struct pbuf *pb = p; pb != NULL; pb = pb->next) {
1026+
if (PBUF_NEEDS_COPY(pb)) {
1027+
// Note: this path is called for large UDP packets that are fragmented,
1028+
// because the fragments use PBUF_REF to divide up the original data.
1029+
p = pbuf_clone(PBUF_RAW, PBUF_RAM, p);
1030+
made_pbuf_copy = true;
1031+
num_segments = 1;
1032+
break;
1033+
}
1034+
++num_segments;
1035+
}
1036+
1037+
// Allocate TX buffer slots.
1038+
unsigned int idx = 0;
1039+
for (struct pbuf *pb = p; pb != NULL; pb = pb->next) {
1040+
int ret = eth_tx_buf_get_ref(pb->len, pb->payload, idx++);
1041+
if (ret != 0) {
1042+
if (made_pbuf_copy) {
1043+
pbuf_free(p);
1044+
}
1045+
return ERR_BUF;
1046+
}
1047+
}
1048+
1049+
// Take references to pbufs
1050+
idx = 0;
1051+
for (struct pbuf *pb = p; pb != NULL; pb = pb->next) {
1052+
unsigned int tx_idx = (eth_dma_tx_descr_idx + idx) % TX_BUF_NUM;
1053+
if (eth_dma_pbuf[tx_idx] != NULL) {
1054+
pbuf_free(eth_dma_pbuf[tx_idx]);
1055+
}
1056+
if (!made_pbuf_copy) {
1057+
pbuf_ref(pb);
1058+
}
1059+
eth_dma_pbuf[tx_idx] = pb;
1060+
++idx;
1061+
}
1062+
1063+
// Start the transmission.
1064+
int ret = eth_tx_buf_send(num_segments);
1065+
1066+
#else
1067+
1068+
// Allocate TX slot, copy the pbuf, and start the transmission.
9411069
uint8_t *buf;
9421070
int ret = eth_tx_buf_get(p->tot_len, &buf);
9431071
if (ret == 0) {
9441072
pbuf_copy_partial(p, buf, p->tot_len, 0);
945-
ret = eth_tx_buf_send();
1073+
ret = eth_tx_buf_send(1);
9461074
}
9471075

1076+
#endif
1077+
9481078
return ret ? ERR_BUF : ERR_OK;
9491079
}
9501080

0 commit comments

Comments (0)