Skip to content

Commit 78d9018

Browse files
tlebkuba-moo
authored and committed
net: macb: single dma_alloc_coherent() for DMA descriptors
Move from 2*NUM_QUEUES dma_alloc_coherent() for DMA descriptor rings to 2 calls overall. Issue is with how all queues share the same register for configuring the upper 32-bits of Tx/Rx descriptor rings. Taking Tx, notice how TBQPH does *not* depend on the queue index: #define GEM_TBQP(hw_q) (0x0440 + ((hw_q) << 2)) #define GEM_TBQPH(hw_q) (0x04C8) queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma)); #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT if (bp->hw_dma_cap & HW_DMA_CAP_64B) queue_writel(queue, TBQPH, upper_32_bits(queue->tx_ring_dma)); #endif To maximise our chances of getting valid DMA addresses, we do a single dma_alloc_coherent() across queues. This improves the odds because alloc_pages() guarantees natural alignment. Other codepaths (IOMMU or dev/arch dma_map_ops) don't give high enough guarantees (even page-aligned isn't enough). Two consideration: - dma_alloc_coherent() gives us page alignment. Here we remove this constraint meaning each queue's ring won't be page-aligned anymore. - This can save some tiny amounts of memory. Fewer allocations means (1) less overhead (constant cost per alloc) and (2) less wasted bytes due to alignment constraints. Example for (2): 4 queues, default ring size (512), 64-bit DMA descriptors, 16K pages: - Before: 8 allocs of 8K, each rounded to 16K => 64K wasted. - After: 2 allocs of 32K => 0K wasted. Fixes: 02c958d ("net/macb: add TX multiqueue support for gem") Reviewed-by: Sean Anderson <[email protected]> Acked-by: Nicolas Ferre <[email protected]> Tested-by: Nicolas Ferre <[email protected]> # on sam9x75 Signed-off-by: Théo Lebrun <[email protected]> Reviewed-by: Simon Horman <[email protected]> Link: https://patch.msgid.link/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 92d4256 commit 78d9018

File tree

1 file changed

+41
-39
lines changed

1 file changed

+41
-39
lines changed

drivers/net/ethernet/cadence/macb_main.c

Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2478,32 +2478,30 @@ static unsigned int macb_rx_ring_size_per_queue(struct macb *bp)
24782478

24792479
static void macb_free_consistent(struct macb *bp)
24802480
{
2481+
struct device *dev = &bp->pdev->dev;
24812482
struct macb_queue *queue;
24822483
unsigned int q;
2484+
size_t size;
24832485

24842486
if (bp->rx_ring_tieoff) {
2485-
dma_free_coherent(&bp->pdev->dev, macb_dma_desc_get_size(bp),
2487+
dma_free_coherent(dev, macb_dma_desc_get_size(bp),
24862488
bp->rx_ring_tieoff, bp->rx_ring_tieoff_dma);
24872489
bp->rx_ring_tieoff = NULL;
24882490
}
24892491

24902492
bp->macbgem_ops.mog_free_rx_buffers(bp);
24912493

2494+
size = bp->num_queues * macb_tx_ring_size_per_queue(bp);
2495+
dma_free_coherent(dev, size, bp->queues[0].tx_ring, bp->queues[0].tx_ring_dma);
2496+
2497+
size = bp->num_queues * macb_rx_ring_size_per_queue(bp);
2498+
dma_free_coherent(dev, size, bp->queues[0].rx_ring, bp->queues[0].rx_ring_dma);
2499+
24922500
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
24932501
kfree(queue->tx_skb);
24942502
queue->tx_skb = NULL;
2495-
if (queue->tx_ring) {
2496-
dma_free_coherent(&bp->pdev->dev,
2497-
macb_tx_ring_size_per_queue(bp),
2498-
queue->tx_ring, queue->tx_ring_dma);
2499-
queue->tx_ring = NULL;
2500-
}
2501-
if (queue->rx_ring) {
2502-
dma_free_coherent(&bp->pdev->dev,
2503-
macb_rx_ring_size_per_queue(bp),
2504-
queue->rx_ring, queue->rx_ring_dma);
2505-
queue->rx_ring = NULL;
2506-
}
2503+
queue->tx_ring = NULL;
2504+
queue->rx_ring = NULL;
25072505
}
25082506
}
25092507

@@ -2545,41 +2543,45 @@ static int macb_alloc_rx_buffers(struct macb *bp)
25452543

25462544
static int macb_alloc_consistent(struct macb *bp)
25472545
{
2546+
struct device *dev = &bp->pdev->dev;
2547+
dma_addr_t tx_dma, rx_dma;
25482548
struct macb_queue *queue;
25492549
unsigned int q;
2550-
u32 upper;
2551-
int size;
2550+
void *tx, *rx;
2551+
size_t size;
2552+
2553+
/*
2554+
* Upper 32-bits of Tx/Rx DMA descriptors for each queue must match!
2555+
* We cannot enforce this guarantee, the best we can do is do a single
2556+
* allocation and hope it will land into alloc_pages() that guarantees
2557+
* natural alignment of physical addresses.
2558+
*/
2559+
2560+
size = bp->num_queues * macb_tx_ring_size_per_queue(bp);
2561+
tx = dma_alloc_coherent(dev, size, &tx_dma, GFP_KERNEL);
2562+
if (!tx || upper_32_bits(tx_dma) != upper_32_bits(tx_dma + size - 1))
2563+
goto out_err;
2564+
netdev_dbg(bp->dev, "Allocated %zu bytes for %u TX rings at %08lx (mapped %p)\n",
2565+
size, bp->num_queues, (unsigned long)tx_dma, tx);
2566+
2567+
size = bp->num_queues * macb_rx_ring_size_per_queue(bp);
2568+
rx = dma_alloc_coherent(dev, size, &rx_dma, GFP_KERNEL);
2569+
if (!rx || upper_32_bits(rx_dma) != upper_32_bits(rx_dma + size - 1))
2570+
goto out_err;
2571+
netdev_dbg(bp->dev, "Allocated %zu bytes for %u RX rings at %08lx (mapped %p)\n",
2572+
size, bp->num_queues, (unsigned long)rx_dma, rx);
25522573

25532574
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
2554-
size = macb_tx_ring_size_per_queue(bp);
2555-
queue->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
2556-
&queue->tx_ring_dma,
2557-
GFP_KERNEL);
2558-
upper = upper_32_bits(queue->tx_ring_dma);
2559-
if (!queue->tx_ring ||
2560-
upper != upper_32_bits(bp->queues[0].tx_ring_dma))
2561-
goto out_err;
2562-
netdev_dbg(bp->dev,
2563-
"Allocated TX ring for queue %u of %d bytes at %08lx (mapped %p)\n",
2564-
q, size, (unsigned long)queue->tx_ring_dma,
2565-
queue->tx_ring);
2575+
queue->tx_ring = tx + macb_tx_ring_size_per_queue(bp) * q;
2576+
queue->tx_ring_dma = tx_dma + macb_tx_ring_size_per_queue(bp) * q;
2577+
2578+
queue->rx_ring = rx + macb_rx_ring_size_per_queue(bp) * q;
2579+
queue->rx_ring_dma = rx_dma + macb_rx_ring_size_per_queue(bp) * q;
25662580

25672581
size = bp->tx_ring_size * sizeof(struct macb_tx_skb);
25682582
queue->tx_skb = kmalloc(size, GFP_KERNEL);
25692583
if (!queue->tx_skb)
25702584
goto out_err;
2571-
2572-
size = macb_rx_ring_size_per_queue(bp);
2573-
queue->rx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
2574-
&queue->rx_ring_dma,
2575-
GFP_KERNEL);
2576-
upper = upper_32_bits(queue->rx_ring_dma);
2577-
if (!queue->rx_ring ||
2578-
upper != upper_32_bits(bp->queues[0].rx_ring_dma))
2579-
goto out_err;
2580-
netdev_dbg(bp->dev,
2581-
"Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
2582-
size, (unsigned long)queue->rx_ring_dma, queue->rx_ring);
25832585
}
25842586
if (bp->macbgem_ops.mog_alloc_rx_buffers(bp))
25852587
goto out_err;

0 commit comments

Comments
 (0)