
Commit f406c8e

alobakin authored and Christoph Hellwig committed
dma: avoid redundant calls for sync operations
Quite often, devices do not need dma_sync operations, on x86_64 at least. Indeed, when dev_is_dma_coherent(dev) is true and dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu() and friends do nothing.

However, indirectly calling them when CONFIG_RETPOLINE=y consumes about 10% of cycles on a CPU receiving packets from softirq at ~100 Gbit rate. Even when CONFIG_RETPOLINE is not set, there is a cost of about 3%.

Add a dev->dma_need_sync boolean and turn it off during device initialization (dma_set_mask()) depending on the setup: dev_is_dma_coherent() for direct DMA, !(sync_single_for_device || sync_single_for_cpu), or the new dma_map_ops flag %DMA_F_CAN_SKIP_SYNC advertised for non-NULL DMA ops. Then later, if/when swiotlb is used for the first time, the flag is reset back to on, from swiotlb_tbl_map_single().

On iavf, the UDP trafficgen test with XDP_DROP in skb mode shows a +3-5% improvement for direct DMA.

Suggested-by: Christoph Hellwig <[email protected]> # direct DMA shortcut
Co-developed-by: Eric Dumazet <[email protected]>
Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: Alexander Lobakin <[email protected]>
Signed-off-by: Christoph Hellwig <[email protected]>
1 parent fe7514b commit f406c8e
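For the hot path, the effect is that dma_sync_single_for_cpu() and friends become static inline wrappers that test the new per-device flag before taking the out-of-line (and possibly retpolined indirect) path. A rough, hypothetical RX fragment illustrating the caller side (not part of this commit; example_rx_poll() is a made-up name):

#include <linux/dma-mapping.h>

/*
 * Illustration only: with this commit, both sync calls below reduce to an
 * inline dma_dev_need_sync(dev) check and skip __dma_sync_single_for_*()
 * entirely when the device is coherent and no SWIOTLB bouncing occurred.
 */
static void example_rx_poll(struct device *dev, dma_addr_t buf_dma,
			    unsigned int len)
{
	/* Make the freshly received data visible to the CPU ... */
	dma_sync_single_for_cpu(dev, buf_dma, len, DMA_FROM_DEVICE);

	/* ... process the packet here ... */

	/* ... then hand the buffer back to the device. */
	dma_sync_single_for_device(dev, buf_dma, len, DMA_FROM_DEVICE);
}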

5 files changed, 113 insertions(+), 17 deletions(-)


include/linux/device.h

Lines changed: 4 additions & 0 deletions
@@ -691,6 +691,7 @@ struct device_physical_location {
  *		and optionall (if the coherent mask is large enough) also
  *		for dma allocations. This flag is managed by the dma ops
  *		instance from ->dma_supported.
+ * @dma_need_sync: The device needs performing DMA sync operations.
  *
  * At the lowest level, every device in a Linux system is represented by an
  * instance of struct device. The device structure contains the information
@@ -803,6 +804,9 @@ struct device {
 #ifdef CONFIG_DMA_OPS_BYPASS
 	bool			dma_ops_bypass : 1;
 #endif
+#ifdef CONFIG_DMA_NEED_SYNC
+	bool			dma_need_sync:1;
+#endif
 };

 /**

include/linux/dma-map-ops.h

Lines changed: 12 additions & 0 deletions
@@ -18,8 +18,11 @@ struct iommu_ops;
  *
  * DMA_F_PCI_P2PDMA_SUPPORTED: Indicates the dma_map_ops implementation can
  * handle PCI P2PDMA pages in the map_sg/unmap_sg operation.
+ * DMA_F_CAN_SKIP_SYNC: DMA sync operations can be skipped if the device is
+ * coherent and it's not an SWIOTLB buffer.
  */
 #define DMA_F_PCI_P2PDMA_SUPPORTED	(1 << 0)
+#define DMA_F_CAN_SKIP_SYNC		(1 << 1)

 struct dma_map_ops {
 	unsigned int flags;
@@ -273,6 +276,15 @@ static inline bool dev_is_dma_coherent(struct device *dev)
 }
 #endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */

+static inline void dma_reset_need_sync(struct device *dev)
+{
+#ifdef CONFIG_DMA_NEED_SYNC
+	/* Reset it only once so that the function can be called on hotpath */
+	if (unlikely(!dev->dma_need_sync))
+		dev->dma_need_sync = true;
+#endif
+}
+
 /*
  * Check whether potential kmalloc() buffers are safe for non-coherent DMA.
  */
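An implementation with non-trivial sync callbacks that are nevertheless no-ops for coherent devices could opt into the shortcut by advertising the new flag. A hedged sketch (the example_* callbacks are placeholders, not part of this commit):

/*
 * Hypothetical dma_map_ops advertising DMA_F_CAN_SKIP_SYNC: with the flag
 * set, dma_setup_need_sync() clears dev->dma_need_sync for DMA-coherent
 * devices even though sync callbacks exist, and SWIOTLB mappings flip it
 * back on via dma_reset_need_sync().
 */
static const struct dma_map_ops example_dma_ops = {
	.flags			 = DMA_F_CAN_SKIP_SYNC,
	.map_page		 = example_map_page,
	.unmap_page		 = example_unmap_page,
	.sync_single_for_cpu	 = example_sync_single_for_cpu,
	.sync_single_for_device = example_sync_single_for_device,
};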

include/linux/dma-mapping.h

Lines changed: 48 additions & 5 deletions
@@ -282,16 +282,59 @@ static inline int dma_mmap_noncontiguous(struct device *dev,
 #endif /* CONFIG_HAS_DMA */

 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
-void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 		enum dma_data_direction dir);
-void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir);
-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		int nelems, enum dma_data_direction dir);
-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		int nelems, enum dma_data_direction dir);
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr);
+
+static inline bool dma_dev_need_sync(const struct device *dev)
+{
+	/* Always call DMA sync operations when debugging is enabled */
+	return dev->dma_need_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG);
+}
+
+static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir)
+{
+	if (dma_dev_need_sync(dev))
+		__dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static inline void dma_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+	if (dma_dev_need_sync(dev))
+		__dma_sync_single_for_device(dev, addr, size, dir);
+}
+
+static inline void dma_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
+{
+	if (dma_dev_need_sync(dev))
+		__dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+}
+
+static inline void dma_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
+{
+	if (dma_dev_need_sync(dev))
+		__dma_sync_sg_for_device(dev, sg, nelems, dir);
+}
+
+static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_dev_need_sync(dev) ? __dma_need_sync(dev, dma_addr) : false;
+}
 #else /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */
+static inline bool dma_dev_need_sync(const struct device *dev)
+{
+	return false;
+}
 static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir)
 {
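Callers that recycle mappings can still use dma_need_sync() to cache a per-address answer; after this change it first performs the cheap per-device check and only then falls back to __dma_need_sync(). A minimal usage sketch (hypothetical helper, not from this commit):

#include <linux/dma-mapping.h>
#include <linux/mm.h>

/* Map one RX page and remember whether its syncs can be skipped later. */
static int example_map_rx_page(struct device *dev, struct page *page,
			       dma_addr_t *dma, bool *needs_sync)
{
	*dma = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
	if (dma_mapping_error(dev, *dma))
		return -ENOMEM;

	/* false: later dma_sync_*() calls for this address may be elided */
	*needs_sync = dma_need_sync(dev, *dma);
	return 0;
}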

kernel/dma/mapping.c

Lines changed: 43 additions & 12 deletions
@@ -330,7 +330,7 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
 EXPORT_SYMBOL(dma_unmap_resource);

 #ifdef CONFIG_DMA_NEED_SYNC
-void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 		enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -342,9 +342,9 @@ void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 		ops->sync_single_for_cpu(dev, addr, size, dir);
 	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
 }
-EXPORT_SYMBOL(dma_sync_single_for_cpu);
+EXPORT_SYMBOL(__dma_sync_single_for_cpu);

-void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -356,9 +356,9 @@ void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
 		ops->sync_single_for_device(dev, addr, size, dir);
 	debug_dma_sync_single_for_device(dev, addr, size, dir);
 }
-EXPORT_SYMBOL(dma_sync_single_for_device);
+EXPORT_SYMBOL(__dma_sync_single_for_device);

-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		int nelems, enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -370,9 +370,9 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
 }
-EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+EXPORT_SYMBOL(__dma_sync_sg_for_cpu);

-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		int nelems, enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -384,18 +384,47 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		ops->sync_sg_for_device(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
 }
-EXPORT_SYMBOL(dma_sync_sg_for_device);
+EXPORT_SYMBOL(__dma_sync_sg_for_device);

-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);

 	if (dma_map_direct(dev, ops))
+		/*
+		 * dma_need_sync could've been reset on first SWIOTLB buffer
+		 * mapping, but @dma_addr is not necessary an SWIOTLB buffer.
+		 * In this case, fall back to more granular check.
+		 */
 		return dma_direct_need_sync(dev, dma_addr);
-	return ops->sync_single_for_cpu || ops->sync_single_for_device;
+	return true;
 }
-EXPORT_SYMBOL_GPL(dma_need_sync);
-#endif /* CONFIG_DMA_NEED_SYNC */
+EXPORT_SYMBOL_GPL(__dma_need_sync);
+
+static void dma_setup_need_sync(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (dma_map_direct(dev, ops) || (ops->flags & DMA_F_CAN_SKIP_SYNC))
+		/*
+		 * dma_need_sync will be reset to %true on first SWIOTLB buffer
+		 * mapping, if any. During the device initialization, it's
+		 * enough to check only for the DMA coherence.
+		 */
+		dev->dma_need_sync = !dev_is_dma_coherent(dev);
+	else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu &&
+		 !ops->sync_sg_for_device && !ops->sync_sg_for_cpu)
+		/*
+		 * Synchronization is not possible when none of DMA sync ops
+		 * is set.
+		 */
+		dev->dma_need_sync = false;
+	else
+		dev->dma_need_sync = true;
+}
+#else /* !CONFIG_DMA_NEED_SYNC */
+static inline void dma_setup_need_sync(struct device *dev) { }
+#endif /* !CONFIG_DMA_NEED_SYNC */

 /*
  * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
@@ -785,6 +814,8 @@ int dma_set_mask(struct device *dev, u64 mask)

 	arch_dma_set_mask(dev, mask);
 	*dev->dma_mask = mask;
+	dma_setup_need_sync(dev);
+
 	return 0;
 }
 EXPORT_SYMBOL(dma_set_mask);
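No driver changes are required to benefit: the flag is initialized from the normal mask setup in probe, since dma_set_mask() now calls dma_setup_need_sync(). A hedged probe fragment (hypothetical example_probe(), assuming a PCI device):

#include <linux/dma-mapping.h>
#include <linux/pci.h>

static int example_probe(struct pci_dev *pdev)
{
	int ret;

	/* dma_set_mask_and_coherent() -> dma_set_mask() -> dma_setup_need_sync() */
	ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
	if (ret)
		return ret;

	/* dev->dma_need_sync now reflects coherence/ops for the fast-path checks. */
	return 0;
}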

kernel/dma/swiotlb.c

Lines changed: 6 additions & 0 deletions
@@ -1408,6 +1408,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		return (phys_addr_t)DMA_MAPPING_ERROR;
 	}

+	/*
+	 * If dma_need_sync wasn't set, reset it on first SWIOTLB buffer
+	 * mapping to always sync SWIOTLB buffers.
+	 */
+	dma_reset_need_sync(dev);
+
 	/*
 	 * Save away the mapping from the original address to the DMA address.
 	 * This is needed when we sync the memory. Then we sync the buffer if
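The net behavior over a device's lifetime, sketched as a hypothetical mapping helper (illustration only): a coherent direct-DMA device starts with dma_need_sync off, and the first mapping that bounces through SWIOTLB turns it back on for good, so bounce buffers are always synced.

#include <linux/dma-mapping.h>

static dma_addr_t example_map_tx(struct device *dev, void *buf, size_t len)
{
	/*
	 * If this mapping lands in a SWIOTLB bounce buffer,
	 * swiotlb_tbl_map_single() calls dma_reset_need_sync(dev) and all
	 * later dma_sync_*() calls on this device take the real sync path.
	 */
	return dma_map_single(dev, buf, len, DMA_TO_DEVICE);
}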
