Commit 8cad471
mm/hmm: provide generic DMA managing logic
HMM callers use a PFN list to populate the range while calling hmm_range_fault(); the conversion from PFN to DMA address is then done by the callers with the help of a second DMA list. This is wasteful on any modern platform, and with the right logic that second list can be avoided. Provide generic logic to manage these lists, and give an interface to map/unmap PFNs to DMA addresses without requiring the callers to be experts in the DMA core API.

Tested-by: Jens Axboe <[email protected]>
Reviewed-by: Jason Gunthorpe <[email protected]>
Signed-off-by: Leon Romanovsky <[email protected]>
1 parent 285e871 commit 8cad471

3 files changed (+251, -2 lines)
include/linux/hmm-dma.h

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (c) 2024 NVIDIA Corporation & Affiliates */
+#ifndef LINUX_HMM_DMA_H
+#define LINUX_HMM_DMA_H
+
+#include <linux/dma-mapping.h>
+
+struct dma_iova_state;
+struct pci_p2pdma_map_state;
+
+/*
+ * struct hmm_dma_map - array of PFNs and DMA addresses
+ *
+ * @state: DMA IOVA state
+ * @pfn_list: array of PFNs
+ * @dma_list: array of DMA addresses
+ * @dma_entry_size: size of each DMA entry in the array
+ */
+struct hmm_dma_map {
+        struct dma_iova_state state;
+        unsigned long *pfn_list;
+        dma_addr_t *dma_list;
+        size_t dma_entry_size;
+};
+
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+                      size_t nr_entries, size_t dma_entry_size);
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map);
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+                           size_t idx,
+                           struct pci_p2pdma_map_state *p2pdma_state);
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx);
+#endif /* LINUX_HMM_DMA_H */
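
To make the intended lifecycle concrete, below is a minimal caller sketch built only on the declarations above. demo_map_range() and its npages parameter are illustrative, not part of this commit; a real caller must also initialize the rest of the struct hmm_range (notifier, start, end, flags), request the whole range with HMM_PFN_REQ_FAULT so every entry comes back valid, and follow hmm_range_fault()'s locking and retry contract (mmap read lock held, retry on -EBUSY).

/* Sketch; assumes <linux/hmm.h> and <linux/hmm-dma.h>. */
static int demo_map_range(struct device *dev, struct hmm_range *range,
                          struct hmm_dma_map *map, size_t npages)
{
        struct pci_p2pdma_map_state p2pdma_state = {};
        size_t i;
        int ret;

        /* One PFN slot per page; DMA entries are page sized in this sketch. */
        ret = hmm_dma_map_alloc(dev, map, npages, PAGE_SIZE);
        if (ret)
                return ret;

        /* hmm_range_fault() fills the PFN list that the map owns. */
        range->hmm_pfns = map->pfn_list;
        ret = hmm_range_fault(range);
        if (ret)
                goto err_free;

        for (i = 0; i < npages; i++) {
                dma_addr_t dma = hmm_dma_map_pfn(dev, map, i, &p2pdma_state);

                if (dma == DMA_MAPPING_ERROR) {
                        ret = -EFAULT;
                        goto err_unmap;
                }
                /* program 'dma' into the device page tables here */
        }
        return 0;

err_unmap:
        while (i--)
                hmm_dma_unmap_pfn(dev, map, i);
err_free:
        hmm_dma_map_free(dev, map);
        return ret;
}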

include/linux/hmm.h

Lines changed: 5 additions & 1 deletion
@@ -23,6 +23,8 @@ struct mmu_interval_notifier;
  * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID)
  * HMM_PFN_ERROR - accessing the pfn is impossible and the device should
  *                 fail. ie poisoned memory, special pages, no vma, etc
+ * HMM_PFN_P2PDMA - P2P page
+ * HMM_PFN_P2PDMA_BUS - Bus mapped P2P transfer
  * HMM_PFN_DMA_MAPPED - Flag preserved on input-to-output transformation
  *                      to mark that page is already DMA mapped
  *
@@ -43,8 +45,10 @@ enum hmm_pfn_flags {
          * don't forget to update HMM_PFN_INOUT_FLAGS
          */
         HMM_PFN_DMA_MAPPED = 1UL << (BITS_PER_LONG - 4),
+        HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5),
+        HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6),
 
-        HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 9),
+        HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11),
 
         /* Input flags */
         HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
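
The two new output bits sit directly below HMM_PFN_DMA_MAPPED, which is why HMM_PFN_ORDER_SHIFT drops by two, from BITS_PER_LONG - 9 to BITS_PER_LONG - 11, to make room for them. A caller that wants to know whether an entry ended up P2P-mapped can test the bits directly; a small sketch (demo_entry_is_p2p() is illustrative, not from this commit):

static bool demo_entry_is_p2p(unsigned long hmm_pfn)
{
        /* Both bits are set only by hmm_dma_map_pfn() in mm/hmm.c. */
        return hmm_pfn & (HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
}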

mm/hmm.c

Lines changed: 213 additions & 1 deletion
@@ -10,6 +10,7 @@
  */
 #include <linux/pagewalk.h>
 #include <linux/hmm.h>
+#include <linux/hmm-dma.h>
 #include <linux/init.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
@@ -23,6 +24,7 @@
 #include <linux/sched/mm.h>
 #include <linux/jump_label.h>
 #include <linux/dma-mapping.h>
+#include <linux/pci-p2pdma.h>
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
@@ -41,7 +43,8 @@ enum {
 
 enum {
         /* These flags are carried from input-to-output */
-        HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED,
+        HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
+                              HMM_PFN_P2PDMA_BUS,
 };
 
 static int hmm_pfns_fill(unsigned long addr, unsigned long end,
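
Conceptually, the widened mask means that when hmm_range_fault() repopulates a range, each entry is rebuilt from fresh CPU state but these three bits are carried over from the previous value, so already-mapped pages keep their DMA and P2P state across re-faults. A conceptual sketch of the carry-over (HMM_PFN_INOUT_FLAGS is private to mm/hmm.c, so this mirrors the internal pattern rather than an exported API):

/* new entry = freshly computed flags | preserved bits of the old entry */
static unsigned long demo_carry_inout(unsigned long old_entry,
                                      unsigned long fresh_flags)
{
        return fresh_flags | (old_entry & HMM_PFN_INOUT_FLAGS);
}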
@@ -620,3 +623,212 @@ int hmm_range_fault(struct hmm_range *range)
         return ret;
 }
 EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_dma_map_alloc - Allocate HMM map structure
+ * @dev: device to allocate structure for
+ * @map: HMM map to allocate
+ * @nr_entries: number of entries in the map
+ * @dma_entry_size: size of the DMA entry in the map
+ *
+ * Allocate the HMM map structure and all the lists it contains.
+ * Return 0 on success, a negative error code on failure.
+ */
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+                      size_t nr_entries, size_t dma_entry_size)
+{
+        bool dma_need_sync = false;
+        bool use_iova;
+
+        if (!(nr_entries * PAGE_SIZE / dma_entry_size))
+                return -EINVAL;
+
+        /*
+         * The HMM API violates our normal DMA buffer ownership rules and
+         * can't transfer buffer ownership. The dma_addressing_limited()
+         * check is a best approximation to ensure no swiotlb buffering
+         * happens.
+         */
+#ifdef CONFIG_DMA_NEED_SYNC
+        dma_need_sync = !dev->dma_skip_sync;
+#endif /* CONFIG_DMA_NEED_SYNC */
+        if (dma_need_sync || dma_addressing_limited(dev))
+                return -EOPNOTSUPP;
+
+        map->dma_entry_size = dma_entry_size;
+        map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+                                 GFP_KERNEL | __GFP_NOWARN);
+        if (!map->pfn_list)
+                return -ENOMEM;
+
+        use_iova = dma_iova_try_alloc(dev, &map->state, 0,
+                                      nr_entries * PAGE_SIZE);
+        if (!use_iova && dma_need_unmap(dev)) {
+                map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
+                                         GFP_KERNEL | __GFP_NOWARN);
+                if (!map->dma_list)
+                        goto err_dma;
+        }
+        return 0;
+
+err_dma:
+        kvfree(map->pfn_list);
+        return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
+
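Two failure modes above deserve a note: -EINVAL when the requested range cannot hold even one DMA entry, and -EOPNOTSUPP when the device needs CPU cache syncing or is addressing-limited, since HMM cannot transfer buffer ownership the way the swiotlb path requires. As a sizing illustration, a hypothetical call using SZ_2M from <linux/sizes.h>:

static int demo_alloc_huge_entry(struct device *dev, struct hmm_dma_map *map)
{
        /* 512 4 KiB PFN slots backed by a single 2 MiB DMA entry;
         * fewer than 512 pages would make
         * nr_entries * PAGE_SIZE / dma_entry_size zero -> -EINVAL. */
        return hmm_dma_map_alloc(dev, map, 512, SZ_2M);
}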
+/**
+ * hmm_dma_map_free - Free HMM map structure
+ * @dev: device to free structure from
+ * @map: HMM map containing the various lists and state
+ *
+ * Free the HMM map structure and all the lists it contains.
+ */
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
+{
+        if (dma_use_iova(&map->state))
+                dma_iova_free(dev, &map->state);
+        kvfree(map->pfn_list);
+        kvfree(map->dma_list);
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_free);
+
+/**
+ * hmm_dma_map_pfn - Map a physical HMM page to DMA address
+ * @dev: Device to map the page for
+ * @map: HMM map
+ * @idx: Index into the PFN and DMA address arrays
+ * @p2pdma_state: PCI P2P state
+ *
+ * Map the page backing the PFN at index @idx and return its DMA address.
+ * When the map carries an IOVA allocated by dma_iova_try_alloc(), the page
+ * is linked into that IOVA space at offset @idx * @map->dma_entry_size, so
+ * consecutive indices produce consecutive DMA addresses; otherwise the page
+ * is mapped individually with dma_map_page().
+ *
+ * Returns the DMA address on success or DMA_MAPPING_ERROR on failure.
+ */
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+                           size_t idx,
+                           struct pci_p2pdma_map_state *p2pdma_state)
+{
+        struct dma_iova_state *state = &map->state;
+        dma_addr_t *dma_addrs = map->dma_list;
+        unsigned long *pfns = map->pfn_list;
+        struct page *page = hmm_pfn_to_page(pfns[idx]);
+        phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
+        size_t offset = idx * map->dma_entry_size;
+        unsigned long attrs = 0;
+        dma_addr_t dma_addr;
+        int ret;
+
+        if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
+            !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
+                /*
+                 * We are in this flow when there is a need to resync flags,
+                 * for example when the page was already linked in a prefetch
+                 * call with the READ flag and now we need to add the WRITE
+                 * flag.
+                 *
+                 * This page was already programmed to HW and we don't
+                 * want/need to unlink and link it again just to resync flags.
+                 */
+                if (dma_use_iova(state))
+                        return state->addr + offset;
+
+                /*
+                 * Without dma_need_unmap, the dma_addrs array is NULL, thus we
+                 * need to regenerate the address below even if there already
+                 * was a mapping. But !dma_need_unmap implies that the
+                 * mapping is stateless, so this is fine.
+                 */
+                if (dma_need_unmap(dev))
+                        return dma_addrs[idx];
+
+                /* Continue to remapping */
+        }
+
+        switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
+        case PCI_P2PDMA_MAP_NONE:
+                break;
+        case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+                attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+                pfns[idx] |= HMM_PFN_P2PDMA;
+                break;
+        case PCI_P2PDMA_MAP_BUS_ADDR:
+                pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
+                return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+        default:
+                return DMA_MAPPING_ERROR;
+        }
+
+        if (dma_use_iova(state)) {
+                ret = dma_iova_link(dev, state, paddr, offset,
+                                    map->dma_entry_size, DMA_BIDIRECTIONAL,
+                                    attrs);
+                if (ret)
+                        goto error;
+
+                ret = dma_iova_sync(dev, state, offset, map->dma_entry_size);
+                if (ret) {
+                        dma_iova_unlink(dev, state, offset,
+                                        map->dma_entry_size,
+                                        DMA_BIDIRECTIONAL, attrs);
+                        goto error;
+                }
+
+                dma_addr = state->addr + offset;
+        } else {
+                if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
+                        goto error;
+
+                dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size,
+                                        DMA_BIDIRECTIONAL);
+                if (dma_mapping_error(dev, dma_addr))
+                        goto error;
+
+                if (dma_need_unmap(dev))
+                        dma_addrs[idx] = dma_addr;
+        }
+        pfns[idx] |= HMM_PFN_DMA_MAPPED;
+        return dma_addr;
+error:
+        pfns[idx] &= ~HMM_PFN_P2PDMA;
+        return DMA_MAPPING_ERROR;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
+
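One consequence of the resync branch above: calling hmm_dma_map_pfn() again on an index that is already HMM_PFN_DMA_MAPPED (and not a bus-address P2P mapping) returns the existing address rather than unlinking and relinking. A sketch of that invariant, with demo_remap_is_stable() purely illustrative:

static void demo_remap_is_stable(struct device *dev, struct hmm_dma_map *map,
                                 size_t idx,
                                 struct pci_p2pdma_map_state *p2p)
{
        dma_addr_t first, again;

        /* First call establishes the mapping and sets HMM_PFN_DMA_MAPPED. */
        first = hmm_dma_map_pfn(dev, map, idx, p2p);
        /* A later call, e.g. after re-faulting to add write permission,
         * takes the resync path and yields the same address. */
        again = hmm_dma_map_pfn(dev, map, idx, p2p);
        WARN_ON_ONCE(first != DMA_MAPPING_ERROR && first != again);
}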
+/**
+ * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
+ * @dev: Device to unmap the page from
+ * @map: HMM map
+ * @idx: Index of the PFN to unmap
+ *
+ * Returns true if the PFN was mapped and has been unmapped, false otherwise.
+ */
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
+{
+        const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED;
+        struct dma_iova_state *state = &map->state;
+        dma_addr_t *dma_addrs = map->dma_list;
+        unsigned long *pfns = map->pfn_list;
+        unsigned long attrs = 0;
+
+        if ((pfns[idx] & valid_dma) != valid_dma)
+                return false;
+
+        if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
+                ; /* no need to unmap bus address P2P mappings */
+        else if (dma_use_iova(state)) {
+                if (pfns[idx] & HMM_PFN_P2PDMA)
+                        attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+                dma_iova_unlink(dev, state, idx * map->dma_entry_size,
+                                map->dma_entry_size, DMA_BIDIRECTIONAL, attrs);
+        } else if (dma_need_unmap(dev))
+                dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size,
+                               DMA_BIDIRECTIONAL);
+
+        pfns[idx] &=
+                ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
+        return true;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
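
A matching teardown sketch for the functions above (demo_unmap_all() is illustrative): unmap every index, then release the map. Entries that were never mapped simply return false, so no bookkeeping beyond the PFN flags is needed.

static void demo_unmap_all(struct device *dev, struct hmm_dma_map *map,
                           size_t npages)
{
        size_t i;

        for (i = 0; i < npages; i++)
                hmm_dma_unmap_pfn(dev, map, i);
        hmm_dma_map_free(dev, map);
}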
