Commit a72ae6f

mm: Provide address mask in struct follow_pfnmap_args
jira LE-3557
Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6
commit-author Alex Williamson <[email protected]>
commit 62fb8ad
Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in
final tarball splat. Ref for failed cherry-pick at:
ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/62fb8adc.failed

follow_pfnmap_start() walks the page table for a given address and fills out
the struct follow_pfnmap_args in pfnmap_args_setup(). The address mask of the
page table level is already provided to this latter function for calculating
the pfn. This address mask can also be useful for the caller to determine the
extent of the contiguous mapping.

For example, vfio-pci now supports huge_fault for pfnmaps and is able to insert
pud and pmd mappings. When we DMA map these pfnmaps, ex. PCI MMIO BARs, we
iterate follow_pfnmap_start() to get each pfn to test for a contiguous pfn
range. Providing the mapping address mask allows us to skip the extent of the
mapping level. Assuming a 1GB pud level and 4KB page size, iterations are
reduced by a factor of 256K. In wall clock time, mapping a 32GB PCI BAR is
reduced from ~1s to <1ms.

Cc: Andrew Morton <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: [email protected]
Reviewed-by: Peter Xu <[email protected]>
Reviewed-by: Mitchell Augustin <[email protected]>
Tested-by: Mitchell Augustin <[email protected]>
Reviewed-by: Jason Gunthorpe <[email protected]>
Acked-by: David Hildenbrand <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alex Williamson <[email protected]>
(cherry picked from commit 62fb8ad)
Signed-off-by: Jonathan Maple <[email protected]>

# Conflicts:
#	include/linux/mm.h
#	mm/memory.c
1 parent 6987ff5 commit a72ae6f
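The description above is easier to follow with a caller sketch. The loop below is a minimal, hypothetical example, not code from this commit or from vfio-pci: it walks a VM_IO/VM_PFNMAP range and uses the new args.addr_mask output to jump to the end of each PTE/PMD/PUD mapping instead of advancing page by page. follow_pfnmap_start(), follow_pfnmap_end() and the struct fields are taken from the diff below; the helper name and error handling are assumptions, and the caller is assumed to hold the mmap lock for read as the API requires.

/*
 * Hypothetical sketch only: visit each mapping level of a VM_IO/VM_PFNMAP
 * range once, using args.addr_mask to skip the extent of the mapping.
 * Assumes the mmap lock (or i_mmap_rwsem for file mappings) is held.
 */
static int walk_pfnmap_range(struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct follow_pfnmap_args args = { .vma = vma };

	while (addr < end) {
		unsigned long next;
		int ret;

		args.address = addr;
		ret = follow_pfnmap_start(&args);
		if (ret)
			return ret;

		/* args.pfn, args.pgprot, args.writable are valid here. */

		/* End of the current PTE/PMD/PUD mapping, e.g. a 1GB pud. */
		next = (addr | ~args.addr_mask) + 1;
		follow_pfnmap_end(&args);

		if (!next)	/* wrapped at the top of the address space */
			break;
		addr = next;
	}
	return 0;
}

With 1GB pud mappings and 4KB pages the stride grows from one page to one pud, which is where the 256K reduction in iterations quoted above comes from.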

File tree

ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/62fb8adc.failed

1 file changed: +268 -0 lines
Lines changed: 268 additions & 0 deletions
@@ -0,0 +1,268 @@
mm: Provide address mask in struct follow_pfnmap_args

jira LE-3557
Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6
commit-author Alex Williamson <[email protected]>
commit 62fb8adc43afad5fa1c9cadc6f3a8e9fb72af194
Empty-Commit: Cherry-Pick Conflicts during history rebuild.
Will be included in final tarball splat. Ref for failed cherry-pick at:
ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/62fb8adc.failed

follow_pfnmap_start() walks the page table for a given address and
fills out the struct follow_pfnmap_args in pfnmap_args_setup().
The address mask of the page table level is already provided to this
latter function for calculating the pfn. This address mask can also
be useful for the caller to determine the extent of the contiguous
mapping.

For example, vfio-pci now supports huge_fault for pfnmaps and is able
to insert pud and pmd mappings. When we DMA map these pfnmaps, ex.
PCI MMIO BARs, we iterate follow_pfnmap_start() to get each pfn to test
for a contiguous pfn range. Providing the mapping address mask allows
us to skip the extent of the mapping level. Assuming a 1GB pud level
and 4KB page size, iterations are reduced by a factor of 256K. In wall
clock time, mapping a 32GB PCI BAR is reduced from ~1s to <1ms.

Cc: Andrew Morton <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: [email protected]
Reviewed-by: Peter Xu <[email protected]>
Reviewed-by: Mitchell Augustin <[email protected]>
Tested-by: Mitchell Augustin <[email protected]>
Reviewed-by: Jason Gunthorpe <[email protected]>
Acked-by: David Hildenbrand <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alex Williamson <[email protected]>
(cherry picked from commit 62fb8adc43afad5fa1c9cadc6f3a8e9fb72af194)
Signed-off-by: Jonathan Maple <[email protected]>

# Conflicts:
#	include/linux/mm.h
#	mm/memory.c
diff --cc include/linux/mm.h
index 196c481ec160,92b30dba7e38..000000000000
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -2436,6 -2398,39 +2436,42 @@@ int follow_phys(struct vm_area_struct *
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write);

++<<<<<<< HEAD
++=======
+ struct follow_pfnmap_args {
+         /**
+          * Inputs:
+          * @vma: Pointer to @vm_area_struct struct
+          * @address: the virtual address to walk
+          */
+         struct vm_area_struct *vma;
+         unsigned long address;
+         /**
+          * Internals:
+          *
+          * The caller shouldn't touch any of these.
+          */
+         spinlock_t *lock;
+         pte_t *ptep;
+         /**
+          * Outputs:
+          *
+          * @pfn: the PFN of the address
+          * @addr_mask: address mask covering pfn
+          * @pgprot: the pgprot_t of the mapping
+          * @writable: whether the mapping is writable
+          * @special: whether the mapping is a special mapping (real PFN maps)
+          */
+         unsigned long pfn;
+         unsigned long addr_mask;
+         pgprot_t pgprot;
+         bool writable;
+         bool special;
+ };
+ int follow_pfnmap_start(struct follow_pfnmap_args *args);
+ void follow_pfnmap_end(struct follow_pfnmap_args *args);
+
++>>>>>>> 62fb8adc43af (mm: Provide address mask in struct follow_pfnmap_args)
extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
diff --cc mm/memory.c
index e2794e3b8919,68aa0f11633e..000000000000
--- a/mm/memory.c
+++ b/mm/memory.c
@@@ -5607,60 -6479,137 +5607,92 @@@ int __pmd_alloc(struct mm_struct *mm, p
}
#endif /* __PAGETABLE_PMD_FOLDED */

++<<<<<<< HEAD
++=======
+ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
+                                      spinlock_t *lock, pte_t *ptep,
+                                      pgprot_t pgprot, unsigned long pfn_base,
+                                      unsigned long addr_mask, bool writable,
+                                      bool special)
+ {
+         args->lock = lock;
+         args->ptep = ptep;
+         args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
+         args->addr_mask = addr_mask;
+         args->pgprot = pgprot;
+         args->writable = writable;
+         args->special = special;
+ }
+
+ static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
+ {
+ #ifdef CONFIG_LOCKDEP
+         struct file *file = vma->vm_file;
+         struct address_space *mapping = file ? file->f_mapping : NULL;
+
+         if (mapping)
+                 lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) ||
+                                lockdep_is_held(&vma->vm_mm->mmap_lock));
+         else
+                 lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
+ #endif
+ }
+
++>>>>>>> 62fb8adc43af (mm: Provide address mask in struct follow_pfnmap_args)
/**
- * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
- * @args: Pointer to struct @follow_pfnmap_args
- *
- * The caller needs to setup args->vma and args->address to point to the
- * virtual address as the target of such lookup. On a successful return,
- * the results will be put into other output fields.
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
 *
- * After the caller finished using the fields, the caller must invoke
- * another follow_pfnmap_end() to proper releases the locks and resources
- * of such look up request.
- *
- * During the start() and end() calls, the results in @args will be valid
- * as proper locks will be held. After the end() is called, all the fields
- * in @follow_pfnmap_args will be invalid to be further accessed. Further
- * use of such information after end() may require proper synchronizations
- * by the caller with page table updates, otherwise it can create a
- * security bug.
- *
- * If the PTE maps a refcounted page, callers are responsible to protect
- * against invalidation with MMU notifiers; otherwise access to the PFN at
- * a later point in time can trigger use-after-free.
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
 *
 * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
- * should be taken for read, and the mmap semaphore cannot be released
- * before the end() is invoked.
+ * should be taken for read.
 *
- * This function must not be used to modify PTE content.
+ * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
 *
- * Return: zero on success, negative otherwise.
+ * Return: zero on success, -ve otherwise.
 */
-int follow_pfnmap_start(struct follow_pfnmap_args *args)
+int follow_pte(struct mm_struct *mm, unsigned long address,
+               pte_t **ptepp, spinlock_t **ptlp)
{
-         struct vm_area_struct *vma = args->vma;
-         unsigned long address = args->address;
-         struct mm_struct *mm = vma->vm_mm;
-         spinlock_t *lock;
-         pgd_t *pgdp;
-         p4d_t *p4dp, p4d;
-         pud_t *pudp, pud;
-         pmd_t *pmdp, pmd;
-         pte_t *ptep, pte;
-
-         pfnmap_lockdep_assert(vma);
-
-         if (unlikely(address < vma->vm_start || address >= vma->vm_end))
-                 goto out;
+         pgd_t *pgd;
+         p4d_t *p4d;
+         pud_t *pud;
+         pmd_t *pmd;
+         pte_t *ptep;

-         if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-                 goto out;
-retry:
-         pgdp = pgd_offset(mm, address);
-         if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
+         pgd = pgd_offset(mm, address);
+         if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                goto out;

-         p4dp = p4d_offset(pgdp, address);
-         p4d = READ_ONCE(*p4dp);
-         if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
+         p4d = p4d_offset(pgd, address);
+         if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
                goto out;

-         pudp = pud_offset(p4dp, address);
-         pud = READ_ONCE(*pudp);
-         if (pud_none(pud))
+         pud = pud_offset(p4d, address);
+         if (pud_none(*pud) || unlikely(pud_bad(*pud)))
                goto out;
-         if (pud_leaf(pud)) {
-                 lock = pud_lock(mm, pudp);
-                 if (!unlikely(pud_leaf(pud))) {
-                         spin_unlock(lock);
-                         goto retry;
-                 }
-                 pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
-                                   pud_pfn(pud), PUD_MASK, pud_write(pud),
-                                   pud_special(pud));
-                 return 0;
-         }

-         pmdp = pmd_offset(pudp, address);
-         pmd = pmdp_get_lockless(pmdp);
-         if (pmd_leaf(pmd)) {
-                 lock = pmd_lock(mm, pmdp);
-                 if (!unlikely(pmd_leaf(pmd))) {
-                         spin_unlock(lock);
-                         goto retry;
-                 }
-                 pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
-                                   pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
-                                   pmd_special(pmd));
-                 return 0;
-         }
+         pmd = pmd_offset(pud, address);
+         VM_BUG_ON(pmd_trans_huge(*pmd));

-         ptep = pte_offset_map_lock(mm, pmdp, address, &lock);
+         ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
        if (!ptep)
                goto out;
-         pte = ptep_get(ptep);
-         if (!pte_present(pte))
+         if (!pte_present(ptep_get(ptep)))
                goto unlock;
-         pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
-                           pte_pfn(pte), PAGE_MASK, pte_write(pte),
-                           pte_special(pte));
+         *ptepp = ptep;
        return 0;
unlock:
-         pte_unmap_unlock(ptep, lock);
+         pte_unmap_unlock(ptep, *ptlp);
out:
        return -EINVAL;
}
* Unmerged path include/linux/mm.h
* Unmerged path mm/memory.c
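For completeness, the pfn published by pfnmap_args_setup() in the conflicted hunk above is pfn_base plus the page offset of the address below the mapping level, and addr_mask is what lets a caller turn the level size into a stride. The standalone program below only illustrates that arithmetic; the constants and the example address are assumed values for a 4KB-page, 1GB-pud configuration, not kernel code.

#include <stdio.h>

/* Illustrative constants for a 4KB page, 1GB pud configuration. */
#define PAGE_SHIFT	12UL
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PUD_SHIFT	30UL
#define PUD_SIZE	(1UL << PUD_SHIFT)
#define PUD_MASK	(~(PUD_SIZE - 1))

int main(void)
{
	unsigned long address = 0x7f0040123000UL;  /* arbitrary example address */
	unsigned long pud_pfn = 0x100000UL;        /* hypothetical BAR base pfn */

	/* Same expression as pfnmap_args_setup(): offset within the level. */
	unsigned long pfn = pud_pfn + ((address & ~PUD_MASK) >> PAGE_SHIFT);

	/* Page-granular steps per 1GB mapping vs. one step per addr_mask. */
	unsigned long per_page = PUD_SIZE / PAGE_SIZE;        /* 262144 ~= 256K */
	unsigned long per_32g_bar = (32UL << 30) / PUD_SIZE;  /* 32 lookups */

	printf("pfn=0x%lx, page steps per pud=%lu, pud steps for a 32GB BAR=%lu\n",
	       pfn, per_page, per_32g_bar);
	return 0;
}

Dividing a 32GB BAR into 1GB strides gives 32 lookups instead of roughly 8.4 million page-sized ones, consistent with the ~1s to <1ms improvement quoted in the commit message.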
