mm: Provide address mask in struct follow_pfnmap_args

jira LE-3557
Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6
commit-author Alex Williamson <[email protected]>
commit 62fb8adc43afad5fa1c9cadc6f3a8e9fb72af194
Empty-Commit: Cherry-Pick Conflicts during history rebuild.
Will be included in final tarball splat. Ref for failed cherry-pick at:
ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/62fb8adc.failed

follow_pfnmap_start() walks the page table for a given address and
fills out the struct follow_pfnmap_args in pfnmap_args_setup().
The address mask of the page table level is already provided to this
latter function for calculating the pfn.  This address mask can also
be useful for the caller to determine the extent of the contiguous
mapping.

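As a hedged illustration of the calling convention described above (this
snippet is not part of the patch; vma and addr are assumed to be set up
by a caller holding the mmap lock for read):

	struct follow_pfnmap_args args = { .vma = vma, .address = addr };
	unsigned long pfn;

	if (follow_pfnmap_start(&args))
		return -EINVAL;	/* no IO/PFN mapping at this address */
	pfn = args.pfn;		/* output fields are valid only until end() */
	follow_pfnmap_end(&args);
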
For example, vfio-pci now supports huge_fault for pfnmaps and is able
to insert pud and pmd mappings.  When we DMA map these pfnmaps, ex.
PCI MMIO BARs, we iterate follow_pfnmap_start() to get each pfn to test
for a contiguous pfn range.  Providing the mapping address mask allows
us to skip the extent of the mapping level.  Assuming a 1GB pud level
and 4KB page size, iterations are reduced by a factor of 256K.  In wall
clock time, mapping a 32GB PCI BAR is reduced from ~1s to <1ms.

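A minimal sketch of that contiguity walk, assuming hypothetical start/end
bounds and eliding the locking and DMA-mapping details of the real
vfio-pci path:

	unsigned long addr = start, next, pfn, expected = 0;

	while (addr < end) {
		struct follow_pfnmap_args args = { .vma = vma, .address = addr };

		if (follow_pfnmap_start(&args))
			return -EFAULT;
		pfn = args.pfn;
		/* first address past the pte/pmd/pud extent backing addr */
		next = (addr | ~args.addr_mask) + 1;
		follow_pfnmap_end(&args);

		if (addr != start && pfn != expected)
			break;	/* pfn range is not contiguous */
		expected = pfn + ((next - addr) >> PAGE_SHIFT);
		addr = next;
	}

With 4KB pages, one pud-backed iteration covers 1GB (addr_mask ==
PUD_MASK), which is where the 256K reduction in iteration count comes
from.
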
	Cc: Andrew Morton <[email protected]>
	Cc: David Hildenbrand <[email protected]>

	Reviewed-by: Peter Xu <[email protected]>
	Reviewed-by: Mitchell Augustin <[email protected]>
	Tested-by: Mitchell Augustin <[email protected]>
	Reviewed-by: Jason Gunthorpe <[email protected]>
	Acked-by: David Hildenbrand <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
	Signed-off-by: Alex Williamson <[email protected]>
(cherry picked from commit 62fb8adc43afad5fa1c9cadc6f3a8e9fb72af194)
	Signed-off-by: Jonathan Maple <[email protected]>

# Conflicts:
#	include/linux/mm.h
#	mm/memory.c
diff --cc include/linux/mm.h
index 196c481ec160,92b30dba7e38..000000000000
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -2436,6 -2398,39 +2436,42 @@@ int follow_phys(struct vm_area_struct *
  int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
  			void *buf, int len, int write);
  
++<<<<<<< HEAD
++=======
+ struct follow_pfnmap_args {
+ 	/**
+ 	 * Inputs:
+ 	 * @vma: Pointer to @vm_area_struct struct
+ 	 * @address: the virtual address to walk
+ 	 */
+ 	struct vm_area_struct *vma;
+ 	unsigned long address;
+ 	/**
+ 	 * Internals:
+ 	 *
+ 	 * The caller shouldn't touch any of these.
+ 	 */
+ 	spinlock_t *lock;
+ 	pte_t *ptep;
+ 	/**
+ 	 * Outputs:
+ 	 *
+ 	 * @pfn: the PFN of the address
+ 	 * @addr_mask: address mask covering pfn
+ 	 * @pgprot: the pgprot_t of the mapping
+ 	 * @writable: whether the mapping is writable
+ 	 * @special: whether the mapping is a special mapping (real PFN maps)
+ 	 */
+ 	unsigned long pfn;
+ 	unsigned long addr_mask;
+ 	pgprot_t pgprot;
+ 	bool writable;
+ 	bool special;
+ };
+ int follow_pfnmap_start(struct follow_pfnmap_args *args);
+ void follow_pfnmap_end(struct follow_pfnmap_args *args);
+ 
++>>>>>>> 62fb8adc43af (mm: Provide address mask in struct follow_pfnmap_args)
  extern void truncate_pagecache(struct inode *inode, loff_t new);
  extern void truncate_setsize(struct inode *inode, loff_t newsize);
  void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
diff --cc mm/memory.c
index e2794e3b8919,68aa0f11633e..000000000000
--- a/mm/memory.c
+++ b/mm/memory.c
@@@ -5607,60 -6479,137 +5607,92 @@@ int __pmd_alloc(struct mm_struct *mm, p
  }
  #endif /* __PAGETABLE_PMD_FOLDED */
  
++<<<<<<< HEAD
++=======
+ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
+ 				     spinlock_t *lock, pte_t *ptep,
+ 				     pgprot_t pgprot, unsigned long pfn_base,
+ 				     unsigned long addr_mask, bool writable,
+ 				     bool special)
+ {
+ 	args->lock = lock;
+ 	args->ptep = ptep;
+ 	args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
+ 	args->addr_mask = addr_mask;
+ 	args->pgprot = pgprot;
+ 	args->writable = writable;
+ 	args->special = special;
+ }
+ 
+ static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
+ {
+ #ifdef CONFIG_LOCKDEP
+ 	struct file *file = vma->vm_file;
+ 	struct address_space *mapping = file ? file->f_mapping : NULL;
+ 
+ 	if (mapping)
+ 		lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) ||
+ 			       lockdep_is_held(&vma->vm_mm->mmap_lock));
+ 	else
+ 		lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
+ #endif
+ }
+ 
++>>>>>>> 62fb8adc43af (mm: Provide address mask in struct follow_pfnmap_args)
  /**
 - * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
 - * @args: Pointer to struct @follow_pfnmap_args
 - *
 - * The caller needs to setup args->vma and args->address to point to the
 - * virtual address as the target of such lookup. On a successful return,
 - * the results will be put into other output fields.
 + * follow_pte - look up PTE at a user virtual address
 + * @mm: the mm_struct of the target address space
 + * @address: user virtual address
 + * @ptepp: location to store found PTE
 + * @ptlp: location to store the lock for the PTE
   *
 - * After the caller finished using the fields, the caller must invoke
 - * another follow_pfnmap_end() to proper releases the locks and resources
 - * of such look up request.
 - *
 - * During the start() and end() calls, the results in @args will be valid
 - * as proper locks will be held. After the end() is called, all the fields
 - * in @follow_pfnmap_args will be invalid to be further accessed. Further
 - * use of such information after end() may require proper synchronizations
 - * by the caller with page table updates, otherwise it can create a
 - * security bug.
 - *
 - * If the PTE maps a refcounted page, callers are responsible to protect
 - * against invalidation with MMU notifiers; otherwise access to the PFN at
 - * a later point in time can trigger use-after-free.
 + * On a successful return, the pointer to the PTE is stored in @ptepp;
 + * the corresponding lock is taken and its location is stored in @ptlp.
 + * The contents of the PTE are only stable until @ptlp is released;
 + * any further use, if any, must be protected against invalidation
 + * with MMU notifiers.
   *
   * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
 - * should be taken for read, and the mmap semaphore cannot be released
 - * before the end() is invoked.
 + * should be taken for read.
   *
 - * This function must not be used to modify PTE content.
 + * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
 + * it is not a good general-purpose API.
   *
 - * Return: zero on success, negative otherwise.
 + * Return: zero on success, -ve otherwise.
   */
 -int follow_pfnmap_start(struct follow_pfnmap_args *args)
 +int follow_pte(struct mm_struct *mm, unsigned long address,
 +	       pte_t **ptepp, spinlock_t **ptlp)
  {
 -	struct vm_area_struct *vma = args->vma;
 -	unsigned long address = args->address;
 -	struct mm_struct *mm = vma->vm_mm;
 -	spinlock_t *lock;
 -	pgd_t *pgdp;
 -	p4d_t *p4dp, p4d;
 -	pud_t *pudp, pud;
 -	pmd_t *pmdp, pmd;
 -	pte_t *ptep, pte;
 -
 -	pfnmap_lockdep_assert(vma);
 -
 -	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
 -		goto out;
 +	pgd_t *pgd;
 +	p4d_t *p4d;
 +	pud_t *pud;
 +	pmd_t *pmd;
 +	pte_t *ptep;
  
 -	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
 -		goto out;
 -retry:
 -	pgdp = pgd_offset(mm, address);
 -	if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
 +	pgd = pgd_offset(mm, address);
 +	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
  		goto out;
  
 -	p4dp = p4d_offset(pgdp, address);
 -	p4d = READ_ONCE(*p4dp);
 -	if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
 +	p4d = p4d_offset(pgd, address);
 +	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
  		goto out;
  
 -	pudp = pud_offset(p4dp, address);
 -	pud = READ_ONCE(*pudp);
 -	if (pud_none(pud))
 +	pud = pud_offset(p4d, address);
 +	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
  		goto out;
 -	if (pud_leaf(pud)) {
 -		lock = pud_lock(mm, pudp);
 -		if (!unlikely(pud_leaf(pud))) {
 -			spin_unlock(lock);
 -			goto retry;
 -		}
 -		pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
 -				  pud_pfn(pud), PUD_MASK, pud_write(pud),
 -				  pud_special(pud));
 -		return 0;
 -	}
  
 -	pmdp = pmd_offset(pudp, address);
 -	pmd = pmdp_get_lockless(pmdp);
 -	if (pmd_leaf(pmd)) {
 -		lock = pmd_lock(mm, pmdp);
 -		if (!unlikely(pmd_leaf(pmd))) {
 -			spin_unlock(lock);
 -			goto retry;
 -		}
 -		pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
 -				  pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
 -				  pmd_special(pmd));
 -		return 0;
 -	}
 +	pmd = pmd_offset(pud, address);
 +	VM_BUG_ON(pmd_trans_huge(*pmd));
  
 -	ptep = pte_offset_map_lock(mm, pmdp, address, &lock);
 +	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
  	if (!ptep)
  		goto out;
 -	pte = ptep_get(ptep);
 -	if (!pte_present(pte))
 +	if (!pte_present(ptep_get(ptep)))
  		goto unlock;
 -	pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
 -			  pte_pfn(pte), PAGE_MASK, pte_write(pte),
 -			  pte_special(pte));
 +	*ptepp = ptep;
  	return 0;
  unlock:
 -	pte_unmap_unlock(ptep, lock);
 +	pte_unmap_unlock(ptep, *ptlp);
  out:
  	return -EINVAL;
  }
* Unmerged path include/linux/mm.h
* Unmerged path mm/memory.c
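
For reference, a worked instance of the pfn calculation recorded in
pfnmap_args_setup() above, using hypothetical values for a pmd-level
(2MB, addr_mask == PMD_MASK) mapping with 4KB pages:

	/* args->address = 0x7f0000201000, pfn_base = 0x100000           */
	args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
	/*        = 0x100000 + ((0x7f0000201000 & 0x1fffff) >> 12)       */
	/*        = 0x100000 + (0x1000 >> 12) = 0x100001                 */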