+mm/pagewalk: check pfnmap for folio_walk_start()
+
+jira LE-3557
+Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6
+commit-author Peter Xu <[email protected]>
+commit 10d83d7781a8a6ff02bafd172c1ab183b27f8d5a
+Empty-Commit: Cherry-Pick Conflicts during history rebuild.
+Will be included in final tarball splat. Ref for failed cherry-pick at:
+ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/10d83d77.failed
+
+Teach folio_walk_start() to recognize special pmd/pud mappings, and fail
+them properly as it means there's no folio backing them.
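
In short: when the walker hits a huge leaf that maps raw PFNs (a "special" mapping), there is no struct folio to hand back, so the walk must fail. A condensed sketch of the two checks involved, taken from the hunks recorded below (illustrative fragment, not the literal upstream hunks; it assumes the usual page-table helpers such as pud_special()/pmd_special()):

	/* PUD leaf (pagewalk.c hunk): special pfnmap => no folio to return */
	if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
		spin_unlock(ptl);
		goto not_found;
	}

	/* PMD leaf is filtered indirectly (memory.c hunk):
	 * vm_normal_page_pmd() now returns NULL for special (huge pfnmap) PMDs. */
	if (unlikely(pmd_special(pmd)))
		return NULL;
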
+
+[ [email protected]: remove some stale comments, per David]
+ Link: https://lkml.kernel.org/r/[email protected]
+Link: https://lkml.kernel.org/r/[email protected]
+ Signed-off-by: Peter Xu <[email protected]>
+ Cc: David Hildenbrand <[email protected]>
+ Cc: Alexander Gordeev <[email protected]>
+ Cc: Alex Williamson <[email protected]>
+ Cc: Aneesh Kumar K.V <[email protected]>
+ Cc: Borislav Petkov <[email protected]>
+ Cc: Catalin Marinas <[email protected]>
+ Cc: Christian Borntraeger <[email protected]>
+ Cc: Dave Hansen <[email protected]>
+ Cc: Gavin Shan <[email protected]>
+ Cc: Gerald Schaefer <[email protected]>
+ Cc: Heiko Carstens <[email protected]>
+ Cc: Ingo Molnar <[email protected]>
+ Cc: Jason Gunthorpe <[email protected]>
+ Cc: Matthew Wilcox <[email protected]>
+ Cc: Niklas Schnelle <[email protected]>
+ Cc: Paolo Bonzini <[email protected]>
+ Cc: Ryan Roberts <[email protected]>
+ Cc: Sean Christopherson <[email protected]>
+ Cc: Sven Schnelle <[email protected]>
+ Cc: Thomas Gleixner <[email protected]>
+ Cc: Vasily Gorbik <[email protected]>
+ Cc: Will Deacon <[email protected]>
+
+ Signed-off-by: Andrew Morton <[email protected]>
+(cherry picked from commit 10d83d7781a8a6ff02bafd172c1ab183b27f8d5a)
+ Signed-off-by: Jonathan Maple <[email protected]>
+
+# Conflicts:
+# mm/pagewalk.c
+diff --cc mm/pagewalk.c
+index b7d7e4fcfad7,461ea3bbd8d9..000000000000
+--- a/mm/pagewalk.c
++++ b/mm/pagewalk.c
+@@@ -676,3 -656,203 +676,206 @@@ int walk_page_mapping(struct address_sp
+ 
+ return err;
+ }
+++<<<<<<< HEAD
+++=======
++ 
++ /**
++ * folio_walk_start - walk the page tables to a folio
++ * @fw: filled with information on success.
++ * @vma: the VMA.
++ * @addr: the virtual address to use for the page table walk.
++ * @flags: flags modifying which folios to walk to.
++ *
++ * Walk the page tables using @addr in a given @vma to a mapped folio and
++ * return the folio, making sure that the page table entry referenced by
++ * @addr cannot change until folio_walk_end() was called.
++ *
++ * As default, this function returns only folios that are not special (e.g., not
++ * the zeropage) and never returns folios that are supposed to be ignored by the
++ * VM as documented by vm_normal_page(). If requested, zeropages will be
++ * returned as well.
++ *
++ * As default, this function only considers present page table entries.
++ * If requested, it will also consider migration entries.
++ *
++ * If this function returns NULL it might either indicate "there is nothing" or
++ * "there is nothing suitable".
++ *
++ * On success, @fw is filled and the function returns the folio while the PTL
++ * is still held and folio_walk_end() must be called to clean up,
++ * releasing any held locks. The returned folio must *not* be used after the
++ * call to folio_walk_end(), unless a short-term folio reference is taken before
++ * that call.
++ *
++ * @fw->page will correspond to the page that is effectively referenced by
++ * @addr. However, for migration entries and shared zeropages @fw->page is
++ * set to NULL. Note that large folios might be mapped by multiple page table
++ * entries, and this function will always only lookup a single entry as
++ * specified by @addr, which might or might not cover more than a single page of
++ * the returned folio.
++ *
++ * This function must *not* be used as a naive replacement for
++ * get_user_pages() / pin_user_pages(), especially not to perform DMA or
++ * to carelessly modify page content. This function may *only* be used to grab
++ * short-term folio references, never to grab long-term folio references.
++ *
++ * Using the page table entry pointers in @fw for reading or modifying the
++ * entry should be avoided where possible: however, there might be valid
++ * use cases.
++ *
++ * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
++ * For example, PMD page table sharing might require prior unsharing. Also,
++ * logical hugetlb entries might span multiple physical page table entries,
++ * which *must* be modified in a single operation (set_huge_pte_at(),
++ * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
++ * not correspond to the first physical entry of a logical hugetlb entry.
++ *
++ * The mmap lock must be held in read mode.
++ *
++ * Return: folio pointer on success, otherwise NULL.
++ */
++ struct folio *folio_walk_start(struct folio_walk *fw,
++ struct vm_area_struct *vma, unsigned long addr,
++ folio_walk_flags_t flags)
++ {
++ unsigned long entry_size;
++ bool expose_page = true;
++ struct page *page;
++ pud_t *pudp, pud;
++ pmd_t *pmdp, pmd;
++ pte_t *ptep, pte;
++ spinlock_t *ptl;
++ pgd_t *pgdp;
++ p4d_t *p4dp;
++ 
++ mmap_assert_locked(vma->vm_mm);
++ vma_pgtable_walk_begin(vma);
++ 
++ if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
++ goto not_found;
++ 
++ pgdp = pgd_offset(vma->vm_mm, addr);
++ if (pgd_none_or_clear_bad(pgdp))
++ goto not_found;
++ 
++ p4dp = p4d_offset(pgdp, addr);
++ if (p4d_none_or_clear_bad(p4dp))
++ goto not_found;
++ 
++ pudp = pud_offset(p4dp, addr);
++ pud = pudp_get(pudp);
++ if (pud_none(pud))
++ goto not_found;
++ if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
++ ptl = pud_lock(vma->vm_mm, pudp);
++ pud = pudp_get(pudp);
++ 
++ entry_size = PUD_SIZE;
++ fw->level = FW_LEVEL_PUD;
++ fw->pudp = pudp;
++ fw->pud = pud;
++ 
++ if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
++ spin_unlock(ptl);
++ goto not_found;
++ } else if (!pud_leaf(pud)) {
++ spin_unlock(ptl);
++ goto pmd_table;
++ }
++ /*
++ * TODO: vm_normal_page_pud() will be handy once we want to
++ * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
++ */
++ page = pud_page(pud);
++ goto found;
++ }
++ 
++ pmd_table:
++ VM_WARN_ON_ONCE(pud_leaf(*pudp));
++ pmdp = pmd_offset(pudp, addr);
++ pmd = pmdp_get_lockless(pmdp);
++ if (pmd_none(pmd))
++ goto not_found;
++ if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
++ ptl = pmd_lock(vma->vm_mm, pmdp);
++ pmd = pmdp_get(pmdp);
++ 
++ entry_size = PMD_SIZE;
++ fw->level = FW_LEVEL_PMD;
++ fw->pmdp = pmdp;
++ fw->pmd = pmd;
++ 
++ if (pmd_none(pmd)) {
++ spin_unlock(ptl);
++ goto not_found;
++ } else if (!pmd_leaf(pmd)) {
++ spin_unlock(ptl);
++ goto pte_table;
++ } else if (pmd_present(pmd)) {
++ page = vm_normal_page_pmd(vma, addr, pmd);
++ if (page) {
++ goto found;
++ } else if ((flags & FW_ZEROPAGE) &&
++ is_huge_zero_pmd(pmd)) {
++ page = pfn_to_page(pmd_pfn(pmd));
++ expose_page = false;
++ goto found;
++ }
++ } else if ((flags & FW_MIGRATION) &&
++ is_pmd_migration_entry(pmd)) {
++ swp_entry_t entry = pmd_to_swp_entry(pmd);
++ 
++ page = pfn_swap_entry_to_page(entry);
++ expose_page = false;
++ goto found;
++ }
++ spin_unlock(ptl);
++ goto not_found;
++ }
++ 
++ pte_table:
++ VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
++ ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
++ if (!ptep)
++ goto not_found;
++ pte = ptep_get(ptep);
++ 
++ entry_size = PAGE_SIZE;
++ fw->level = FW_LEVEL_PTE;
++ fw->ptep = ptep;
++ fw->pte = pte;
++ 
++ if (pte_present(pte)) {
++ page = vm_normal_page(vma, addr, pte);
++ if (page)
++ goto found;
++ if ((flags & FW_ZEROPAGE) &&
++ is_zero_pfn(pte_pfn(pte))) {
++ page = pfn_to_page(pte_pfn(pte));
++ expose_page = false;
++ goto found;
++ }
++ } else if (!pte_none(pte)) {
++ swp_entry_t entry = pte_to_swp_entry(pte);
++ 
++ if ((flags & FW_MIGRATION) &&
++ is_migration_entry(entry)) {
++ page = pfn_swap_entry_to_page(entry);
++ expose_page = false;
++ goto found;
++ }
++ }
++ pte_unmap_unlock(ptep, ptl);
++ not_found:
++ vma_pgtable_walk_end(vma);
++ return NULL;
++ found:
++ if (expose_page)
++ /* Note: Offset from the mapped page, not the folio start. */
++ fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
++ else
++ fw->page = NULL;
++ fw->ptl = ptl;
++ return page_folio(page);
++ }
+++>>>>>>> 10d83d7781a8 (mm/pagewalk: check pfnmap for folio_walk_start())
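
For orientation, the caller contract spelled out in the folio_walk_start() kernel-doc above boils down to the following pattern (a hedged sketch assuming the folio_walk API from <linux/pagewalk.h>; mm, vma and addr are placeholders, and the walk runs under the mmap read lock):

	struct folio_walk fw;
	struct folio *folio;

	mmap_read_lock(mm);			/* required: mmap lock in read mode */
	folio = folio_walk_start(&fw, vma, addr, 0);
	if (folio) {
		/* Page table lock is held here; short-term use of the folio only. */
		pr_info("addr %#lx -> folio pfn %#lx (level %d)\n",
			addr, folio_pfn(folio), fw.level);
		folio_walk_end(&fw, vma);	/* releases the page table lock */
	}
	mmap_read_unlock(mm);

With this commit, a huge pfnmap at the given address makes folio_walk_start() return NULL instead of handing back a page pointer for a PFN that has no struct folio behind it.
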
+diff --git a/mm/memory.c b/mm/memory.c
+index e2794e3b8919..e8a797dd7721 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -659,11 +659,10 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+ {
+ unsigned long pfn = pmd_pfn(pmd);
+ 
+- /*
+- * There is no pmd_special() but there may be special pmds, e.g.
+- * in a direct-access (dax) mapping, so let's just replicate the
+- * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
+- */
++ /* Currently it's only used for huge pfnmaps */
++ if (unlikely(pmd_special(pmd)))
++ return NULL;
++ 
+ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ if (!pfn_valid(pfn))
+* Unmerged path mm/pagewalk.c
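
After the mm/memory.c hunk is applied, the head of vm_normal_page_pmd() is expected to read roughly as below (a sketch reconstructed from the hunk, with the rest of the function elided): the stale "there is no pmd_special()" comment goes away, and the special-PMD test runs before the existing VM_PFNMAP/VM_MIXEDMAP handling, so a special huge PMD is rejected regardless of the VMA flags.

	struct page *vm_normal_page_pmd(struct vm_area_struct *vma,
					unsigned long addr, pmd_t pmd)
	{
		unsigned long pfn = pmd_pfn(pmd);

		/* Currently it's only used for huge pfnmaps */
		if (unlikely(pmd_special(pmd)))
			return NULL;

		if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
			if (vma->vm_flags & VM_MIXEDMAP) {
				if (!pfn_valid(pfn))
					return NULL;
				/* ... unchanged VM_MIXEDMAP/VM_PFNMAP handling follows ... */
			}
		}
		/* ... remainder of the function unchanged ... */
	}
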