+vfio/pci: implement huge_fault support
+
+jira LE-3557
+Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6
+commit-author Alex Williamson <[email protected]>
+commit f9e54c3a2f5b79ecc57c7bc7d0d3521e461a2101
+Empty-Commit: Cherry-Pick Conflicts during history rebuild.
+Will be included in final tarball splat. Ref for failed cherry-pick at:
+ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/f9e54c3a.failed
+
+With the addition of pfnmap support in vmf_insert_pfn_{pmd,pud}() we can
+take advantage of PMD and PUD faults to PCI BAR mmaps and create more
+efficient mappings. PCI BARs are always a power of two and will typically
+get at least PMD alignment without userspace even trying. Userspace
+alignment for PUD mappings is also not too difficult.
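As an illustration of the userspace alignment point above, here is a minimal, hypothetical userspace sketch (not part of this patch; the device fd, region offset, BAR size and the 1 GiB PUD size are all assumptions) that obtains a PUD-aligned BAR mapping by over-reserving address space and then mapping the BAR at an aligned address inside the reservation:

/*
 * Hypothetical example, not part of this patch: map a VFIO PCI BAR at a
 * PUD-aligned (1 GiB on x86-64, assumed) virtual address so that
 * huge_fault can install PUD-level mappings.
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <sys/types.h>
#include <sys/mman.h>

static void *map_bar_pud_aligned(int device_fd, off_t region_offset,
				 size_t bar_size)
{
	const uintptr_t align = 1UL << 30;	/* assumed PUD size */
	size_t span = bar_size + align;
	uint8_t *reserve, *aligned;

	/* Reserve enough address space to contain an aligned start. */
	reserve = mmap(NULL, span, PROT_NONE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (reserve == MAP_FAILED)
		return NULL;

	aligned = (uint8_t *)(((uintptr_t)reserve + align - 1) & ~(align - 1));

	/* Replace the aligned portion of the reservation with the BAR. */
	if (mmap(aligned, bar_size, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_FIXED, device_fd, region_offset) == MAP_FAILED) {
		munmap(reserve, span);
		return NULL;
	}

	/* The unused PROT_NONE head/tail could be munmap()'d here. */
	return aligned;
}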
+
+Consolidate faults through a single handler with a new wrapper for
+standard single page faults. The pre-faulting behavior of commit
+d71a989cf5d9 ("vfio/pci: Insert full vma on mmap'd MMIO fault") is removed
+in this refactoring since huge_fault will cover the bulk of the faults and
+results in more efficient page table usage. We also want to avoid
+pre-faulted single page mappings preempting huge page mappings.
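Condensed from the incoming side of the conflicted hunk further below, the consolidated shape is that huge_fault services every order while the ordinary .fault entry point becomes a thin order-0 wrapper:

/* Condensed from the incoming hunk below; not a conflict resolution. */
static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf)
{
	/* A single-page fault is simply a huge fault of order 0. */
	return vfio_pci_mmap_huge_fault(vmf, 0);
}

static const struct vm_operations_struct vfio_pci_mmap_ops = {
	.fault = vfio_pci_mmap_page_fault,
#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
	.huge_fault = vfio_pci_mmap_huge_fault,
#endif
};

Registering .huge_fault only under CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP mirrors the upstream hunk; other configurations simply keep taking single-page faults.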
+
+Link: https://lkml.kernel.org/r/ [email protected]
+ Signed-off-by: Alex Williamson <[email protected]>
+ Signed-off-by: Peter Xu <[email protected]>
+ Cc: Alexander Gordeev <[email protected]>
+ Cc: Aneesh Kumar K.V <[email protected]>
+ Cc: Borislav Petkov <[email protected]>
+ Cc: Catalin Marinas <[email protected]>
+ Cc: Christian Borntraeger <[email protected]>
+ Cc: Dave Hansen <[email protected]>
+ Cc: David Hildenbrand <[email protected]>
+ Cc: Gavin Shan <[email protected]>
+ Cc: Gerald Schaefer <[email protected]>
+ Cc: Heiko Carstens <[email protected]>
+ Cc: Ingo Molnar <[email protected]>
+ Cc: Jason Gunthorpe <[email protected]>
+ Cc: Matthew Wilcox <[email protected]>
+ Cc: Niklas Schnelle <[email protected]>
+ Cc: Paolo Bonzini <[email protected]>
+ Cc: Ryan Roberts <[email protected]>
+ Cc: Sean Christopherson <[email protected]>
+ Cc: Sven Schnelle <[email protected]>
+ Cc: Thomas Gleixner <[email protected]>
+ Cc: Vasily Gorbik <[email protected]>
+ Cc: Will Deacon <[email protected]>
+
+ Signed-off-by: Andrew Morton <[email protected]>
+(cherry picked from commit f9e54c3a2f5b79ecc57c7bc7d0d3521e461a2101)
+ Signed-off-by: Jonathan Maple <[email protected]>
+
+# Conflicts:
+# drivers/vfio/pci/vfio_pci_core.c
+diff --cc drivers/vfio/pci/vfio_pci_core.c
+index ffda816e0119,2d7478e9a62d..000000000000
+--- a/drivers/vfio/pci/vfio_pci_core.c
++++ b/drivers/vfio/pci/vfio_pci_core.c
+@@@ -1725,100 -1646,82 +1726,161 @@@ void vfio_pci_memory_unlock_and_restore
+ up_write(&vdev->memory_lock);
+ }
+
+ -static unsigned long vma_to_pfn(struct vm_area_struct *vma)
+ +/* Caller holds vma_lock */
+ +static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
+ + struct vm_area_struct *vma)
+ {
+ - struct vfio_pci_core_device *vdev = vma->vm_private_data;
+ - int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+ - u64 pgoff;
+ + struct vfio_pci_mmap_vma *mmap_vma;
+
+ - pgoff = vma->vm_pgoff &
+ - ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+ + mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT);
+ + if (!mmap_vma)
+ + return -ENOMEM;
+ +
+ + mmap_vma->vma = vma;
+ + list_add(&mmap_vma->vma_next, &vdev->vma_list);
+ +
+ + return 0;
+ +}
+ +
+ +/*
+ + * Zap mmaps on open so that we can fault them in on access and therefore
+ + * our vma_list only tracks mappings accessed since last zap.
+ + */
+ +static void vfio_pci_mmap_open(struct vm_area_struct *vma)
+ +{
+ + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+ +}
+ +
+ +static void vfio_pci_mmap_close(struct vm_area_struct *vma)
+ +{
+ + struct vfio_pci_core_device *vdev = vma->vm_private_data;
+ + struct vfio_pci_mmap_vma *mmap_vma;
+
+ - return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
+ + mutex_lock(&vdev->vma_lock);
+ + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
+ + if (mmap_vma->vma == vma) {
+ + list_del(&mmap_vma->vma_next);
+ + kfree(mmap_vma);
+ + break;
+ + }
+ + }
+ + mutex_unlock(&vdev->vma_lock);
+ }
+
+- static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
++ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
++ unsigned int order)
+ {
+ struct vm_area_struct *vma = vmf->vma;
+ struct vfio_pci_core_device *vdev = vma->vm_private_data;
+++<<<<<<< HEAD
+ + struct vfio_pci_mmap_vma *mmap_vma;
+ + vm_fault_t ret = VM_FAULT_NOPAGE;
+++=======
++ unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
++ vm_fault_t ret = VM_FAULT_SIGBUS;
++
++ if (order && (vmf->address & ((PAGE_SIZE << order) - 1) ||
++ vmf->address + (PAGE_SIZE << order) > vma->vm_end)) {
++ ret = VM_FAULT_FALLBACK;
++ goto out;
++ }
++
++ pfn = vma_to_pfn(vma);
+++>>>>>>> f9e54c3a2f5b (vfio/pci: implement huge_fault support)
+
+ + mutex_lock(&vdev->vma_lock);
+ down_read(&vdev->memory_lock);
+
+++<<<<<<< HEAD
+ + /*
+ + * Memory region cannot be accessed if the low power feature is engaged
+ + * or memory access is disabled.
+ + */
+ + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) {
+ + ret = VM_FAULT_SIGBUS;
+ + goto up_out;
+ + }
+ +
+ + /*
+ + * We populate the whole vma on fault, so we need to test whether
+ + * the vma has already been mapped, such as for concurrent faults
+ + * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if
+ + * we ask it to fill the same range again.
+ + */
+ + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
+ + if (mmap_vma->vma == vma)
+ + goto up_out;
+ + }
+++=======
++ if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
++ goto out_unlock;
++
++ switch (order) {
++ case 0:
++ ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
++ break;
++ #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
++ case PMD_ORDER:
++ ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff,
++ PFN_DEV), false);
++ break;
++ #endif
++ #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
++ case PUD_ORDER:
++ ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff,
++ PFN_DEV), false);
++ break;
++ #endif
++ default:
++ ret = VM_FAULT_FALLBACK;
++ }
++
++ out_unlock:
++ up_read(&vdev->memory_lock);
++ out:
++ dev_dbg_ratelimited(&vdev->pdev->dev,
++ "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n",
++ __func__, order,
++ vma->vm_pgoff >>
++ (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT),
++ pgoff, (unsigned int)ret);
+++>>>>>>> f9e54c3a2f5b (vfio/pci: implement huge_fault support)
+ +
+ + if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+ + vma->vm_end - vma->vm_start,
+ + vma->vm_page_prot)) {
+ + ret = VM_FAULT_SIGBUS;
+ + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+ + goto up_out;
+ + }
+ +
+ + if (__vfio_pci_add_vma(vdev, vma)) {
+ + ret = VM_FAULT_OOM;
+ + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+ + }
+
+ +up_out:
+ + up_read(&vdev->memory_lock);
+ + mutex_unlock(&vdev->vma_lock);
+ return ret;
+ }
+
++ static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf)
++ {
++ return vfio_pci_mmap_huge_fault(vmf, 0);
++ }
++
+ static const struct vm_operations_struct vfio_pci_mmap_ops = {
+++<<<<<<< HEAD
+ + .open = vfio_pci_mmap_open,
+ + .close = vfio_pci_mmap_close,
+ + .fault = vfio_pci_mmap_fault,
+++=======
++ .fault = vfio_pci_mmap_page_fault,
++ #ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
++ .huge_fault = vfio_pci_mmap_huge_fault,
++ #endif
+++>>>>>>> f9e54c3a2f5b (vfio/pci: implement huge_fault support)
+ };
+
+ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
| 229 | +* Unmerged path drivers/vfio/pci/vfio_pci_core.c |
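For reference when resolving the conflict recorded above, this is the incoming (upstream f9e54c3a2f5b) side of the hunk reassembled without conflict markers; the HEAD side it collides with is the vma_list tracking plus io_remap_pfn_range() pre-faulting, which the upstream change removes:

static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
					   unsigned int order)
{
	struct vm_area_struct *vma = vmf->vma;
	struct vfio_pci_core_device *vdev = vma->vm_private_data;
	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
	vm_fault_t ret = VM_FAULT_SIGBUS;

	/* Require natural alignment and containment for huge orders. */
	if (order && (vmf->address & ((PAGE_SIZE << order) - 1) ||
		      vmf->address + (PAGE_SIZE << order) > vma->vm_end)) {
		ret = VM_FAULT_FALLBACK;
		goto out;
	}

	pfn = vma_to_pfn(vma);

	down_read(&vdev->memory_lock);

	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
		goto out_unlock;

	switch (order) {
	case 0:
		ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
		break;
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
	case PMD_ORDER:
		ret = vmf_insert_pfn_pmd(vmf,
					 __pfn_to_pfn_t(pfn + pgoff, PFN_DEV),
					 false);
		break;
#endif
#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
	case PUD_ORDER:
		ret = vmf_insert_pfn_pud(vmf,
					 __pfn_to_pfn_t(pfn + pgoff, PFN_DEV),
					 false);
		break;
#endif
	default:
		ret = VM_FAULT_FALLBACK;
	}

out_unlock:
	up_read(&vdev->memory_lock);
out:
	dev_dbg_ratelimited(&vdev->pdev->dev,
			    "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n",
			    __func__, order,
			    vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT),
			    pgoff, (unsigned int)ret);

	return ret;
}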