diff --git a/COPYING-5.14.0-570.25.1.el9_6 b/COPYING-5.14.0-570.26.1.el9_6 similarity index 100% rename from COPYING-5.14.0-570.25.1.el9_6 rename to COPYING-5.14.0-570.26.1.el9_6 diff --git a/Makefile.rhelver b/Makefile.rhelver index 02ac9c022433c..f74ec79875955 100644 --- a/Makefile.rhelver +++ b/Makefile.rhelver @@ -12,7 +12,7 @@ RHEL_MINOR = 6 # # Use this spot to avoid future merge conflicts. # Do not trim this comment. -RHEL_RELEASE = 570.25.1 +RHEL_RELEASE = 570.26.1 # # ZSTREAM diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ca1f39e496316..1906e414303ce 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -97,6 +97,7 @@ config ARM64 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_RT select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9e44e693fcd26..7fa291d3f90ab 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -354,6 +354,7 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) /* * Select all bits except the pfn */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pfn = pte_pfn(pte); @@ -527,6 +528,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP))); } +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +#define pmd_special(pte) (!!((pmd_val(pte) & PTE_SPECIAL))) +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return set_pmd_bit(pmd, __pgprot(PTE_SPECIAL)); +} +#endif + #define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd)) #define __phys_to_pmd_val(phys) __phys_to_pte_val(phys) #define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT) @@ -544,6 +553,27 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) #define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +#define pud_special(pte) pte_special(pud_pte(pud)) +#define pud_mkspecial(pte) pte_pud(pte_mkspecial(pud_pte(pud))) +#endif + +#define pmd_pgprot pmd_pgprot +static inline pgprot_t pmd_pgprot(pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd)); +} + +#define pud_pgprot pud_pgprot +static inline pgprot_t pud_pgprot(pud_t pud) +{ + unsigned long pfn = pud_pfn(pud); + + return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud)); +} + static inline void __set_pte_at(struct mm_struct *mm, unsigned long __always_unused addr, pte_t *ptep, pte_t pte, unsigned int nr) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 5c12b43e746bd..4833c86ec4829 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -396,33 +396,35 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) #define __flush_tlb_range_op(op, start, pages, stride, \ asid, tlb_level, tlbi_user, lpa2) \ do { \ + typeof(start) __flush_start = start; \ + typeof(pages) __flush_pages = pages; \ int num = 0; \ int scale = 3; \ int shift = lpa2 ? 
16 : PAGE_SHIFT; \ unsigned long addr; \ \ - while (pages > 0) { \ + while (__flush_pages > 0) { \ if (!system_supports_tlb_range() || \ - pages == 1 || \ - (lpa2 && start != ALIGN(start, SZ_64K))) { \ - addr = __TLBI_VADDR(start, asid); \ + __flush_pages == 1 || \ + (lpa2 && __flush_start != ALIGN(__flush_start, SZ_64K))) { \ + addr = __TLBI_VADDR(__flush_start, asid); \ __tlbi_level(op, addr, tlb_level); \ if (tlbi_user) \ __tlbi_user_level(op, addr, tlb_level); \ - start += stride; \ - pages -= stride >> PAGE_SHIFT; \ + __flush_start += stride; \ + __flush_pages -= stride >> PAGE_SHIFT; \ continue; \ } \ \ - num = __TLBI_RANGE_NUM(pages, scale); \ + num = __TLBI_RANGE_NUM(__flush_pages, scale); \ if (num >= 0) { \ - addr = __TLBI_VADDR_RANGE(start >> shift, asid, \ + addr = __TLBI_VADDR_RANGE(__flush_start >> shift, asid, \ scale, num, tlb_level); \ __tlbi(r##op, addr); \ if (tlbi_user) \ __tlbi_user(r##op, addr); \ - start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ - pages -= __TLBI_RANGE_PAGES(num, scale); \ + __flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ + __flush_pages -= __TLBI_RANGE_PAGES(num, scale);\ } \ scale--; \ } \ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 6543642f56e45..96f6376ad2bde 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -44,6 +44,7 @@ static inline unsigned long pte_pfn(pte_t pte) /* * Select all bits except the pfn */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pte_flags; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2d2d224207bcc..61ec7143cddbb 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -941,6 +941,7 @@ static inline int pte_unused(pte_t pte) * young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID * must not be set. 
*/ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK; diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 5880893329310..84a8c8f517fb2 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -118,12 +118,11 @@ static inline int __memcpy_toio_inuser(void __iomem *dst, SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, const void __user *, user_buffer, size_t, length) { + struct follow_pfnmap_args args = { }; u8 local_buf[64]; void __iomem *io_addr; void *buf; struct vm_area_struct *vma; - pte_t *ptep; - spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -169,11 +168,13 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); + args.address = mmio_addr; + args.vma = vma; + ret = follow_pfnmap_start(&args); if (ret) goto out_unlock_mmap; - io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) @@ -181,7 +182,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, ret = zpci_memcpy_toio(io_addr, buf, length); out_unlock_pt: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unlock_mmap: mmap_read_unlock(current->mm); out_free: @@ -260,12 +261,11 @@ static inline int __memcpy_fromio_inuser(void __user *dst, SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, void __user *, user_buffer, size_t, length) { + struct follow_pfnmap_args args = { }; u8 local_buf[64]; void __iomem *io_addr; void *buf; struct vm_area_struct *vma; - pte_t *ptep; - spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -308,11 +308,13 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); + args.vma = vma; + args.address = mmio_addr; + ret = follow_pfnmap_start(&args); if (ret) goto out_unlock_mmap; - io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) { @@ -322,7 +324,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, ret = zpci_memcpy_fromio(buf, io_addr, length); out_unlock_pt: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unlock_mmap: mmap_read_unlock(current->mm); diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 896d9b786736d..da3a5f673ca59 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -782,6 +782,7 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) return __pmd(pte_val(pte)); } +#define pmd_pgprot pmd_pgprot static inline pgprot_t pmd_pgprot(pmd_t entry) { unsigned long val = pmd_val(entry); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a5a59118efe4b..5d4f050bd59f9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86_64 select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_RT select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 8149afec43a4e..c5bc120fade87 
100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -121,6 +121,34 @@ extern pmdval_t early_pmd_flags; #define arch_end_context_switch(prev) do {} while(0) #endif /* CONFIG_PARAVIRT_XXL */ +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v & ~clear); +} + +static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v | set); +} + +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v & ~clear); +} + /* * The following only work if pte_present() is true. * Undefined behaviour if not.. @@ -310,6 +338,30 @@ static inline int pud_devmap(pud_t pud) } #endif +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SPECIAL; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return pud_flags(pud) & _PAGE_SPECIAL; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + static inline int pgd_devmap(pgd_t pgd) { return 0; @@ -480,20 +532,6 @@ static inline pte_t pte_mkdevmap(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); } -static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v | set); -} - -static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v & ~clear); -} - /* See comments above mksaveddirty_shift() */ static inline pmd_t pmd_mksaveddirty(pmd_t pmd) { @@ -588,20 +626,6 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); #define pmd_mkwrite pmd_mkwrite -static inline pud_t pud_set_flags(pud_t pud, pudval_t set) -{ - pudval_t v = native_pud_val(pud); - - return native_make_pud(v | set); -} - -static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) -{ - pudval_t v = native_pud_val(pud); - - return native_make_pud(v & ~clear); -} - /* See comments above mksaveddirty_shift() */ static inline pud_t pud_mksaveddirty(pud_t pud) { diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index c0d56c02b8da9..9e84bcedd9adf 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -834,7 +834,7 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz return ret; } - for_each_node(nid) { + for_each_node_with_cpus(nid) { cpu = cpumask_first(cpumask_of_node(nid)); c = &cpu_data(cpu); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 36b603d0cddef..fd210b362a04d 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -947,6 +948,26 @@ static void free_pfn_range(u64 paddr, unsigned long size) memtype_free(paddr, paddr + size); } +static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, + resource_size_t *phys) +{ + struct follow_pfnmap_args args 
= { .vma = vma, .address = vma->vm_start }; + + if (follow_pfnmap_start(&args)) + return -EINVAL; + + /* Never return PFNs of anon folios in COW mappings. */ + if (!args.special) { + follow_pfnmap_end(&args); + return -EINVAL; + } + + *prot = pgprot_val(args.pgprot); + *phys = (resource_size_t)args.pfn << PAGE_SHIFT; + follow_pfnmap_end(&args); + return 0; +} + static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, pgprot_t *pgprot) { @@ -964,7 +985,7 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, * detect the PFN. If we need the cachemode as well, we're out of luck * for now and have to fail fork(). */ - if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) { + if (!follow_phys(vma, &prot, paddr)) { if (pgprot) *pgprot = __pgprot(prot); return 0; diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/09dfc8a5.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/09dfc8a5.failed new file mode 100644 index 0000000000000..7cb365f0647ea --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/09dfc8a5.failed @@ -0,0 +1,134 @@ +vfio/pci: Fallback huge faults for unaligned pfn + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Alex Williamson +commit 09dfc8a5f2ce897005a94bf66cca4f91e4e03700 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/09dfc8a5.failed + +The PFN must also be aligned to the fault order to insert a huge +pfnmap. Test the alignment and fallback when unaligned. + +Fixes: f9e54c3a2f5b ("vfio/pci: implement huge_fault support") +Link: https://bugzilla.kernel.org/show_bug.cgi?id=219619 + Reported-by: Athul Krishna + Reported-by: Precific + Reviewed-by: Peter Xu + Tested-by: Precific +Link: https://lore.kernel.org/r/20250102183416.1841878-1-alex.williamson@redhat.com + Cc: stable@vger.kernel.org + Signed-off-by: Alex Williamson +(cherry picked from commit 09dfc8a5f2ce897005a94bf66cca4f91e4e03700) + Signed-off-by: Jonathan Maple + +# Conflicts: +# drivers/vfio/pci/vfio_pci_core.c +diff --cc drivers/vfio/pci/vfio_pci_core.c +index ffda816e0119,1a4ed5a357d3..000000000000 +--- a/drivers/vfio/pci/vfio_pci_core.c ++++ b/drivers/vfio/pci/vfio_pci_core.c +@@@ -1770,49 -1658,59 +1770,87 @@@ static vm_fault_t vfio_pci_mmap_fault(s + { + struct vm_area_struct *vma = vmf->vma; + struct vfio_pci_core_device *vdev = vma->vm_private_data; +++<<<<<<< HEAD + + struct vfio_pci_mmap_vma *mmap_vma; + + vm_fault_t ret = VM_FAULT_NOPAGE; + + + + mutex_lock(&vdev->vma_lock); + + down_read(&vdev->memory_lock); + + + + /* + + * Memory region cannot be accessed if the low power feature is engaged + + * or memory access is disabled. 
+ + */ + + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { + + ret = VM_FAULT_SIGBUS; + + goto up_out; +++======= ++ unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff; ++ vm_fault_t ret = VM_FAULT_SIGBUS; ++ ++ pfn = vma_to_pfn(vma) + pgoff; ++ ++ if (order && (pfn & ((1 << order) - 1) || ++ vmf->address & ((PAGE_SIZE << order) - 1) || ++ vmf->address + (PAGE_SIZE << order) > vma->vm_end)) { ++ ret = VM_FAULT_FALLBACK; ++ goto out; ++ } ++ ++ down_read(&vdev->memory_lock); ++ ++ if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) ++ goto out_unlock; ++ ++ switch (order) { ++ case 0: ++ ret = vmf_insert_pfn(vma, vmf->address, pfn); ++ break; ++ #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP ++ case PMD_ORDER: ++ ret = vmf_insert_pfn_pmd(vmf, ++ __pfn_to_pfn_t(pfn, PFN_DEV), false); ++ break; ++ #endif ++ #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP ++ case PUD_ORDER: ++ ret = vmf_insert_pfn_pud(vmf, ++ __pfn_to_pfn_t(pfn, PFN_DEV), false); ++ break; ++ #endif ++ default: ++ ret = VM_FAULT_FALLBACK; +++>>>>>>> 09dfc8a5f2ce (vfio/pci: Fallback huge faults for unaligned pfn) + } + + -out_unlock: + - up_read(&vdev->memory_lock); + -out: + - dev_dbg_ratelimited(&vdev->pdev->dev, + - "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", + - __func__, order, + - vma->vm_pgoff >> + - (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT), + - pgoff, (unsigned int)ret); + + /* + + * We populate the whole vma on fault, so we need to test whether + + * the vma has already been mapped, such as for concurrent faults + + * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if + + * we ask it to fill the same range again. + + */ + + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { + + if (mmap_vma->vma == vma) + + goto up_out; + + } + + - return ret; + -} + + if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + + vma->vm_end - vma->vm_start, + + vma->vm_page_prot)) { + + ret = VM_FAULT_SIGBUS; + + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + + goto up_out; + + } + + -static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf) + -{ + - return vfio_pci_mmap_huge_fault(vmf, 0); + + if (__vfio_pci_add_vma(vdev, vma)) { + + ret = VM_FAULT_OOM; + + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + + } + + + +up_out: + + up_read(&vdev->memory_lock); + + mutex_unlock(&vdev->vma_lock); + + return ret; + } + + static const struct vm_operations_struct vfio_pci_mmap_ops = { +* Unmerged path drivers/vfio/pci/vfio_pci_core.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/0fd06844.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/0fd06844.failed new file mode 100644 index 0000000000000..da674785e8369 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/0fd06844.failed @@ -0,0 +1,83 @@ +vfio/type1: Use mapping page mask for pfnmaps + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Alex Williamson +commit 0fd06844de5d063cb384384e06a11ec7141a35d5 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/0fd06844.failed + +vfio-pci supports huge_fault for PCI MMIO BARs and will insert pud and +pmd mappings for well aligned mappings. follow_pfnmap_start() walks the +page table and therefore knows the page mask of the level where the +address is found and returns this through follow_pfnmap_args.addr_mask. 
+Subsequent pfns from this address until the end of the mapping page are +necessarily consecutive. Use this information to retrieve a range of +pfnmap pfns in a single pass. + +With optimal mappings and alignment on systems with 1GB pud and 4KB +page size, this reduces iterations for DMA mapping PCI BARs by a +factor of 256K. In real world testing, the overhead of iterating +pfns for a VM DMA mapping a 32GB PCI BAR is reduced from ~1s to +sub-millisecond overhead. + + Reviewed-by: Peter Xu + Reviewed-by: Mitchell Augustin + Tested-by: Mitchell Augustin + Reviewed-by: Jason Gunthorpe +Link: https://lore.kernel.org/r/20250218222209.1382449-7-alex.williamson@redhat.com + Signed-off-by: Alex Williamson +(cherry picked from commit 0fd06844de5d063cb384384e06a11ec7141a35d5) + Signed-off-by: Jonathan Maple + +# Conflicts: +# drivers/vfio/vfio_iommu_type1.c +diff --cc drivers/vfio/vfio_iommu_type1.c +index 410214696525,0ac56072af9f..000000000000 +--- a/drivers/vfio/vfio_iommu_type1.c ++++ b/drivers/vfio/vfio_iommu_type1.c +@@@ -523,14 -520,12 +523,14 @@@ static void vfio_batch_fini(struct vfio + + static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, + unsigned long vaddr, unsigned long *pfn, +- bool write_fault) ++ unsigned long *addr_mask, bool write_fault) + { + - struct follow_pfnmap_args args = { .vma = vma, .address = vaddr }; + + pte_t *ptep; + + pte_t pte; + + spinlock_t *ptl; + int ret; + + - ret = follow_pfnmap_start(&args); + + ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); + if (ret) { + bool unlocked = false; + +@@@ -549,14 -544,14 +549,23 @@@ + return ret; + } + +++<<<<<<< HEAD + + pte = ptep_get(ptep); + + + + if (write_fault && !pte_write(pte)) + + ret = -EFAULT; + + else + + *pfn = pte_pfn(pte); +++======= ++ if (write_fault && !args.writable) { ++ ret = -EFAULT; ++ } else { ++ *pfn = args.pfn; ++ *addr_mask = args.addr_mask; ++ } +++>>>>>>> 0fd06844de5d (vfio/type1: Use mapping page mask for pfnmaps) + + - follow_pfnmap_end(&args); + + pte_unmap_unlock(ptep, ptl); + return ret; + } + +* Unmerged path drivers/vfio/vfio_iommu_type1.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/10d83d77.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/10d83d77.failed new file mode 100644 index 0000000000000..be51f1ee459b8 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/10d83d77.failed @@ -0,0 +1,278 @@ +mm/pagewalk: check pfnmap for folio_walk_start() + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit 10d83d7781a8a6ff02bafd172c1ab183b27f8d5a +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/10d83d77.failed + +Teach folio_walk_start() to recognize special pmd/pud mappings, and fail +them properly as it means there's no folio backing them. 
+ +[peterx@redhat.com: remove some stale comments, per David] + Link: https://lkml.kernel.org/r/20240829202237.2640288-1-peterx@redhat.com +Link: https://lkml.kernel.org/r/20240826204353.2228736-7-peterx@redhat.com + Signed-off-by: Peter Xu + Cc: David Hildenbrand + Cc: Alexander Gordeev + Cc: Alex Williamson + Cc: Aneesh Kumar K.V + Cc: Borislav Petkov + Cc: Catalin Marinas + Cc: Christian Borntraeger + Cc: Dave Hansen + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Jason Gunthorpe + Cc: Matthew Wilcox + Cc: Niklas Schnelle + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Sven Schnelle + Cc: Thomas Gleixner + Cc: Vasily Gorbik + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit 10d83d7781a8a6ff02bafd172c1ab183b27f8d5a) + Signed-off-by: Jonathan Maple + +# Conflicts: +# mm/pagewalk.c +diff --cc mm/pagewalk.c +index b7d7e4fcfad7,461ea3bbd8d9..000000000000 +--- a/mm/pagewalk.c ++++ b/mm/pagewalk.c +@@@ -676,3 -656,203 +676,206 @@@ int walk_page_mapping(struct address_sp + + return err; + } +++<<<<<<< HEAD +++======= ++ ++ /** ++ * folio_walk_start - walk the page tables to a folio ++ * @fw: filled with information on success. ++ * @vma: the VMA. ++ * @addr: the virtual address to use for the page table walk. ++ * @flags: flags modifying which folios to walk to. ++ * ++ * Walk the page tables using @addr in a given @vma to a mapped folio and ++ * return the folio, making sure that the page table entry referenced by ++ * @addr cannot change until folio_walk_end() was called. ++ * ++ * As default, this function returns only folios that are not special (e.g., not ++ * the zeropage) and never returns folios that are supposed to be ignored by the ++ * VM as documented by vm_normal_page(). If requested, zeropages will be ++ * returned as well. ++ * ++ * As default, this function only considers present page table entries. ++ * If requested, it will also consider migration entries. ++ * ++ * If this function returns NULL it might either indicate "there is nothing" or ++ * "there is nothing suitable". ++ * ++ * On success, @fw is filled and the function returns the folio while the PTL ++ * is still held and folio_walk_end() must be called to clean up, ++ * releasing any held locks. The returned folio must *not* be used after the ++ * call to folio_walk_end(), unless a short-term folio reference is taken before ++ * that call. ++ * ++ * @fw->page will correspond to the page that is effectively referenced by ++ * @addr. However, for migration entries and shared zeropages @fw->page is ++ * set to NULL. Note that large folios might be mapped by multiple page table ++ * entries, and this function will always only lookup a single entry as ++ * specified by @addr, which might or might not cover more than a single page of ++ * the returned folio. ++ * ++ * This function must *not* be used as a naive replacement for ++ * get_user_pages() / pin_user_pages(), especially not to perform DMA or ++ * to carelessly modify page content. This function may *only* be used to grab ++ * short-term folio references, never to grab long-term folio references. ++ * ++ * Using the page table entry pointers in @fw for reading or modifying the ++ * entry should be avoided where possible: however, there might be valid ++ * use cases. ++ * ++ * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care. ++ * For example, PMD page table sharing might require prior unsharing. 
Also, ++ * logical hugetlb entries might span multiple physical page table entries, ++ * which *must* be modified in a single operation (set_huge_pte_at(), ++ * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might ++ * not correspond to the first physical entry of a logical hugetlb entry. ++ * ++ * The mmap lock must be held in read mode. ++ * ++ * Return: folio pointer on success, otherwise NULL. ++ */ ++ struct folio *folio_walk_start(struct folio_walk *fw, ++ struct vm_area_struct *vma, unsigned long addr, ++ folio_walk_flags_t flags) ++ { ++ unsigned long entry_size; ++ bool expose_page = true; ++ struct page *page; ++ pud_t *pudp, pud; ++ pmd_t *pmdp, pmd; ++ pte_t *ptep, pte; ++ spinlock_t *ptl; ++ pgd_t *pgdp; ++ p4d_t *p4dp; ++ ++ mmap_assert_locked(vma->vm_mm); ++ vma_pgtable_walk_begin(vma); ++ ++ if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end)) ++ goto not_found; ++ ++ pgdp = pgd_offset(vma->vm_mm, addr); ++ if (pgd_none_or_clear_bad(pgdp)) ++ goto not_found; ++ ++ p4dp = p4d_offset(pgdp, addr); ++ if (p4d_none_or_clear_bad(p4dp)) ++ goto not_found; ++ ++ pudp = pud_offset(p4dp, addr); ++ pud = pudp_get(pudp); ++ if (pud_none(pud)) ++ goto not_found; ++ if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) { ++ ptl = pud_lock(vma->vm_mm, pudp); ++ pud = pudp_get(pudp); ++ ++ entry_size = PUD_SIZE; ++ fw->level = FW_LEVEL_PUD; ++ fw->pudp = pudp; ++ fw->pud = pud; ++ ++ if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) { ++ spin_unlock(ptl); ++ goto not_found; ++ } else if (!pud_leaf(pud)) { ++ spin_unlock(ptl); ++ goto pmd_table; ++ } ++ /* ++ * TODO: vm_normal_page_pud() will be handy once we want to ++ * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs. ++ */ ++ page = pud_page(pud); ++ goto found; ++ } ++ ++ pmd_table: ++ VM_WARN_ON_ONCE(pud_leaf(*pudp)); ++ pmdp = pmd_offset(pudp, addr); ++ pmd = pmdp_get_lockless(pmdp); ++ if (pmd_none(pmd)) ++ goto not_found; ++ if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) { ++ ptl = pmd_lock(vma->vm_mm, pmdp); ++ pmd = pmdp_get(pmdp); ++ ++ entry_size = PMD_SIZE; ++ fw->level = FW_LEVEL_PMD; ++ fw->pmdp = pmdp; ++ fw->pmd = pmd; ++ ++ if (pmd_none(pmd)) { ++ spin_unlock(ptl); ++ goto not_found; ++ } else if (!pmd_leaf(pmd)) { ++ spin_unlock(ptl); ++ goto pte_table; ++ } else if (pmd_present(pmd)) { ++ page = vm_normal_page_pmd(vma, addr, pmd); ++ if (page) { ++ goto found; ++ } else if ((flags & FW_ZEROPAGE) && ++ is_huge_zero_pmd(pmd)) { ++ page = pfn_to_page(pmd_pfn(pmd)); ++ expose_page = false; ++ goto found; ++ } ++ } else if ((flags & FW_MIGRATION) && ++ is_pmd_migration_entry(pmd)) { ++ swp_entry_t entry = pmd_to_swp_entry(pmd); ++ ++ page = pfn_swap_entry_to_page(entry); ++ expose_page = false; ++ goto found; ++ } ++ spin_unlock(ptl); ++ goto not_found; ++ } ++ ++ pte_table: ++ VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp))); ++ ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); ++ if (!ptep) ++ goto not_found; ++ pte = ptep_get(ptep); ++ ++ entry_size = PAGE_SIZE; ++ fw->level = FW_LEVEL_PTE; ++ fw->ptep = ptep; ++ fw->pte = pte; ++ ++ if (pte_present(pte)) { ++ page = vm_normal_page(vma, addr, pte); ++ if (page) ++ goto found; ++ if ((flags & FW_ZEROPAGE) && ++ is_zero_pfn(pte_pfn(pte))) { ++ page = pfn_to_page(pte_pfn(pte)); ++ expose_page = false; ++ goto found; ++ } ++ } else if (!pte_none(pte)) { ++ swp_entry_t entry = pte_to_swp_entry(pte); ++ ++ if ((flags & FW_MIGRATION) && ++ is_migration_entry(entry)) { ++ page = 
pfn_swap_entry_to_page(entry); ++ expose_page = false; ++ goto found; ++ } ++ } ++ pte_unmap_unlock(ptep, ptl); ++ not_found: ++ vma_pgtable_walk_end(vma); ++ return NULL; ++ found: ++ if (expose_page) ++ /* Note: Offset from the mapped page, not the folio start. */ ++ fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT); ++ else ++ fw->page = NULL; ++ fw->ptl = ptl; ++ return page_folio(page); ++ } +++>>>>>>> 10d83d7781a8 (mm/pagewalk: check pfnmap for folio_walk_start()) +diff --git a/mm/memory.c b/mm/memory.c +index e2794e3b8919..e8a797dd7721 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -659,11 +659,10 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + { + unsigned long pfn = pmd_pfn(pmd); + +- /* +- * There is no pmd_special() but there may be special pmds, e.g. +- * in a direct-access (dax) mapping, so let's just replicate the +- * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. +- */ ++ /* Currently it's only used for huge pfnmaps */ ++ if (unlikely(pmd_special(pmd))) ++ return NULL; ++ + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) +* Unmerged path mm/pagewalk.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/29ae7d96.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/29ae7d96.failed new file mode 100644 index 0000000000000..6ceae3c4b0909 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/29ae7d96.failed @@ -0,0 +1,287 @@ +mm: pass VMA instead of MM to follow_pte() + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author David Hildenbrand +commit 29ae7d96d166fa08c7232daf8a314ef5ba1efd20 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/29ae7d96.failed + +... and centralize the VM_IO/VM_PFNMAP sanity check in there. We'll +now also perform these sanity checks for direct follow_pte() +invocations. + +For generic_access_phys(), we might now check multiple times: nothing to +worry about, really. + +Link: https://lkml.kernel.org/r/20240410155527.474777-3-david@redhat.com + Signed-off-by: David Hildenbrand + Acked-by: Sean Christopherson [KVM] + Cc: Alex Williamson + Cc: Christoph Hellwig + Cc: Fei Li + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Paolo Bonzini + Cc: Yonghua Huang + Signed-off-by: Andrew Morton +(cherry picked from commit 29ae7d96d166fa08c7232daf8a314ef5ba1efd20) + Signed-off-by: Jonathan Maple + +# Conflicts: +# arch/x86/mm/pat/memtype.c +# drivers/virt/acrn/mm.c +diff --cc arch/x86/mm/pat/memtype.c +index 36b603d0cdde,bdc2a240c2aa..000000000000 +--- a/arch/x86/mm/pat/memtype.c ++++ b/arch/x86/mm/pat/memtype.c +@@@ -947,6 -948,29 +947,32 @@@ static void free_pfn_range(u64 paddr, u + memtype_free(paddr, paddr + size); + } + +++<<<<<<< HEAD +++======= ++ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, ++ resource_size_t *phys) ++ { ++ pte_t *ptep, pte; ++ spinlock_t *ptl; ++ ++ if (follow_pte(vma, vma->vm_start, &ptep, &ptl)) ++ return -EINVAL; ++ ++ pte = ptep_get(ptep); ++ ++ /* Never return PFNs of anon folios in COW mappings. 
*/ ++ if (vm_normal_folio(vma, vma->vm_start, pte)) { ++ pte_unmap_unlock(ptep, ptl); ++ return -EINVAL; ++ } ++ ++ *prot = pgprot_val(pte_pgprot(pte)); ++ *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; ++ pte_unmap_unlock(ptep, ptl); ++ return 0; ++ } ++ +++>>>>>>> 29ae7d96d166 (mm: pass VMA instead of MM to follow_pte()) + static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, + pgprot_t *pgprot) + { +diff --cc drivers/virt/acrn/mm.c +index c4f2e15c8a2b,db8ff1d0ac23..000000000000 +--- a/drivers/virt/acrn/mm.c ++++ b/drivers/virt/acrn/mm.c +@@@ -168,7 -170,69 +168,73 @@@ int acrn_vm_ram_map(struct acrn_vm *vm + + /* Get the page number of the map region */ + nr_pages = memmap->len >> PAGE_SHIFT; +++<<<<<<< HEAD + + pages = vzalloc(nr_pages * sizeof(struct page *)); +++======= ++ if (!nr_pages) ++ return -EINVAL; ++ ++ mmap_read_lock(current->mm); ++ vma = vma_lookup(current->mm, memmap->vma_base); ++ if (vma && ((vma->vm_flags & VM_PFNMAP) != 0)) { ++ unsigned long start_pfn, cur_pfn; ++ spinlock_t *ptl; ++ bool writable; ++ pte_t *ptep; ++ ++ if ((memmap->vma_base + memmap->len) > vma->vm_end) { ++ mmap_read_unlock(current->mm); ++ return -EINVAL; ++ } ++ ++ for (i = 0; i < nr_pages; i++) { ++ ret = follow_pte(vma, memmap->vma_base + i * PAGE_SIZE, ++ &ptep, &ptl); ++ if (ret) ++ break; ++ ++ cur_pfn = pte_pfn(ptep_get(ptep)); ++ if (i == 0) ++ start_pfn = cur_pfn; ++ writable = !!pte_write(ptep_get(ptep)); ++ pte_unmap_unlock(ptep, ptl); ++ ++ /* Disallow write access if the PTE is not writable. */ ++ if (!writable && ++ (memmap->attr & ACRN_MEM_ACCESS_WRITE)) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ /* Disallow refcounted pages. */ ++ if (pfn_valid(cur_pfn) && ++ !PageReserved(pfn_to_page(cur_pfn))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ /* Disallow non-contiguous ranges. 
*/ ++ if (cur_pfn != start_pfn + i) { ++ ret = -EINVAL; ++ break; ++ } ++ } ++ mmap_read_unlock(current->mm); ++ ++ if (ret) { ++ dev_dbg(acrn_dev.this_device, ++ "Failed to lookup PFN at VMA:%pK.\n", (void *)memmap->vma_base); ++ return ret; ++ } ++ ++ return acrn_mm_region_add(vm, memmap->user_vm_pa, ++ PFN_PHYS(start_pfn), memmap->len, ++ ACRN_MEM_TYPE_WB, memmap->attr); ++ } ++ mmap_read_unlock(current->mm); ++ ++ pages = vzalloc(array_size(nr_pages, sizeof(*pages))); +++>>>>>>> 29ae7d96d166 (mm: pass VMA instead of MM to follow_pte()) + if (!pages) + return -ENOMEM; + +diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c +index 588089332931..bca6af2ee723 100644 +--- a/arch/s390/pci/pci_mmio.c ++++ b/arch/s390/pci/pci_mmio.c +@@ -169,7 +169,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock_mmap; + +- ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); ++ ret = follow_pte(vma, mmio_addr, &ptep, &ptl); + if (ret) + goto out_unlock_mmap; + +@@ -308,7 +308,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock_mmap; + +- ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); ++ ret = follow_pte(vma, mmio_addr, &ptep, &ptl); + if (ret) + goto out_unlock_mmap; + +* Unmerged path arch/x86/mm/pat/memtype.c +diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c +index 6c6586af7953..ec4d0003ba2f 100644 +--- a/drivers/vfio/vfio_iommu_type1.c ++++ b/drivers/vfio/vfio_iommu_type1.c +@@ -520,7 +520,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, + spinlock_t *ptl; + int ret; + +- ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); ++ ret = follow_pte(vma, vaddr, &ptep, &ptl); + if (ret) { + bool unlocked = false; + +@@ -534,7 +534,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, + if (ret) + return ret; + +- ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); ++ ret = follow_pte(vma, vaddr, &ptep, &ptl); + if (ret) + return ret; + } +* Unmerged path drivers/virt/acrn/mm.c +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 196c481ec160..b85fd05660e5 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2427,7 +2427,7 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, + unsigned long end, unsigned long floor, unsigned long ceiling); + int + copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); +-int follow_pte(struct mm_struct *mm, unsigned long address, ++int follow_pte(struct vm_area_struct *vma, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp); + int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn); +diff --git a/mm/memory.c b/mm/memory.c +index e2794e3b8919..6706b9830402 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5609,7 +5609,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) + + /** + * follow_pte - look up PTE at a user virtual address +- * @mm: the mm_struct of the target address space ++ * @vma: the memory mapping + * @address: user virtual address + * @ptepp: location to store found PTE + * @ptlp: location to store the lock for the PTE +@@ -5628,15 +5628,19 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) + * + * Return: zero on success, -ve otherwise. 
+ */ +-int follow_pte(struct mm_struct *mm, unsigned long address, ++int follow_pte(struct vm_area_struct *vma, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) + { ++ struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + ++ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) ++ goto out; ++ + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out; +@@ -5754,11 +5758,8 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + int offset = offset_in_page(addr); + int ret = -EINVAL; + +- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) +- return -EINVAL; +- + retry: +- if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) ++ if (follow_pte(vma, addr, &ptep, &ptl)) + return -EINVAL; + pte = ptep_get(ptep); + pte_unmap_unlock(ptep, ptl); +@@ -5773,7 +5774,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + if (!maddr) + return -ENOMEM; + +- if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) ++ if (follow_pte(vma, addr, &ptep, &ptl)) + goto out_unmap; + + if (!pte_same(pte, ptep_get(ptep))) { +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index b163a079fe65..acd8c5aee080 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -2884,7 +2884,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, + spinlock_t *ptl; + int r; + +- r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); ++ r = follow_pte(vma, addr, &ptep, &ptl); + if (r) { + /* + * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does +@@ -2899,7 +2899,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, + if (r) + return r; + +- r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); ++ r = follow_pte(vma, addr, &ptep, &ptl); + if (r) + return r; + } diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/3e509c9b.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/3e509c9b.failed new file mode 100644 index 0000000000000..c30be2e3188c4 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/3e509c9b.failed @@ -0,0 +1,118 @@ +mm/arm64: support large pfn mappings + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit 3e509c9b03f9abc7804c80bed266a6cc4286a5a8 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/3e509c9b.failed + +Support huge pfnmaps by using bit 56 (PTE_SPECIAL) for "special" on +pmds/puds. Provide the pmd/pud helpers to set/get special bit. + +There's one more thing missing for arm64 which is the pxx_pgprot() for +pmd/pud. Add them too, which is mostly the same as the pte version by +dropping the pfn field. These helpers are essential to be used in the new +follow_pfnmap*() API to report valid pgprot_t results. + +Note that arm64 doesn't yet support huge PUD yet, but it's still +straightforward to provide the pud helpers that we need altogether. Only +PMD helpers will make an immediate benefit until arm64 will support huge +PUDs first in general (e.g. in THPs). 
+ +Link: https://lkml.kernel.org/r/20240826204353.2228736-19-peterx@redhat.com + Signed-off-by: Peter Xu + Cc: Catalin Marinas + Cc: Will Deacon + Cc: Alexander Gordeev + Cc: Alex Williamson + Cc: Aneesh Kumar K.V + Cc: Borislav Petkov + Cc: Christian Borntraeger + Cc: Dave Hansen + Cc: David Hildenbrand + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Jason Gunthorpe + Cc: Matthew Wilcox + Cc: Niklas Schnelle + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Sven Schnelle + Cc: Thomas Gleixner + Cc: Vasily Gorbik + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit 3e509c9b03f9abc7804c80bed266a6cc4286a5a8) + Signed-off-by: Jonathan Maple + +# Conflicts: +# arch/arm64/Kconfig +diff --cc arch/arm64/Kconfig +index ca1f39e49631,6607ed8fdbb4..000000000000 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@@ -97,7 -99,7 +97,11 @@@ config ARM6 + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PAGE_TABLE_CHECK + select ARCH_SUPPORTS_PER_VMA_LOCK +++<<<<<<< HEAD + + select ARCH_SUPPORTS_RT +++======= ++ select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE +++>>>>>>> 3e509c9b03f9 (mm/arm64: support large pfn mappings) + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT +* Unmerged path arch/arm64/Kconfig +diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h +index e3ea0ef9673d..7fa291d3f90a 100644 +--- a/arch/arm64/include/asm/pgtable.h ++++ b/arch/arm64/include/asm/pgtable.h +@@ -528,6 +528,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) + return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP))); + } + ++#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP ++#define pmd_special(pte) (!!((pmd_val(pte) & PTE_SPECIAL))) ++static inline pmd_t pmd_mkspecial(pmd_t pmd) ++{ ++ return set_pmd_bit(pmd, __pgprot(PTE_SPECIAL)); ++} ++#endif ++ + #define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd)) + #define __phys_to_pmd_val(phys) __phys_to_pte_val(phys) + #define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT) +@@ -545,6 +553,27 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) + #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) + #define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) + ++#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP ++#define pud_special(pte) pte_special(pud_pte(pud)) ++#define pud_mkspecial(pte) pte_pud(pte_mkspecial(pud_pte(pud))) ++#endif ++ ++#define pmd_pgprot pmd_pgprot ++static inline pgprot_t pmd_pgprot(pmd_t pmd) ++{ ++ unsigned long pfn = pmd_pfn(pmd); ++ ++ return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd)); ++} ++ ++#define pud_pgprot pud_pgprot ++static inline pgprot_t pud_pgprot(pud_t pud) ++{ ++ unsigned long pfn = pud_pfn(pud); ++ ++ return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud)); ++} ++ + static inline void __set_pte_at(struct mm_struct *mm, + unsigned long __always_unused addr, + pte_t *ptep, pte_t pte, unsigned int nr) diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/5731aacd.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/5731aacd.failed new file mode 100644 index 0000000000000..debf935c26f86 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/5731aacd.failed @@ -0,0 +1,80 @@ +KVM: use follow_pfnmap API + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit 
5731aacd54a883dd2c1a5e8c85e1fe78fc728dc7 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/5731aacd.failed + +Use the new pfnmap API to allow huge MMIO mappings for VMs. The rest work +is done perfectly on the other side (host_pfn_mapping_level()). + +Link: https://lkml.kernel.org/r/20240826204353.2228736-11-peterx@redhat.com + Signed-off-by: Peter Xu + Cc: Paolo Bonzini + Cc: Sean Christopherson + Cc: Alexander Gordeev + Cc: Alex Williamson + Cc: Aneesh Kumar K.V + Cc: Borislav Petkov + Cc: Catalin Marinas + Cc: Christian Borntraeger + Cc: Dave Hansen + Cc: David Hildenbrand + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Jason Gunthorpe + Cc: Matthew Wilcox + Cc: Niklas Schnelle + Cc: Ryan Roberts + Cc: Sven Schnelle + Cc: Thomas Gleixner + Cc: Vasily Gorbik + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit 5731aacd54a883dd2c1a5e8c85e1fe78fc728dc7) + Signed-off-by: Jonathan Maple + +# Conflicts: +# virt/kvm/kvm_main.c +diff --cc virt/kvm/kvm_main.c +index b163a079fe65,f416d5e3f9c0..000000000000 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@@ -2878,13 -2860,11 +2878,15 @@@ static int hva_to_pfn_remapped(struct v + unsigned long addr, bool write_fault, + bool *writable, kvm_pfn_t *p_pfn) + { ++ struct follow_pfnmap_args args = { .vma = vma, .address = addr }; + kvm_pfn_t pfn; +- pte_t *ptep; +- pte_t pte; +- spinlock_t *ptl; + int r; + +++<<<<<<< HEAD + + r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); +++======= ++ r = follow_pfnmap_start(&args); +++>>>>>>> 5731aacd54a8 (KVM: use follow_pfnmap API) + if (r) { + /* + * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does +@@@ -2899,7 -2879,7 +2901,11 @@@ + if (r) + return r; + +++<<<<<<< HEAD + + r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); +++======= ++ r = follow_pfnmap_start(&args); +++>>>>>>> 5731aacd54a8 (KVM: use follow_pfnmap API) + if (r) + return r; + } +* Unmerged path virt/kvm/kvm_main.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/5b34b76c.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/5b34b76c.failed new file mode 100644 index 0000000000000..8f30775e13fed --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/5b34b76c.failed @@ -0,0 +1,148 @@ +mm: move follow_phys to arch/x86/mm/pat/memtype.c + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Christoph Hellwig +commit 5b34b76cb0cd8a21dee5c7677eae98480b0d05cc +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/5b34b76c.failed + +follow_phys is only used by two callers in arch/x86/mm/pat/memtype.c. +Move it there and hardcode the two arguments that get the same values +passed by both callers. 
+ +[david@redhat.com: conflict resolutions] +Link: https://lkml.kernel.org/r/20240403212131.929421-4-david@redhat.com +Link: https://lkml.kernel.org/r/20240324234542.2038726-4-hch@lst.de + Signed-off-by: Christoph Hellwig + Signed-off-by: David Hildenbrand + Reviewed-by: David Hildenbrand + Cc: Andy Lutomirski + Cc: Dave Hansen + Cc: Fei Li + Cc: Ingo Molnar + Cc: Peter Zijlstra + Cc: Nathan Chancellor + Signed-off-by: Andrew Morton +(cherry picked from commit 5b34b76cb0cd8a21dee5c7677eae98480b0d05cc) + Signed-off-by: Jonathan Maple + +# Conflicts: +# include/linux/mm.h +diff --cc include/linux/mm.h +index 196c481ec160,5dc65618e386..000000000000 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@@ -2429,10 -2424,6 +2429,13 @@@ in + copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); + int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp); +++<<<<<<< HEAD + +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + + unsigned long *pfn); + +int follow_phys(struct vm_area_struct *vma, unsigned long address, + + unsigned int flags, unsigned long *prot, resource_size_t *phys); +++======= +++>>>>>>> 5b34b76cb0cd (mm: move follow_phys to arch/x86/mm/pat/memtype.c) + int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); + +diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c +index 36b603d0cdde..d01c3b0bd6eb 100644 +--- a/arch/x86/mm/pat/memtype.c ++++ b/arch/x86/mm/pat/memtype.c +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -947,6 +948,32 @@ static void free_pfn_range(u64 paddr, unsigned long size) + memtype_free(paddr, paddr + size); + } + ++static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, ++ resource_size_t *phys) ++{ ++ pte_t *ptep, pte; ++ spinlock_t *ptl; ++ ++ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) ++ return -EINVAL; ++ ++ if (follow_pte(vma->vm_mm, vma->vm_start, &ptep, &ptl)) ++ return -EINVAL; ++ ++ pte = ptep_get(ptep); ++ ++ /* Never return PFNs of anon folios in COW mappings. */ ++ if (vm_normal_folio(vma, vma->vm_start, pte)) { ++ pte_unmap_unlock(ptep, ptl); ++ return -EINVAL; ++ } ++ ++ *prot = pgprot_val(pte_pgprot(pte)); ++ *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; ++ pte_unmap_unlock(ptep, ptl); ++ return 0; ++} ++ + static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, + pgprot_t *pgprot) + { +@@ -964,7 +991,7 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, + * detect the PFN. If we need the cachemode as well, we're out of luck + * for now and have to fail fork(). 
+ */ +- if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) { ++ if (!follow_phys(vma, &prot, paddr)) { + if (pgprot) + *pgprot = __pgprot(prot); + return 0; +* Unmerged path include/linux/mm.h +diff --git a/mm/memory.c b/mm/memory.c +index e2794e3b8919..257618e95c0e 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5699,38 +5699,6 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, + EXPORT_SYMBOL(follow_pfn); + + #ifdef CONFIG_HAVE_IOREMAP_PROT +-int follow_phys(struct vm_area_struct *vma, +- unsigned long address, unsigned int flags, +- unsigned long *prot, resource_size_t *phys) +-{ +- int ret = -EINVAL; +- pte_t *ptep, pte; +- spinlock_t *ptl; +- +- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) +- goto out; +- +- if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) +- goto out; +- pte = ptep_get(ptep); +- +- /* Never return PFNs of anon folios in COW mappings. */ +- if (vm_normal_folio(vma, address, pte)) +- goto unlock; +- +- if ((flags & FOLL_WRITE) && !pte_write(pte)) +- goto unlock; +- +- *prot = pgprot_val(pte_pgprot(pte)); +- *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; +- +- ret = 0; +-unlock: +- pte_unmap_unlock(ptep, ptl); +-out: +- return ret; +-} +- + /** + * generic_access_phys - generic implementation for iomem mmap access + * @vma: the vma to access diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/62fb8adc.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/62fb8adc.failed new file mode 100644 index 0000000000000..f77fa7edb2880 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/62fb8adc.failed @@ -0,0 +1,268 @@ +mm: Provide address mask in struct follow_pfnmap_args + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Alex Williamson +commit 62fb8adc43afad5fa1c9cadc6f3a8e9fb72af194 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/62fb8adc.failed + +follow_pfnmap_start() walks the page table for a given address and +fills out the struct follow_pfnmap_args in pfnmap_args_setup(). +The address mask of the page table level is already provided to this +latter function for calculating the pfn. This address mask can also +be useful for the caller to determine the extent of the contiguous +mapping. + +For example, vfio-pci now supports huge_fault for pfnmaps and is able +to insert pud and pmd mappings. When we DMA map these pfnmaps, ex. +PCI MMIO BARs, we iterate follow_pfnmap_start() to get each pfn to test +for a contiguous pfn range. Providing the mapping address mask allows +us to skip the extent of the mapping level. Assuming a 1GB pud level +and 4KB page size, iterations are reduced by a factor of 256K. In wall +clock time, mapping a 32GB PCI BAR is reduced from ~1s to <1ms. 
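
As a minimal sketch only (not part of the patch), the loop below shows how a caller might use follow_pfnmap_start()/follow_pfnmap_end() together with the addr_mask output described above to step through a VM_PFNMAP range one leaf at a time instead of one page at a time. It assumes the mmap lock is already held for read, uses only the follow_pfnmap_args fields shown in this series, and count_pfnmap_chunks() is an illustrative name, not an existing function; error handling is reduced to the bare minimum.

#include <linux/mm.h>

/*
 * Count the number of leaf mappings (PTE/PMD/PUD extents) backing a
 * VM_PFNMAP range.  Illustrative sketch; caller holds mmap lock for read.
 */
static long count_pfnmap_chunks(struct vm_area_struct *vma,
				unsigned long vaddr, unsigned long end)
{
	long chunks = 0;

	while (vaddr < end) {
		struct follow_pfnmap_args args = {
			.vma = vma,
			.address = vaddr,
		};
		unsigned long mask;

		if (follow_pfnmap_start(&args))
			return -EFAULT;
		/* addr_mask covers the whole leaf: PAGE_MASK, PMD_MASK or PUD_MASK. */
		mask = args.addr_mask;
		follow_pfnmap_end(&args);

		/* Step past the entire PTE/PMD/PUD extent in one go. */
		vaddr = (vaddr & mask) + (~mask + 1);
		chunks++;
	}
	return chunks;
}

In a real pinning path the pfn would of course be consumed while the start()/end() critical section is still held; the sketch only illustrates how addr_mask collapses the per-page iteration that the commit message measures.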
+ + Cc: Andrew Morton + Cc: David Hildenbrand + Cc: linux-mm@kvack.org + Reviewed-by: Peter Xu + Reviewed-by: Mitchell Augustin + Tested-by: Mitchell Augustin + Reviewed-by: Jason Gunthorpe + Acked-by: David Hildenbrand +Link: https://lore.kernel.org/r/20250218222209.1382449-6-alex.williamson@redhat.com + Signed-off-by: Alex Williamson +(cherry picked from commit 62fb8adc43afad5fa1c9cadc6f3a8e9fb72af194) + Signed-off-by: Jonathan Maple + +# Conflicts: +# include/linux/mm.h +# mm/memory.c +diff --cc include/linux/mm.h +index 196c481ec160,92b30dba7e38..000000000000 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@@ -2436,6 -2398,39 +2436,42 @@@ int follow_phys(struct vm_area_struct * + int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); + +++<<<<<<< HEAD +++======= ++ struct follow_pfnmap_args { ++ /** ++ * Inputs: ++ * @vma: Pointer to @vm_area_struct struct ++ * @address: the virtual address to walk ++ */ ++ struct vm_area_struct *vma; ++ unsigned long address; ++ /** ++ * Internals: ++ * ++ * The caller shouldn't touch any of these. ++ */ ++ spinlock_t *lock; ++ pte_t *ptep; ++ /** ++ * Outputs: ++ * ++ * @pfn: the PFN of the address ++ * @addr_mask: address mask covering pfn ++ * @pgprot: the pgprot_t of the mapping ++ * @writable: whether the mapping is writable ++ * @special: whether the mapping is a special mapping (real PFN maps) ++ */ ++ unsigned long pfn; ++ unsigned long addr_mask; ++ pgprot_t pgprot; ++ bool writable; ++ bool special; ++ }; ++ int follow_pfnmap_start(struct follow_pfnmap_args *args); ++ void follow_pfnmap_end(struct follow_pfnmap_args *args); ++ +++>>>>>>> 62fb8adc43af (mm: Provide address mask in struct follow_pfnmap_args) + extern void truncate_pagecache(struct inode *inode, loff_t new); + extern void truncate_setsize(struct inode *inode, loff_t newsize); + void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); +diff --cc mm/memory.c +index e2794e3b8919,68aa0f11633e..000000000000 +--- a/mm/memory.c ++++ b/mm/memory.c +@@@ -5607,60 -6479,137 +5607,92 @@@ int __pmd_alloc(struct mm_struct *mm, p + } + #endif /* __PAGETABLE_PMD_FOLDED */ + +++<<<<<<< HEAD +++======= ++ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, ++ spinlock_t *lock, pte_t *ptep, ++ pgprot_t pgprot, unsigned long pfn_base, ++ unsigned long addr_mask, bool writable, ++ bool special) ++ { ++ args->lock = lock; ++ args->ptep = ptep; ++ args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); ++ args->addr_mask = addr_mask; ++ args->pgprot = pgprot; ++ args->writable = writable; ++ args->special = special; ++ } ++ ++ static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) ++ { ++ #ifdef CONFIG_LOCKDEP ++ struct file *file = vma->vm_file; ++ struct address_space *mapping = file ? file->f_mapping : NULL; ++ ++ if (mapping) ++ lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) || ++ lockdep_is_held(&vma->vm_mm->mmap_lock)); ++ else ++ lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); ++ #endif ++ } ++ +++>>>>>>> 62fb8adc43af (mm: Provide address mask in struct follow_pfnmap_args) + /** + - * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address + - * @args: Pointer to struct @follow_pfnmap_args + - * + - * The caller needs to setup args->vma and args->address to point to the + - * virtual address as the target of such lookup. On a successful return, + - * the results will be put into other output fields. 
+ + * follow_pte - look up PTE at a user virtual address + + * @mm: the mm_struct of the target address space + + * @address: user virtual address + + * @ptepp: location to store found PTE + + * @ptlp: location to store the lock for the PTE + * + - * After the caller finished using the fields, the caller must invoke + - * another follow_pfnmap_end() to proper releases the locks and resources + - * of such look up request. + - * + - * During the start() and end() calls, the results in @args will be valid + - * as proper locks will be held. After the end() is called, all the fields + - * in @follow_pfnmap_args will be invalid to be further accessed. Further + - * use of such information after end() may require proper synchronizations + - * by the caller with page table updates, otherwise it can create a + - * security bug. + - * + - * If the PTE maps a refcounted page, callers are responsible to protect + - * against invalidation with MMU notifiers; otherwise access to the PFN at + - * a later point in time can trigger use-after-free. + + * On a successful return, the pointer to the PTE is stored in @ptepp; + + * the corresponding lock is taken and its location is stored in @ptlp. + + * The contents of the PTE are only stable until @ptlp is released; + + * any further use, if any, must be protected against invalidation + + * with MMU notifiers. + * + * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore + - * should be taken for read, and the mmap semaphore cannot be released + - * before the end() is invoked. + + * should be taken for read. + * + - * This function must not be used to modify PTE content. + + * KVM uses this function. While it is arguably less bad than ``follow_pfn``, + + * it is not a good general-purpose API. + * + - * Return: zero on success, negative otherwise. + + * Return: zero on success, -ve otherwise. 
+ */ + -int follow_pfnmap_start(struct follow_pfnmap_args *args) + +int follow_pte(struct mm_struct *mm, unsigned long address, + + pte_t **ptepp, spinlock_t **ptlp) + { + - struct vm_area_struct *vma = args->vma; + - unsigned long address = args->address; + - struct mm_struct *mm = vma->vm_mm; + - spinlock_t *lock; + - pgd_t *pgdp; + - p4d_t *p4dp, p4d; + - pud_t *pudp, pud; + - pmd_t *pmdp, pmd; + - pte_t *ptep, pte; + - + - pfnmap_lockdep_assert(vma); + - + - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + - goto out; + + pgd_t *pgd; + + p4d_t *p4d; + + pud_t *pud; + + pmd_t *pmd; + + pte_t *ptep; + + - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + - goto out; + -retry: + - pgdp = pgd_offset(mm, address); + - if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) + + pgd = pgd_offset(mm, address); + + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out; + + - p4dp = p4d_offset(pgdp, address); + - p4d = READ_ONCE(*p4dp); + - if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) + + p4d = p4d_offset(pgd, address); + + if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + goto out; + + - pudp = pud_offset(p4dp, address); + - pud = READ_ONCE(*pudp); + - if (pud_none(pud)) + + pud = pud_offset(p4d, address); + + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out; + - if (pud_leaf(pud)) { + - lock = pud_lock(mm, pudp); + - if (!unlikely(pud_leaf(pud))) { + - spin_unlock(lock); + - goto retry; + - } + - pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), + - pud_pfn(pud), PUD_MASK, pud_write(pud), + - pud_special(pud)); + - return 0; + - } + + - pmdp = pmd_offset(pudp, address); + - pmd = pmdp_get_lockless(pmdp); + - if (pmd_leaf(pmd)) { + - lock = pmd_lock(mm, pmdp); + - if (!unlikely(pmd_leaf(pmd))) { + - spin_unlock(lock); + - goto retry; + - } + - pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), + - pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), + - pmd_special(pmd)); + - return 0; + - } + + pmd = pmd_offset(pud, address); + + VM_BUG_ON(pmd_trans_huge(*pmd)); + + - ptep = pte_offset_map_lock(mm, pmdp, address, &lock); + + ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + if (!ptep) + goto out; + - pte = ptep_get(ptep); + - if (!pte_present(pte)) + + if (!pte_present(ptep_get(ptep))) + goto unlock; + - pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), + - pte_pfn(pte), PAGE_MASK, pte_write(pte), + - pte_special(pte)); + + *ptepp = ptep; + return 0; + unlock: + - pte_unmap_unlock(ptep, lock); + + pte_unmap_unlock(ptep, *ptlp); + out: + return -EINVAL; + } +* Unmerged path include/linux/mm.h +* Unmerged path mm/memory.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/6857be5f.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/6857be5f.failed new file mode 100644 index 0000000000000..865ebb964bd95 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/6857be5f.failed @@ -0,0 +1,246 @@ +mm: introduce ARCH_SUPPORTS_HUGE_PFNMAP and special bits to pmd/pud + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit 6857be5fecaebd9773ff27b6d29b6fff3b1abbce +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/6857be5f.failed + +Patch series "mm: Support huge pfnmaps", v2. + +Overview +======== + +This series implements huge pfnmaps support for mm in general. Huge +pfnmap allows e.g. 
VM_PFNMAP vmas to map in either PMD or PUD levels, +similar to what we do with dax / thp / hugetlb so far to benefit from TLB +hits. Now we extend that idea to PFN mappings, e.g. PCI MMIO bars where +it can grow as large as 8GB or even bigger. + +Currently, only x86_64 (1G+2M) and arm64 (2M) are supported. The last +patch (from Alex Williamson) will be the first user of huge pfnmap, so as +to enable vfio-pci driver to fault in huge pfn mappings. + +Implementation +============== + +In reality, it's relatively simple to add such support comparing to many +other types of mappings, because of PFNMAP's specialties when there's no +vmemmap backing it, so that most of the kernel routines on huge mappings +should simply already fail for them, like GUPs or old-school follow_page() +(which is recently rewritten to be folio_walk* APIs by David). + +One trick here is that we're still unmature on PUDs in generic paths here +and there, as DAX is so far the only user. This patchset will add the 2nd +user of it. Hugetlb can be a 3rd user if the hugetlb unification work can +go on smoothly, but to be discussed later. + +The other trick is how to allow gup-fast working for such huge mappings +even if there's no direct sign of knowing whether it's a normal page or +MMIO mapping. This series chose to keep the pte_special solution, so that +it reuses similar idea on setting a special bit to pfnmap PMDs/PUDs so +that gup-fast will be able to identify them and fail properly. + +Along the way, we'll also notice that the major pgtable pfn walker, aka, +follow_pte(), will need to retire soon due to the fact that it only works +with ptes. A new set of simple API is introduced (follow_pfnmap* API) to +be able to do whatever follow_pte() can already do, plus that it can also +process huge pfnmaps now. Half of this series is about that and +converting all existing pfnmap walkers to use the new API properly. +Hopefully the new API also looks better to avoid exposing e.g. pgtable +lock details into the callers, so that it can be used in an even more +straightforward way. + +Here, three more options will be introduced and involved in huge pfnmap: + + - ARCH_SUPPORTS_HUGE_PFNMAP + + Arch developers will need to select this option when huge pfnmap is + supported in arch's Kconfig. After this patchset applied, both x86_64 + and arm64 will start to enable it by default. + + - ARCH_SUPPORTS_PMD_PFNMAP / ARCH_SUPPORTS_PUD_PFNMAP + + These options are for driver developers to identify whether current + arch / config supports huge pfnmaps, making decision on whether it can + use the huge pfnmap APIs to inject them. One can refer to the last + vfio-pci patch from Alex on the use of them properly in a device + driver. + +So after the whole set applied, and if one would enable some dynamic debug +lines in vfio-pci core files, we should observe things like: + + vfio-pci 0000:00:06.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x0: 0x100 + vfio-pci 0000:00:06.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x200: 0x100 + vfio-pci 0000:00:06.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x400: 0x100 + +In this specific case, it says that vfio-pci faults in PMDs properly for a +few BAR0 offsets. 
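For scale: order = 9 means each fault installs 2^9 = 512 base pages at once, i.e. a 2MB PMD mapping with 4KB pages, which is why the BAR 0 page offsets in the trace above advance in 0x200 (512-page) steps.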
+ +Patch Layout +============ + +Patch 1: Introduce the new options mentioned above for huge PFNMAPs +Patch 2: A tiny cleanup +Patch 3-8: Preparation patches for huge pfnmap (include introduce + special bit for pmd/pud) +Patch 9-16: Introduce follow_pfnmap*() API, use it everywhere, and + then drop follow_pte() API +Patch 17: Add huge pfnmap support for x86_64 +Patch 18: Add huge pfnmap support for arm64 +Patch 19: Add vfio-pci support for all kinds of huge pfnmaps (Alex) + +TODO +==== + +More architectures / More page sizes +------------------------------------ + +Currently only x86_64 (2M+1G) and arm64 (2M) are supported. There seems +to have plan to support arm64 1G later on top of this series [2]. + +Any arch will need to first support THP / THP_1G, then provide a special +bit in pmds/puds to support huge pfnmaps. + +remap_pfn_range() support +------------------------- + +Currently, remap_pfn_range() still only maps PTEs. With the new option, +remap_pfn_range() can logically start to inject either PMDs or PUDs when +the alignment requirements match on the VAs. + +When the support is there, it should be able to silently benefit all +drivers that is using remap_pfn_range() in its mmap() handler on better +TLB hit rate and overall faster MMIO accesses similar to processor on +hugepages. + +More driver support +------------------- + +VFIO is so far the only consumer for the huge pfnmaps after this series +applied. Besides above remap_pfn_range() generic optimization, device +driver can also try to optimize its mmap() on a better VA alignment for +either PMD/PUD sizes. This may, iiuc, normally require userspace changes, +as the driver doesn't normally decide the VA to map a bar. But I don't +think I know all the drivers to know the full picture. + +Credits all go to Alex on help testing the GPU/NIC use cases above. + +[0] https://lore.kernel.org/r/73ad9540-3fb8-4154-9a4f-30a0a2b03d41@lucifer.local +[1] https://lore.kernel.org/r/20240807194812.819412-1-peterx@redhat.com +[2] https://lore.kernel.org/r/498e0731-81a4-4f75-95b4-a8ad0bcc7665@huawei.com + + +This patch (of 19): + +This patch introduces the option to introduce special pte bit into +pmd/puds. Archs can start to define pmd_special / pud_special when +supported by selecting the new option. Per-arch support will be added +later. + +Before that, create fallbacks for these helpers so that they are always +available. 
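To make the per-level options concrete, a fault handler that wants to exploit them would typically dispatch on the fault order, along the lines of the vfio-pci handler shown further down in this series. Sketch only; order, pfn, vma, vmf and ret are assumed to be set up by the surrounding driver code:

	switch (order) {
	case 0:
		ret = vmf_insert_pfn(vma, vmf->address, pfn);
		break;
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
	case PMD_ORDER:
		/* only compiled in when the arch selects PMD-level pfnmaps */
		ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn, PFN_DEV), false);
		break;
#endif
#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
	case PUD_ORDER:
		ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn, PFN_DEV), false);
		break;
#endif
	default:
		ret = VM_FAULT_FALLBACK;
	}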
+ +Link: https://lkml.kernel.org/r/20240826204353.2228736-1-peterx@redhat.com +Link: https://lkml.kernel.org/r/20240826204353.2228736-2-peterx@redhat.com + Signed-off-by: Peter Xu + Cc: Alexander Gordeev + Cc: Alex Williamson + Cc: Aneesh Kumar K.V + Cc: Borislav Petkov + Cc: Catalin Marinas + Cc: Christian Borntraeger + Cc: Dave Hansen + Cc: David Hildenbrand + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Jason Gunthorpe + Cc: Matthew Wilcox + Cc: Niklas Schnelle + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Sven Schnelle + Cc: Thomas Gleixner + Cc: Vasily Gorbik + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit 6857be5fecaebd9773ff27b6d29b6fff3b1abbce) + Signed-off-by: Jonathan Maple + +# Conflicts: +# mm/Kconfig +diff --cc mm/Kconfig +index a91823e31f45,1aa282e35dc7..000000000000 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@@ -898,6 -870,25 +898,28 @@@ config READ_ONLY_THP_FOR_F + endif # TRANSPARENT_HUGEPAGE + + # +++<<<<<<< HEAD +++======= ++ # The architecture supports pgtable leaves that is larger than PAGE_SIZE ++ # ++ config PGTABLE_HAS_HUGE_LEAVES ++ def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE ++ ++ # TODO: Allow to be enabled without THP ++ config ARCH_SUPPORTS_HUGE_PFNMAP ++ def_bool n ++ depends on TRANSPARENT_HUGEPAGE ++ ++ config ARCH_SUPPORTS_PMD_PFNMAP ++ def_bool y ++ depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE ++ ++ config ARCH_SUPPORTS_PUD_PFNMAP ++ def_bool y ++ depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD ++ ++ # +++>>>>>>> 6857be5fecae (mm: introduce ARCH_SUPPORTS_HUGE_PFNMAP and special bits to pmd/pud) + # UP and nommu archs use km based percpu allocator + # + config NEED_PER_CPU_KM +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 196c481ec160..7b6f347d05b9 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2730,6 +2730,30 @@ static inline pte_t pte_mkspecial(pte_t pte) + } + #endif + ++#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP ++static inline bool pmd_special(pmd_t pmd) ++{ ++ return false; ++} ++ ++static inline pmd_t pmd_mkspecial(pmd_t pmd) ++{ ++ return pmd; ++} ++#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ ++ ++#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP ++static inline bool pud_special(pud_t pud) ++{ ++ return false; ++} ++ ++static inline pud_t pud_mkspecial(pud_t pud) ++{ ++ return pud; ++} ++#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ ++ + #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP + static inline int pte_devmap(pte_t pte) + { +* Unmerged path mm/Kconfig diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/6da8e963.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/6da8e963.failed new file mode 100644 index 0000000000000..82e01ae0d49b8 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/6da8e963.failed @@ -0,0 +1,316 @@ +mm: new follow_pfnmap API + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit 6da8e9634bb7e3fdad9ae0e4db873a05036c4343 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/6da8e963.failed + +Introduce a pair of APIs to follow pfn mappings to get entry information. +It's very similar to what follow_pte() does before, but different in that +it recognizes huge pfn mappings. 
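In practice the pair is used bracket-style: fill in the two inputs, call follow_pfnmap_start(), consume the outputs while the lock it took is still held, then call follow_pfnmap_end(). A minimal sketch, mirroring the call sites converted later in the series (lookup_pfnmap_pfn() is a made-up helper name and the error code is illustrative):

	static int lookup_pfnmap_pfn(struct vm_area_struct *vma,
				     unsigned long vaddr, unsigned long *pfn)
	{
		struct follow_pfnmap_args args = {
			.vma = vma,		/* input: target VM_IO/VM_PFNMAP vma */
			.address = vaddr,	/* input: user virtual address */
		};

		if (follow_pfnmap_start(&args))
			return -EFAULT;
		/* args.pfn/pgprot/writable/special are valid only until end() */
		*pfn = args.pfn;
		follow_pfnmap_end(&args);
		return 0;
	}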
+ +Link: https://lkml.kernel.org/r/20240826204353.2228736-10-peterx@redhat.com + Signed-off-by: Peter Xu + Cc: Alexander Gordeev + Cc: Alex Williamson + Cc: Aneesh Kumar K.V + Cc: Borislav Petkov + Cc: Catalin Marinas + Cc: Christian Borntraeger + Cc: Dave Hansen + Cc: David Hildenbrand + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Jason Gunthorpe + Cc: Matthew Wilcox + Cc: Niklas Schnelle + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Sven Schnelle + Cc: Thomas Gleixner + Cc: Vasily Gorbik + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit 6da8e9634bb7e3fdad9ae0e4db873a05036c4343) + Signed-off-by: Jonathan Maple + +# Conflicts: +# mm/memory.c +diff --cc mm/memory.c +index e2794e3b8919,3878bf69bc14..000000000000 +--- a/mm/memory.c ++++ b/mm/memory.c +@@@ -5666,71 -6172,157 +5666,223 @@@ out + } + EXPORT_SYMBOL_GPL(follow_pte); + +++<<<<<<< HEAD + +/** + + * follow_pfn - look up PFN at a user virtual address + + * @vma: memory mapping + + * @address: user virtual address + + * @pfn: location to store found PFN + + * + + * Only IO mappings and raw PFN mappings are allowed. + + * + + * This function does not allow the caller to read the permissions + + * of the PTE. Do not use it. + + * + + * Return: zero and the pfn at @pfn on success, -ve otherwise. + + */ + +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + + unsigned long *pfn) + +{ + + int ret = -EINVAL; + + spinlock_t *ptl; + + pte_t *ptep; + + + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + + return ret; + + + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + + if (ret) + + return ret; + + *pfn = pte_pfn(ptep_get(ptep)); + + pte_unmap_unlock(ptep, ptl); + + return 0; + +} + +EXPORT_SYMBOL(follow_pfn); +++======= ++ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, ++ spinlock_t *lock, pte_t *ptep, ++ pgprot_t pgprot, unsigned long pfn_base, ++ unsigned long addr_mask, bool writable, ++ bool special) ++ { ++ args->lock = lock; ++ args->ptep = ptep; ++ args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); ++ args->pgprot = pgprot; ++ args->writable = writable; ++ args->special = special; ++ } ++ ++ static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) ++ { ++ #ifdef CONFIG_LOCKDEP ++ struct address_space *mapping = vma->vm_file->f_mapping; ++ ++ if (mapping) ++ lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) || ++ lockdep_is_held(&vma->vm_mm->mmap_lock)); ++ else ++ lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); ++ #endif ++ } ++ ++ /** ++ * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address ++ * @args: Pointer to struct @follow_pfnmap_args ++ * ++ * The caller needs to setup args->vma and args->address to point to the ++ * virtual address as the target of such lookup. On a successful return, ++ * the results will be put into other output fields. ++ * ++ * After the caller finished using the fields, the caller must invoke ++ * another follow_pfnmap_end() to proper releases the locks and resources ++ * of such look up request. ++ * ++ * During the start() and end() calls, the results in @args will be valid ++ * as proper locks will be held. After the end() is called, all the fields ++ * in @follow_pfnmap_args will be invalid to be further accessed. 
Further ++ * use of such information after end() may require proper synchronizations ++ * by the caller with page table updates, otherwise it can create a ++ * security bug. ++ * ++ * If the PTE maps a refcounted page, callers are responsible to protect ++ * against invalidation with MMU notifiers; otherwise access to the PFN at ++ * a later point in time can trigger use-after-free. ++ * ++ * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore ++ * should be taken for read, and the mmap semaphore cannot be released ++ * before the end() is invoked. ++ * ++ * This function must not be used to modify PTE content. ++ * ++ * Return: zero on success, negative otherwise. ++ */ ++ int follow_pfnmap_start(struct follow_pfnmap_args *args) ++ { ++ struct vm_area_struct *vma = args->vma; ++ unsigned long address = args->address; ++ struct mm_struct *mm = vma->vm_mm; ++ spinlock_t *lock; ++ pgd_t *pgdp; ++ p4d_t *p4dp, p4d; ++ pud_t *pudp, pud; ++ pmd_t *pmdp, pmd; ++ pte_t *ptep, pte; ++ ++ pfnmap_lockdep_assert(vma); ++ ++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) ++ goto out; ++ ++ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) ++ goto out; ++ retry: ++ pgdp = pgd_offset(mm, address); ++ if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) ++ goto out; ++ ++ p4dp = p4d_offset(pgdp, address); ++ p4d = READ_ONCE(*p4dp); ++ if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) ++ goto out; ++ ++ pudp = pud_offset(p4dp, address); ++ pud = READ_ONCE(*pudp); ++ if (pud_none(pud)) ++ goto out; ++ if (pud_leaf(pud)) { ++ lock = pud_lock(mm, pudp); ++ if (!unlikely(pud_leaf(pud))) { ++ spin_unlock(lock); ++ goto retry; ++ } ++ pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), ++ pud_pfn(pud), PUD_MASK, pud_write(pud), ++ pud_special(pud)); ++ return 0; ++ } ++ ++ pmdp = pmd_offset(pudp, address); ++ pmd = pmdp_get_lockless(pmdp); ++ if (pmd_leaf(pmd)) { ++ lock = pmd_lock(mm, pmdp); ++ if (!unlikely(pmd_leaf(pmd))) { ++ spin_unlock(lock); ++ goto retry; ++ } ++ pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), ++ pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), ++ pmd_special(pmd)); ++ return 0; ++ } ++ ++ ptep = pte_offset_map_lock(mm, pmdp, address, &lock); ++ if (!ptep) ++ goto out; ++ pte = ptep_get(ptep); ++ if (!pte_present(pte)) ++ goto unlock; ++ pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), ++ pte_pfn(pte), PAGE_MASK, pte_write(pte), ++ pte_special(pte)); ++ return 0; ++ unlock: ++ pte_unmap_unlock(ptep, lock); ++ out: ++ return -EINVAL; ++ } ++ EXPORT_SYMBOL_GPL(follow_pfnmap_start); ++ ++ /** ++ * follow_pfnmap_end(): End a follow_pfnmap_start() process ++ * @args: Pointer to struct @follow_pfnmap_args ++ * ++ * Must be used in pair of follow_pfnmap_start(). See the start() function ++ * above for more information. ++ */ ++ void follow_pfnmap_end(struct follow_pfnmap_args *args) ++ { ++ if (args->lock) ++ spin_unlock(args->lock); ++ if (args->ptep) ++ pte_unmap(args->ptep); ++ } ++ EXPORT_SYMBOL_GPL(follow_pfnmap_end); +++>>>>>>> 6da8e9634bb7 (mm: new follow_pfnmap API) + + #ifdef CONFIG_HAVE_IOREMAP_PROT + +int follow_phys(struct vm_area_struct *vma, + + unsigned long address, unsigned int flags, + + unsigned long *prot, resource_size_t *phys) + +{ + + int ret = -EINVAL; + + pte_t *ptep, pte; + + spinlock_t *ptl; + + + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + + goto out; + + + + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + + goto out; + + pte = ptep_get(ptep); + + + + /* Never return PFNs of anon folios in COW mappings. 
*/ + + if (vm_normal_folio(vma, address, pte)) + + goto unlock; + + + + if ((flags & FOLL_WRITE) && !pte_write(pte)) + + goto unlock; + + + + *prot = pgprot_val(pte_pgprot(pte)); + + *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + + + + ret = 0; + +unlock: + + pte_unmap_unlock(ptep, ptl); + +out: + + return ret; + +} + + + /** + * generic_access_phys - generic implementation for iomem mmap access + * @vma: the vma to access +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 196c481ec160..51f28b4e78fc 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2436,6 +2436,37 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address, + int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); + ++struct follow_pfnmap_args { ++ /** ++ * Inputs: ++ * @vma: Pointer to @vm_area_struct struct ++ * @address: the virtual address to walk ++ */ ++ struct vm_area_struct *vma; ++ unsigned long address; ++ /** ++ * Internals: ++ * ++ * The caller shouldn't touch any of these. ++ */ ++ spinlock_t *lock; ++ pte_t *ptep; ++ /** ++ * Outputs: ++ * ++ * @pfn: the PFN of the address ++ * @pgprot: the pgprot_t of the mapping ++ * @writable: whether the mapping is writable ++ * @special: whether the mapping is a special mapping (real PFN maps) ++ */ ++ unsigned long pfn; ++ pgprot_t pgprot; ++ bool writable; ++ bool special; ++}; ++int follow_pfnmap_start(struct follow_pfnmap_args *args); ++void follow_pfnmap_end(struct follow_pfnmap_args *args); ++ + extern void truncate_pagecache(struct inode *inode, loff_t new); + extern void truncate_setsize(struct inode *inode, loff_t newsize); + void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); +* Unmerged path mm/memory.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/75182022.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/75182022.failed new file mode 100644 index 0000000000000..294d2eff42b30 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/75182022.failed @@ -0,0 +1,179 @@ +mm/x86: support large pfn mappings + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit 75182022a0439788415b2dd1db3086e07aa506f7 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/75182022.failed + +Helpers to install and detect special pmd/pud entries. In short, bit 9 on +x86 is not used for pmd/pud, so we can directly define them the same as +the pte level. One note is that it's also used in _PAGE_BIT_CPA_TEST but +that is only used in the debug test, and shouldn't conflict in this case. + +One note is that pxx_set|clear_flags() for pmd/pud will need to be moved +upper so that they can be referenced by the new special bit helpers. +There's no change in the code that was moved. 
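On x86, bit 9 is _PAGE_BIT_SPECIAL (an alias of _PAGE_BIT_SOFTW1), so the helpers added below simply test and set _PAGE_SPECIAL on pmd/pud values, mirroring what pte_special()/pte_mkspecial() already do at the PTE level.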
+ +Link: https://lkml.kernel.org/r/20240826204353.2228736-18-peterx@redhat.com + Signed-off-by: Peter Xu + Cc: Thomas Gleixner + Cc: Ingo Molnar + Cc: Borislav Petkov + Cc: Dave Hansen + Cc: Alexander Gordeev + Cc: Alex Williamson + Cc: Aneesh Kumar K.V + Cc: Catalin Marinas + Cc: Christian Borntraeger + Cc: David Hildenbrand + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Jason Gunthorpe + Cc: Matthew Wilcox + Cc: Niklas Schnelle + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Sven Schnelle + Cc: Vasily Gorbik + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit 75182022a0439788415b2dd1db3086e07aa506f7) + Signed-off-by: Jonathan Maple + +# Conflicts: +# arch/x86/Kconfig +diff --cc arch/x86/Kconfig +index a5a59118efe4,d4dbe9717e96..000000000000 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@@ -28,7 -28,7 +28,11 @@@ config X86_6 + select ARCH_HAS_GIGANTIC_PAGE + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_PER_VMA_LOCK +++<<<<<<< HEAD + + select ARCH_SUPPORTS_RT +++======= ++ select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE +++>>>>>>> 75182022a043 (mm/x86: support large pfn mappings) + select HAVE_ARCH_SOFT_DIRTY + select MODULES_USE_ELF_RELA + select NEED_DMA_MAP_STATE +* Unmerged path arch/x86/Kconfig +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 8149afec43a4..c5bc120fade8 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -121,6 +121,34 @@ extern pmdval_t early_pmd_flags; + #define arch_end_context_switch(prev) do {} while(0) + #endif /* CONFIG_PARAVIRT_XXL */ + ++static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) ++{ ++ pmdval_t v = native_pmd_val(pmd); ++ ++ return native_make_pmd(v | set); ++} ++ ++static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) ++{ ++ pmdval_t v = native_pmd_val(pmd); ++ ++ return native_make_pmd(v & ~clear); ++} ++ ++static inline pud_t pud_set_flags(pud_t pud, pudval_t set) ++{ ++ pudval_t v = native_pud_val(pud); ++ ++ return native_make_pud(v | set); ++} ++ ++static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) ++{ ++ pudval_t v = native_pud_val(pud); ++ ++ return native_make_pud(v & ~clear); ++} ++ + /* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. 
+@@ -310,6 +338,30 @@ static inline int pud_devmap(pud_t pud) + } + #endif + ++#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP ++static inline bool pmd_special(pmd_t pmd) ++{ ++ return pmd_flags(pmd) & _PAGE_SPECIAL; ++} ++ ++static inline pmd_t pmd_mkspecial(pmd_t pmd) ++{ ++ return pmd_set_flags(pmd, _PAGE_SPECIAL); ++} ++#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ ++ ++#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP ++static inline bool pud_special(pud_t pud) ++{ ++ return pud_flags(pud) & _PAGE_SPECIAL; ++} ++ ++static inline pud_t pud_mkspecial(pud_t pud) ++{ ++ return pud_set_flags(pud, _PAGE_SPECIAL); ++} ++#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ ++ + static inline int pgd_devmap(pgd_t pgd) + { + return 0; +@@ -480,20 +532,6 @@ static inline pte_t pte_mkdevmap(pte_t pte) + return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); + } + +-static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +-{ +- pmdval_t v = native_pmd_val(pmd); +- +- return native_make_pmd(v | set); +-} +- +-static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +-{ +- pmdval_t v = native_pmd_val(pmd); +- +- return native_make_pmd(v & ~clear); +-} +- + /* See comments above mksaveddirty_shift() */ + static inline pmd_t pmd_mksaveddirty(pmd_t pmd) + { +@@ -588,20 +626,6 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) + pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); + #define pmd_mkwrite pmd_mkwrite + +-static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +-{ +- pudval_t v = native_pud_val(pud); +- +- return native_make_pud(v | set); +-} +- +-static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +-{ +- pudval_t v = native_pud_val(pud); +- +- return native_make_pud(v & ~clear); +-} +- + /* See comments above mksaveddirty_shift() */ + static inline pud_t pud_mksaveddirty(pud_t pud) + { diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/a77f9489.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/a77f9489.failed new file mode 100644 index 0000000000000..d44a440ac766d --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/a77f9489.failed @@ -0,0 +1,78 @@ +vfio: use the new follow_pfnmap API + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit a77f9489f1d7873a56e1d6640cc0c4865f64176b +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/a77f9489.failed + +Use the new API that can understand huge pfn mappings. 
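The conflict below only captures the two follow_pfnmap_start() call sites; the consumer side of the converted helper (not shown here) would then read the outputs instead of dereferencing a PTE, roughly as in this sketch (args, write_fault, pfn and ret as in the hunk):

	if (write_fault && !args.writable)
		ret = -EFAULT;
	else
		*pfn = args.pfn;

	follow_pfnmap_end(&args);
	return ret;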
+ +Link: https://lkml.kernel.org/r/20240826204353.2228736-14-peterx@redhat.com + Signed-off-by: Peter Xu + Cc: Alex Williamson + Cc: Jason Gunthorpe + Cc: Alexander Gordeev + Cc: Aneesh Kumar K.V + Cc: Borislav Petkov + Cc: Catalin Marinas + Cc: Christian Borntraeger + Cc: Dave Hansen + Cc: David Hildenbrand + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Matthew Wilcox + Cc: Niklas Schnelle + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Sven Schnelle + Cc: Thomas Gleixner + Cc: Vasily Gorbik + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit a77f9489f1d7873a56e1d6640cc0c4865f64176b) + Signed-off-by: Jonathan Maple + +# Conflicts: +# drivers/vfio/vfio_iommu_type1.c +diff --cc drivers/vfio/vfio_iommu_type1.c +index 6c6586af7953,bf391b40e576..000000000000 +--- a/drivers/vfio/vfio_iommu_type1.c ++++ b/drivers/vfio/vfio_iommu_type1.c +@@@ -515,12 -513,10 +515,14 @@@ static int follow_fault_pfn(struct vm_a + unsigned long vaddr, unsigned long *pfn, + bool write_fault) + { +- pte_t *ptep; +- pte_t pte; +- spinlock_t *ptl; ++ struct follow_pfnmap_args args = { .vma = vma, .address = vaddr }; + int ret; + +++<<<<<<< HEAD + + ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); +++======= ++ ret = follow_pfnmap_start(&args); +++>>>>>>> a77f9489f1d7 (vfio: use the new follow_pfnmap API) + if (ret) { + bool unlocked = false; + +@@@ -534,7 -530,7 +536,11 @@@ + if (ret) + return ret; + +++<<<<<<< HEAD + + ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); +++======= ++ ret = follow_pfnmap_start(&args); +++>>>>>>> a77f9489f1d7 (vfio: use the new follow_pfnmap API) + if (ret) + return ret; + } +* Unmerged path drivers/vfio/vfio_iommu_type1.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b0a1c0d0.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b0a1c0d0.failed new file mode 100644 index 0000000000000..2727b419cfefb --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b0a1c0d0.failed @@ -0,0 +1,335 @@ +mm: remove follow_pte() + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit b0a1c0d0edcd75a0f8ec5fd19dbd64b8d097f534 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b0a1c0d0.failed + +follow_pte() users have been converted to follow_pfnmap*(). Remove the +API. 
+ +Link: https://lkml.kernel.org/r/20240826204353.2228736-17-peterx@redhat.com + Signed-off-by: Peter Xu + Cc: Alexander Gordeev + Cc: Alex Williamson + Cc: Aneesh Kumar K.V + Cc: Borislav Petkov + Cc: Catalin Marinas + Cc: Christian Borntraeger + Cc: Dave Hansen + Cc: David Hildenbrand + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Jason Gunthorpe + Cc: Matthew Wilcox + Cc: Niklas Schnelle + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Sven Schnelle + Cc: Thomas Gleixner + Cc: Vasily Gorbik + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit b0a1c0d0edcd75a0f8ec5fd19dbd64b8d097f534) + Signed-off-by: Jonathan Maple + +# Conflicts: +# include/linux/mm.h +# mm/memory.c +diff --cc include/linux/mm.h +index 196c481ec160,d750be768121..000000000000 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@@ -2427,12 -2368,6 +2427,15 @@@ void free_pgd_range(struct mmu_gather * + unsigned long end, unsigned long floor, unsigned long ceiling); + int + copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); +++<<<<<<< HEAD + +int follow_pte(struct mm_struct *mm, unsigned long address, + + pte_t **ptepp, spinlock_t **ptlp); + +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + + unsigned long *pfn); + +int follow_phys(struct vm_area_struct *vma, unsigned long address, + + unsigned int flags, unsigned long *prot, resource_size_t *phys); +++======= +++>>>>>>> b0a1c0d0edcd (mm: remove follow_pte()) + int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); + +diff --cc mm/memory.c +index e2794e3b8919,42674c0748cb..000000000000 +--- a/mm/memory.c ++++ b/mm/memory.c +@@@ -5607,130 -6099,157 +5607,159 @@@ int __pmd_alloc(struct mm_struct *mm, p + } + #endif /* __PAGETABLE_PMD_FOLDED */ + +++<<<<<<< HEAD + +/** + + * follow_pte - look up PTE at a user virtual address + + * @mm: the mm_struct of the target address space + + * @address: user virtual address + + * @ptepp: location to store found PTE + + * @ptlp: location to store the lock for the PTE + + * + + * On a successful return, the pointer to the PTE is stored in @ptepp; + + * the corresponding lock is taken and its location is stored in @ptlp. + + * The contents of the PTE are only stable until @ptlp is released; + + * any further use, if any, must be protected against invalidation + + * with MMU notifiers. + + * + + * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore + + * should be taken for read. + + * + + * KVM uses this function. While it is arguably less bad than ``follow_pfn``, + + * it is not a good general-purpose API. + + * + + * Return: zero on success, -ve otherwise. 
+ + */ + +int follow_pte(struct mm_struct *mm, unsigned long address, + + pte_t **ptepp, spinlock_t **ptlp) + +{ + + pgd_t *pgd; + + p4d_t *p4d; + + pud_t *pud; + + pmd_t *pmd; + + pte_t *ptep; + + + + pgd = pgd_offset(mm, address); + + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + + goto out; + + + + p4d = p4d_offset(pgd, address); + + if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + + goto out; + + + + pud = pud_offset(p4d, address); + + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + + goto out; + + + + pmd = pmd_offset(pud, address); + + VM_BUG_ON(pmd_trans_huge(*pmd)); + + + + ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + + if (!ptep) + + goto out; + + if (!pte_present(ptep_get(ptep))) + + goto unlock; + + *ptepp = ptep; + + return 0; + +unlock: + + pte_unmap_unlock(ptep, *ptlp); + +out: + + return -EINVAL; + +} + +EXPORT_SYMBOL_GPL(follow_pte); +++======= ++ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, ++ spinlock_t *lock, pte_t *ptep, ++ pgprot_t pgprot, unsigned long pfn_base, ++ unsigned long addr_mask, bool writable, ++ bool special) ++ { ++ args->lock = lock; ++ args->ptep = ptep; ++ args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); ++ args->pgprot = pgprot; ++ args->writable = writable; ++ args->special = special; ++ } ++ ++ static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) ++ { ++ #ifdef CONFIG_LOCKDEP ++ struct address_space *mapping = vma->vm_file->f_mapping; ++ ++ if (mapping) ++ lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) || ++ lockdep_is_held(&vma->vm_mm->mmap_lock)); ++ else ++ lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); ++ #endif ++ } +++>>>>>>> b0a1c0d0edcd (mm: remove follow_pte()) + + /** + - * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address + - * @args: Pointer to struct @follow_pfnmap_args + + * follow_pfn - look up PFN at a user virtual address + + * @vma: memory mapping + + * @address: user virtual address + + * @pfn: location to store found PFN + * + - * The caller needs to setup args->vma and args->address to point to the + - * virtual address as the target of such lookup. On a successful return, + - * the results will be put into other output fields. + + * Only IO mappings and raw PFN mappings are allowed. + * + - * After the caller finished using the fields, the caller must invoke + - * another follow_pfnmap_end() to proper releases the locks and resources + - * of such look up request. + + * This function does not allow the caller to read the permissions + + * of the PTE. Do not use it. + * + - * During the start() and end() calls, the results in @args will be valid + - * as proper locks will be held. After the end() is called, all the fields + - * in @follow_pfnmap_args will be invalid to be further accessed. Further + - * use of such information after end() may require proper synchronizations + - * by the caller with page table updates, otherwise it can create a + - * security bug. + - * + - * If the PTE maps a refcounted page, callers are responsible to protect + - * against invalidation with MMU notifiers; otherwise access to the PFN at + - * a later point in time can trigger use-after-free. + - * + - * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore + - * should be taken for read, and the mmap semaphore cannot be released + - * before the end() is invoked. + - * + - * This function must not be used to modify PTE content. + - * + - * Return: zero on success, negative otherwise. 
+ + * Return: zero and the pfn at @pfn on success, -ve otherwise. + */ + -int follow_pfnmap_start(struct follow_pfnmap_args *args) + +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + + unsigned long *pfn) + { + - struct vm_area_struct *vma = args->vma; + - unsigned long address = args->address; + - struct mm_struct *mm = vma->vm_mm; + - spinlock_t *lock; + - pgd_t *pgdp; + - p4d_t *p4dp, p4d; + - pud_t *pudp, pud; + - pmd_t *pmdp, pmd; + - pte_t *ptep, pte; + + int ret = -EINVAL; + + spinlock_t *ptl; + + pte_t *ptep; + + - pfnmap_lockdep_assert(vma); + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + + return ret; + + - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + - goto out; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + + if (ret) + + return ret; + + *pfn = pte_pfn(ptep_get(ptep)); + + pte_unmap_unlock(ptep, ptl); + + return 0; + +} + +EXPORT_SYMBOL(follow_pfn); + + - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + - goto out; + -retry: + - pgdp = pgd_offset(mm, address); + - if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) + - goto out; + +#ifdef CONFIG_HAVE_IOREMAP_PROT + +int follow_phys(struct vm_area_struct *vma, + + unsigned long address, unsigned int flags, + + unsigned long *prot, resource_size_t *phys) + +{ + + int ret = -EINVAL; + + pte_t *ptep, pte; + + spinlock_t *ptl; + + - p4dp = p4d_offset(pgdp, address); + - p4d = READ_ONCE(*p4dp); + - if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; + + - pudp = pud_offset(p4dp, address); + - pud = READ_ONCE(*pudp); + - if (pud_none(pud)) + + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + goto out; + - if (pud_leaf(pud)) { + - lock = pud_lock(mm, pudp); + - if (!unlikely(pud_leaf(pud))) { + - spin_unlock(lock); + - goto retry; + - } + - pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), + - pud_pfn(pud), PUD_MASK, pud_write(pud), + - pud_special(pud)); + - return 0; + - } + + pte = ptep_get(ptep); + + - pmdp = pmd_offset(pudp, address); + - pmd = pmdp_get_lockless(pmdp); + - if (pmd_leaf(pmd)) { + - lock = pmd_lock(mm, pmdp); + - if (!unlikely(pmd_leaf(pmd))) { + - spin_unlock(lock); + - goto retry; + - } + - pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), + - pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), + - pmd_special(pmd)); + - return 0; + - } + + /* Never return PFNs of anon folios in COW mappings. */ + + if (vm_normal_folio(vma, address, pte)) + + goto unlock; + + - ptep = pte_offset_map_lock(mm, pmdp, address, &lock); + - if (!ptep) + - goto out; + - pte = ptep_get(ptep); + - if (!pte_present(pte)) + + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + - pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), + - pte_pfn(pte), PAGE_MASK, pte_write(pte), + - pte_special(pte)); + - return 0; + + + + *prot = pgprot_val(pte_pgprot(pte)); + + *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + + + + ret = 0; + unlock: + - pte_unmap_unlock(ptep, lock); + + pte_unmap_unlock(ptep, ptl); + out: + - return -EINVAL; + -} + -EXPORT_SYMBOL_GPL(follow_pfnmap_start); + - + -/** + - * follow_pfnmap_end(): End a follow_pfnmap_start() process + - * @args: Pointer to struct @follow_pfnmap_args + - * + - * Must be used in pair of follow_pfnmap_start(). See the start() function + - * above for more information. 
+ - */ + -void follow_pfnmap_end(struct follow_pfnmap_args *args) + -{ + - if (args->lock) + - spin_unlock(args->lock); + - if (args->ptep) + - pte_unmap(args->ptep); + + return ret; + } + -EXPORT_SYMBOL_GPL(follow_pfnmap_end); + + -#ifdef CONFIG_HAVE_IOREMAP_PROT + /** + * generic_access_phys - generic implementation for iomem mmap access + * @vma: the vma to access +* Unmerged path include/linux/mm.h +* Unmerged path mm/memory.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b17269a5.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b17269a5.failed new file mode 100644 index 0000000000000..25f21aaca81ce --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b17269a5.failed @@ -0,0 +1,105 @@ +mm/access_process_vm: use the new follow_pfnmap API + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit b17269a51cc7f046a6f2cf9a6c314a0de885e5a5 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b17269a5.failed + +Use the new API that can understand huge pfn mappings. + +Link: https://lkml.kernel.org/r/20240826204353.2228736-16-peterx@redhat.com + Signed-off-by: Peter Xu + Cc: Alexander Gordeev + Cc: Alex Williamson + Cc: Aneesh Kumar K.V + Cc: Borislav Petkov + Cc: Catalin Marinas + Cc: Christian Borntraeger + Cc: Dave Hansen + Cc: David Hildenbrand + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Jason Gunthorpe + Cc: Matthew Wilcox + Cc: Niklas Schnelle + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Sven Schnelle + Cc: Thomas Gleixner + Cc: Vasily Gorbik + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit b17269a51cc7f046a6f2cf9a6c314a0de885e5a5) + Signed-off-by: Jonathan Maple + +# Conflicts: +# mm/memory.c +diff --cc mm/memory.c +index e2794e3b8919,cfc278691466..000000000000 +--- a/mm/memory.c ++++ b/mm/memory.c +@@@ -5749,37 -6341,34 +5749,45 @@@ int generic_access_phys(struct vm_area_ + resource_size_t phys_addr; + unsigned long prot = 0; + void __iomem *maddr; +- pte_t *ptep, pte; +- spinlock_t *ptl; + int offset = offset_in_page(addr); + int ret = -EINVAL; ++ bool writable; ++ struct follow_pfnmap_args args = { .vma = vma, .address = addr }; + + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + + return -EINVAL; + + + retry: +++<<<<<<< HEAD + + if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) +++======= ++ if (follow_pfnmap_start(&args)) +++>>>>>>> b17269a51cc7 (mm/access_process_vm: use the new follow_pfnmap API) + return -EINVAL; +- pte = ptep_get(ptep); +- pte_unmap_unlock(ptep, ptl); ++ prot = pgprot_val(args.pgprot); ++ phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; ++ writable = args.writable; ++ follow_pfnmap_end(&args); + +- prot = pgprot_val(pte_pgprot(pte)); +- phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; +- +- if ((write & FOLL_WRITE) && !pte_write(pte)) ++ if ((write & FOLL_WRITE) && !writable) + return -EINVAL; + + maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); + if (!maddr) + return -ENOMEM; + +++<<<<<<< HEAD + + if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) +++======= ++ if (follow_pfnmap_start(&args)) +++>>>>>>> b17269a51cc7 (mm/access_process_vm: use the new follow_pfnmap API) + goto out_unmap; + +- if (!pte_same(pte, ptep_get(ptep))) { +- pte_unmap_unlock(ptep, ptl); ++ if ((prot != pgprot_val(args.pgprot)) || ++ (phys_addr != 
(args.pfn << PAGE_SHIFT)) || ++ (writable != args.writable)) { ++ follow_pfnmap_end(&args); + iounmap(maddr); +- + goto retry; + } + +* Unmerged path mm/memory.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b1b46751.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b1b46751.failed new file mode 100644 index 0000000000000..871b89f15e6a1 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b1b46751.failed @@ -0,0 +1,209 @@ +mm: fix follow_pfnmap API lockdep assert + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Linus Torvalds +commit b1b46751671be5a426982f037a47ae05f37ff80b +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/b1b46751.failed + +The lockdep asserts for the new follow_pfnmap() API "knows" that a +pfnmap always has a vma->vm_file, since that's the only way to create +such a mapping. + +And that's actually true for all the normal cases. But not for the mmap +failure case, where the incomplete mapping is torn down and we have +cleared vma->vm_file because the failure occured before the file was +linked to the vma. + +So this codepath does actually need to check for vm_file being NULL. + + Reported-by: Jann Horn +Fixes: 6da8e9634bb7 ("mm: new follow_pfnmap API") + Cc: Peter Xu + Cc: Andrew Morton + Signed-off-by: Linus Torvalds +(cherry picked from commit b1b46751671be5a426982f037a47ae05f37ff80b) + Signed-off-by: Jonathan Maple + +# Conflicts: +# mm/memory.c +diff --cc mm/memory.c +index e2794e3b8919,3ccee51adfbb..000000000000 +--- a/mm/memory.c ++++ b/mm/memory.c +@@@ -5607,60 -6333,136 +5607,91 @@@ int __pmd_alloc(struct mm_struct *mm, p + } + #endif /* __PAGETABLE_PMD_FOLDED */ + +++<<<<<<< HEAD +++======= ++ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, ++ spinlock_t *lock, pte_t *ptep, ++ pgprot_t pgprot, unsigned long pfn_base, ++ unsigned long addr_mask, bool writable, ++ bool special) ++ { ++ args->lock = lock; ++ args->ptep = ptep; ++ args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); ++ args->pgprot = pgprot; ++ args->writable = writable; ++ args->special = special; ++ } ++ ++ static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) ++ { ++ #ifdef CONFIG_LOCKDEP ++ struct file *file = vma->vm_file; ++ struct address_space *mapping = file ? file->f_mapping : NULL; ++ ++ if (mapping) ++ lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) || ++ lockdep_is_held(&vma->vm_mm->mmap_lock)); ++ else ++ lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); ++ #endif ++ } ++ +++>>>>>>> b1b46751671b (mm: fix follow_pfnmap API lockdep assert) + /** + - * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address + - * @args: Pointer to struct @follow_pfnmap_args + - * + - * The caller needs to setup args->vma and args->address to point to the + - * virtual address as the target of such lookup. On a successful return, + - * the results will be put into other output fields. + - * + - * After the caller finished using the fields, the caller must invoke + - * another follow_pfnmap_end() to proper releases the locks and resources + - * of such look up request. + - * + - * During the start() and end() calls, the results in @args will be valid + - * as proper locks will be held. After the end() is called, all the fields + - * in @follow_pfnmap_args will be invalid to be further accessed. 
Further + - * use of such information after end() may require proper synchronizations + - * by the caller with page table updates, otherwise it can create a + - * security bug. + + * follow_pte - look up PTE at a user virtual address + + * @mm: the mm_struct of the target address space + + * @address: user virtual address + + * @ptepp: location to store found PTE + + * @ptlp: location to store the lock for the PTE + * + - * If the PTE maps a refcounted page, callers are responsible to protect + - * against invalidation with MMU notifiers; otherwise access to the PFN at + - * a later point in time can trigger use-after-free. + + * On a successful return, the pointer to the PTE is stored in @ptepp; + + * the corresponding lock is taken and its location is stored in @ptlp. + + * The contents of the PTE are only stable until @ptlp is released; + + * any further use, if any, must be protected against invalidation + + * with MMU notifiers. + * + * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore + - * should be taken for read, and the mmap semaphore cannot be released + - * before the end() is invoked. + + * should be taken for read. + * + - * This function must not be used to modify PTE content. + + * KVM uses this function. While it is arguably less bad than ``follow_pfn``, + + * it is not a good general-purpose API. + * + - * Return: zero on success, negative otherwise. + + * Return: zero on success, -ve otherwise. + */ + -int follow_pfnmap_start(struct follow_pfnmap_args *args) + +int follow_pte(struct mm_struct *mm, unsigned long address, + + pte_t **ptepp, spinlock_t **ptlp) + { + - struct vm_area_struct *vma = args->vma; + - unsigned long address = args->address; + - struct mm_struct *mm = vma->vm_mm; + - spinlock_t *lock; + - pgd_t *pgdp; + - p4d_t *p4dp, p4d; + - pud_t *pudp, pud; + - pmd_t *pmdp, pmd; + - pte_t *ptep, pte; + - + - pfnmap_lockdep_assert(vma); + - + - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + - goto out; + + pgd_t *pgd; + + p4d_t *p4d; + + pud_t *pud; + + pmd_t *pmd; + + pte_t *ptep; + + - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + - goto out; + -retry: + - pgdp = pgd_offset(mm, address); + - if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) + + pgd = pgd_offset(mm, address); + + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out; + + - p4dp = p4d_offset(pgdp, address); + - p4d = READ_ONCE(*p4dp); + - if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) + + p4d = p4d_offset(pgd, address); + + if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + goto out; + + - pudp = pud_offset(p4dp, address); + - pud = READ_ONCE(*pudp); + - if (pud_none(pud)) + + pud = pud_offset(p4d, address); + + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out; + - if (pud_leaf(pud)) { + - lock = pud_lock(mm, pudp); + - if (!unlikely(pud_leaf(pud))) { + - spin_unlock(lock); + - goto retry; + - } + - pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), + - pud_pfn(pud), PUD_MASK, pud_write(pud), + - pud_special(pud)); + - return 0; + - } + + - pmdp = pmd_offset(pudp, address); + - pmd = pmdp_get_lockless(pmdp); + - if (pmd_leaf(pmd)) { + - lock = pmd_lock(mm, pmdp); + - if (!unlikely(pmd_leaf(pmd))) { + - spin_unlock(lock); + - goto retry; + - } + - pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), + - pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), + - pmd_special(pmd)); + - return 0; + - } + + pmd = pmd_offset(pud, address); + + VM_BUG_ON(pmd_trans_huge(*pmd)); + + - ptep = pte_offset_map_lock(mm, pmdp, address, &lock); + + ptep = 
pte_offset_map_lock(mm, pmd, address, ptlp); + if (!ptep) + goto out; + - pte = ptep_get(ptep); + - if (!pte_present(pte)) + + if (!pte_present(ptep_get(ptep))) + goto unlock; + - pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), + - pte_pfn(pte), PAGE_MASK, pte_write(pte), + - pte_special(pte)); + + *ptepp = ptep; + return 0; + unlock: + - pte_unmap_unlock(ptep, lock); + + pte_unmap_unlock(ptep, *ptlp); + out: + return -EINVAL; + } +* Unmerged path mm/memory.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/bd8c2d18.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/bd8c2d18.failed new file mode 100644 index 0000000000000..da24628b81a33 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/bd8c2d18.failed @@ -0,0 +1,76 @@ +s390/pci_mmio: use follow_pfnmap API + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit bd8c2d18bf5cccd8842d00b17d6f222beb98b1b3 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/bd8c2d18.failed + +Use the new API that can understand huge pfn mappings. + +Link: https://lkml.kernel.org/r/20240826204353.2228736-12-peterx@redhat.com + Signed-off-by: Peter Xu + Cc: Niklas Schnelle + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Vasily Gorbik + Cc: Alexander Gordeev + Cc: Christian Borntraeger + Cc: Sven Schnelle + Cc: Alex Williamson + Cc: Aneesh Kumar K.V + Cc: Borislav Petkov + Cc: Catalin Marinas + Cc: Dave Hansen + Cc: David Hildenbrand + Cc: Gavin Shan + Cc: Ingo Molnar + Cc: Jason Gunthorpe + Cc: Matthew Wilcox + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Thomas Gleixner + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit bd8c2d18bf5cccd8842d00b17d6f222beb98b1b3) + Signed-off-by: Jonathan Maple + +# Conflicts: +# arch/s390/pci/pci_mmio.c +diff --cc arch/s390/pci/pci_mmio.c +index 588089332931,de5c0b389a3e..000000000000 +--- a/arch/s390/pci/pci_mmio.c ++++ b/arch/s390/pci/pci_mmio.c +@@@ -169,7 -168,9 +168,13 @@@ SYSCALL_DEFINE3(s390_pci_mmio_write, un + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock_mmap; + +++<<<<<<< HEAD + + ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); +++======= ++ args.address = mmio_addr; ++ args.vma = vma; ++ ret = follow_pfnmap_start(&args); +++>>>>>>> bd8c2d18bf5c (s390/pci_mmio: use follow_pfnmap API) + if (ret) + goto out_unlock_mmap; + +@@@ -308,7 -308,9 +312,13 @@@ SYSCALL_DEFINE3(s390_pci_mmio_read, uns + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock_mmap; + +++<<<<<<< HEAD + + ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); +++======= ++ args.vma = vma; ++ args.address = mmio_addr; ++ ret = follow_pfnmap_start(&args); +++>>>>>>> bd8c2d18bf5c (s390/pci_mmio: use follow_pfnmap API) + if (ret) + goto out_unlock_mmap; + +* Unmerged path arch/s390/pci/pci_mmio.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/c1d9dac0.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/c1d9dac0.failed new file mode 100644 index 0000000000000..ec6e80e4cf602 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/c1d9dac0.failed @@ -0,0 +1,157 @@ +vfio/pci: Align huge faults to order + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Alex Williamson +commit c1d9dac0db168198b6f63f460665256dedad9b6e +Empty-Commit: Cherry-Pick Conflicts during history rebuild. 
+Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/c1d9dac0.failed + +The vfio-pci huge_fault handler doesn't make any attempt to insert a +mapping containing the faulting address, it only inserts mappings if the +faulting address and resulting pfn are aligned. This works in a lot of +cases, particularly in conjunction with QEMU where DMA mappings linearly +fault the mmap. However, there are configurations where we don't get +that linear faulting and pages are faulted on-demand. + +The scenario reported in the bug below is such a case, where the physical +address width of the CPU is greater than that of the IOMMU, resulting in a +VM where guest firmware has mapped device MMIO beyond the address width of +the IOMMU. In this configuration, the MMIO is faulted on demand and +tracing indicates that occasionally the faults generate a VM_FAULT_OOM. +Given the use case, this results in a "error: kvm run failed Bad address", +killing the VM. + +The host is not under memory pressure in this test, therefore it's +suspected that VM_FAULT_OOM is actually the result of a NULL return from +__pte_offset_map_lock() in the get_locked_pte() path from insert_pfn(). +This suggests a potential race inserting a pte concurrent to a pmd, and +maybe indicates some deficiency in the mm layer properly handling such a +case. + +Nevertheless, Peter noted the inconsistency of vfio-pci's huge_fault +handler where our mapping granularity depends on the alignment of the +faulting address relative to the order rather than aligning the faulting +address to the order to more consistently insert huge mappings. This +change not only uses the page tables more consistently and efficiently, but +as any fault to an aligned page results in the same mapping, the race +condition suspected in the VM_FAULT_OOM is avoided. 
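Concretely, for a PMD-order fault (order = 9 with 4KB pages) the handler now rounds the faulting address down to the containing 2MB boundary, addr = vmf->address & ~((PAGE_SIZE << order) - 1), and only falls back to a smaller mapping when that aligned range would leave the VMA or the pfn is not equally aligned, which is exactly the check visible in the hunk below.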
+ + Reported-by: Adolfo +Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220057 +Fixes: 09dfc8a5f2ce ("vfio/pci: Fallback huge faults for unaligned pfn") + Cc: stable@vger.kernel.org + Tested-by: Adolfo +Co-developed-by: Peter Xu + Signed-off-by: Peter Xu +Link: https://lore.kernel.org/r/20250502224035.3183451-1-alex.williamson@redhat.com + Signed-off-by: Alex Williamson +(cherry picked from commit c1d9dac0db168198b6f63f460665256dedad9b6e) + Signed-off-by: Jonathan Maple + +# Conflicts: +# drivers/vfio/pci/vfio_pci_core.c +diff --cc drivers/vfio/pci/vfio_pci_core.c +index ffda816e0119,6328c3a05bcd..000000000000 +--- a/drivers/vfio/pci/vfio_pci_core.c ++++ b/drivers/vfio/pci/vfio_pci_core.c +@@@ -1770,49 -1646,59 +1770,63 @@@ static vm_fault_t vfio_pci_mmap_fault(s + { + struct vm_area_struct *vma = vmf->vma; + struct vfio_pci_core_device *vdev = vma->vm_private_data; +++<<<<<<< HEAD + + struct vfio_pci_mmap_vma *mmap_vma; + + vm_fault_t ret = VM_FAULT_NOPAGE; +++======= ++ unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); ++ unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; ++ unsigned long pfn = vma_to_pfn(vma) + pgoff; ++ vm_fault_t ret = VM_FAULT_SIGBUS; ++ ++ if (order && (addr < vma->vm_start || ++ addr + (PAGE_SIZE << order) > vma->vm_end || ++ pfn & ((1 << order) - 1))) { ++ ret = VM_FAULT_FALLBACK; ++ goto out; ++ } +++>>>>>>> c1d9dac0db16 (vfio/pci: Align huge faults to order) + + + mutex_lock(&vdev->vma_lock); + down_read(&vdev->memory_lock); + + - if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) + - goto out_unlock; + + /* + + * Memory region cannot be accessed if the low power feature is engaged + + * or memory access is disabled. + + */ + + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { + + ret = VM_FAULT_SIGBUS; + + goto up_out; + + } + + - switch (order) { + - case 0: + - ret = vmf_insert_pfn(vma, vmf->address, pfn); + - break; + -#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP + - case PMD_ORDER: + - ret = vmf_insert_pfn_pmd(vmf, + - __pfn_to_pfn_t(pfn, PFN_DEV), false); + - break; + -#endif + -#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP + - case PUD_ORDER: + - ret = vmf_insert_pfn_pud(vmf, + - __pfn_to_pfn_t(pfn, PFN_DEV), false); + - break; + -#endif + - default: + - ret = VM_FAULT_FALLBACK; + + /* + + * We populate the whole vma on fault, so we need to test whether + + * the vma has already been mapped, such as for concurrent faults + + * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if + + * we ask it to fill the same range again. 
+ + */ + + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { + + if (mmap_vma->vma == vma) + + goto up_out; + } + + -out_unlock: + - up_read(&vdev->memory_lock); + -out: + - dev_dbg_ratelimited(&vdev->pdev->dev, + - "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", + - __func__, order, + - vma->vm_pgoff >> + - (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT), + - pgoff, (unsigned int)ret); + + if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + + vma->vm_end - vma->vm_start, + + vma->vm_page_prot)) { + + ret = VM_FAULT_SIGBUS; + + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + + goto up_out; + + } + + - return ret; + -} + + if (__vfio_pci_add_vma(vdev, vma)) { + + ret = VM_FAULT_OOM; + + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + + } + + -static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf) + -{ + - return vfio_pci_mmap_huge_fault(vmf, 0); + +up_out: + + up_read(&vdev->memory_lock); + + mutex_unlock(&vdev->vma_lock); + + return ret; + } + + static const struct vm_operations_struct vfio_pci_mmap_ops = { +* Unmerged path drivers/vfio/pci/vfio_pci_core.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/c5541ba3.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/c5541ba3.failed new file mode 100644 index 0000000000000..efc4e67b46099 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/c5541ba3.failed @@ -0,0 +1,83 @@ +mm: follow_pte() improvements + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author David Hildenbrand +commit c5541ba378e3d36ea88bf5839d5b23e33e7d1627 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/c5541ba3.failed + +follow_pte() is now our main function to lookup PTEs in VM_PFNMAP/VM_IO +VMAs. Let's perform some more sanity checks to make this exported +function harder to abuse. + +Further, extend the doc a bit, it still focuses on the KVM use case with +MMU notifiers. Drop the KVM+follow_pfn() comment, follow_pfn() is no +more, and we have other users nowadays. + +Also extend the doc regarding refcounted pages and the interaction with +MMU notifiers. + +KVM is one example that uses MMU notifiers and can deal with refcounted +pages properly. VFIO is one example that doesn't use MMU notifiers, and +to prevent use-after-free, rejects refcounted pages: pfn_valid(pfn) && +!PageReserved(pfn_to_page(pfn)). Protection changes are less of a concern +for users like VFIO: the behavior is similar to longterm-pinning a page, +and getting the PTE protection changed afterwards. + +The primary concern with refcounted pages is use-after-free, which callers +should be aware of. + +Link: https://lkml.kernel.org/r/20240410155527.474777-4-david@redhat.com + Signed-off-by: David Hildenbrand + Cc: Alex Williamson + Cc: Christoph Hellwig + Cc: Fei Li + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Paolo Bonzini + Cc: Sean Christopherson + Cc: Yonghua Huang + Signed-off-by: Andrew Morton +(cherry picked from commit c5541ba378e3d36ea88bf5839d5b23e33e7d1627) + Signed-off-by: Jonathan Maple + +# Conflicts: +# mm/memory.c +diff --cc mm/memory.c +index e2794e3b8919,36ba94eae853..000000000000 +--- a/mm/memory.c ++++ b/mm/memory.c +@@@ -5623,8 -5947,7 +5630,12 @@@ int __pmd_alloc(struct mm_struct *mm, p + * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore + * should be taken for read. 
+ * +++<<<<<<< HEAD + + * KVM uses this function. While it is arguably less bad than ``follow_pfn``, + + * it is not a good general-purpose API. +++======= ++ * This function must not be used to modify PTE content. +++>>>>>>> c5541ba378e3 (mm: follow_pte() improvements) + * + * Return: zero on success, -ve otherwise. + */ +@@@ -5637,6 -5961,13 +5648,16 @@@ int follow_pte(struct mm_struct *mm, un + pmd_t *pmd; + pte_t *ptep; + +++<<<<<<< HEAD +++======= ++ mmap_assert_locked(mm); ++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) ++ goto out; ++ ++ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) ++ goto out; ++ +++>>>>>>> c5541ba378e3 (mm: follow_pte() improvements) + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out; +* Unmerged path mm/memory.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/cb10c28a.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/cb10c28a.failed new file mode 100644 index 0000000000000..db5b5c1060134 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/cb10c28a.failed @@ -0,0 +1,132 @@ +mm: remove follow_pfn + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Christoph Hellwig +commit cb10c28ac82c9b7a5e9b3b1dc7157036c20c36dd +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/cb10c28a.failed + +Remove follow_pfn now that the last user is gone. + +Link: https://lkml.kernel.org/r/20240324234542.2038726-3-hch@lst.de + Signed-off-by: Christoph Hellwig + Reviewed-by: David Hildenbrand + Cc: Andy Lutomirski + Cc: Dave Hansen + Cc: Fei Li + Cc: Ingo Molnar + Cc: Peter Zijlstra + Cc: Nathan Chancellor + Signed-off-by: Andrew Morton +(cherry picked from commit cb10c28ac82c9b7a5e9b3b1dc7157036c20c36dd) + Signed-off-by: Jonathan Maple + +# Conflicts: +# mm/nommu.c +diff --cc mm/nommu.c +index f3f6a7e97647,331d2f778695..000000000000 +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@@ -110,29 -110,6 +110,32 @@@ unsigned int kobjsize(const void *objp + return page_size(page); + } + +++<<<<<<< HEAD + +/** + + * follow_pfn - look up PFN at a user virtual address + + * @vma: memory mapping + + * @address: user virtual address + + * @pfn: location to store found PFN + + * + + * Only IO mappings and raw PFN mappings are allowed. + + * + + * Returns zero and the pfn at @pfn on success, -ve otherwise. 
+ + */ + +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + + unsigned long *pfn) + +{ + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + + return -EINVAL; + + + + *pfn = address >> PAGE_SHIFT; + + return 0; + +} + +EXPORT_SYMBOL(follow_pfn); + + + +LIST_HEAD(vmap_area_list); + + +++======= +++>>>>>>> cb10c28ac82c (mm: remove follow_pfn) + void vfree(const void *addr) + { + kfree(addr); +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 196c481ec160..cd16e4cb2ce0 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2429,8 +2429,6 @@ int + copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); + int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp); +-int follow_pfn(struct vm_area_struct *vma, unsigned long address, +- unsigned long *pfn); + int follow_phys(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, unsigned long *prot, resource_size_t *phys); + int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, +diff --git a/mm/memory.c b/mm/memory.c +index e2794e3b8919..4498a39fb51d 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5623,8 +5623,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) + * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore + * should be taken for read. + * +- * KVM uses this function. While it is arguably less bad than ``follow_pfn``, +- * it is not a good general-purpose API. ++ * KVM uses this function. While it is arguably less bad than the historic ++ * ``follow_pfn``, it is not a good general-purpose API. + * + * Return: zero on success, -ve otherwise. + */ +@@ -5666,38 +5666,6 @@ int follow_pte(struct mm_struct *mm, unsigned long address, + } + EXPORT_SYMBOL_GPL(follow_pte); + +-/** +- * follow_pfn - look up PFN at a user virtual address +- * @vma: memory mapping +- * @address: user virtual address +- * @pfn: location to store found PFN +- * +- * Only IO mappings and raw PFN mappings are allowed. +- * +- * This function does not allow the caller to read the permissions +- * of the PTE. Do not use it. +- * +- * Return: zero and the pfn at @pfn on success, -ve otherwise. +- */ +-int follow_pfn(struct vm_area_struct *vma, unsigned long address, +- unsigned long *pfn) +-{ +- int ret = -EINVAL; +- spinlock_t *ptl; +- pte_t *ptep; +- +- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) +- return ret; +- +- ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); +- if (ret) +- return ret; +- *pfn = pte_pfn(ptep_get(ptep)); +- pte_unmap_unlock(ptep, ptl); +- return 0; +-} +-EXPORT_SYMBOL(follow_pfn); +- + #ifdef CONFIG_HAVE_IOREMAP_PROT + int follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, +* Unmerged path mm/nommu.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/cbea8536.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/cbea8536.failed new file mode 100644 index 0000000000000..3341542fa241c --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/cbea8536.failed @@ -0,0 +1,78 @@ +mm/x86/pat: use the new follow_pfnmap API + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit cbea8536d933d546ceb1005bf9c04f9d01da8092 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/cbea8536.failed + +Use the new API that can understand huge pfn mappings. 
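As a rough model of why the new API can understand huge pfn mappings: follow_pfnmap reports one base pfn plus an address mask covering the whole mapping, and the pfn for a given address is derived from the offset under that mask, as pfnmap_args_setup() does later in this patch (and as vfio_iommu_type1 uses to compute the end pfn). A self-contained userspace sketch with hypothetical values, assuming 4 KiB pages and a 2 MiB PMD mapping:

#include <stdio.h>

#define PAGE_SHIFT 12UL                 /* 4 KiB pages assumed */
#define PMD_SHIFT  21UL                 /* 2 MiB PMD mapping assumed */

/* Same formula as pfnmap_args_setup(): base pfn plus the offset inside the mask. */
static unsigned long pfn_for_addr(unsigned long addr, unsigned long pfn_base,
                                  unsigned long addr_mask)
{
    return pfn_base + ((addr & ~addr_mask) >> PAGE_SHIFT);
}

int main(void)
{
    unsigned long addr_mask = ~((1UL << PMD_SHIFT) - 1); /* covers the whole PMD */
    unsigned long pfn_base  = 0x100000UL;                /* pfn of the PMD entry */
    unsigned long addr      = 0x40000000UL + 0x5000UL;   /* an address inside it */
    unsigned long pfn       = pfn_for_addr(addr, pfn_base, addr_mask);

    printf("pfn for 0x%lx = 0x%lx\n", addr, pfn);
    /* One past the last pfn of this mapping, as vfio_iommu_type1 computes it. */
    printf("end pfn = 0x%lx\n", (pfn | (~addr_mask >> PAGE_SHIFT)) + 1);
    return 0;
}

With that, a caller such as the follow_phys() replacement in this hunk only needs args.pfn and args.pgprot, regardless of whether the mapping is backed by a pte, pmd, or pud.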
+ +Link: https://lkml.kernel.org/r/20240826204353.2228736-13-peterx@redhat.com + Signed-off-by: Peter Xu + Cc: Thomas Gleixner + Cc: Ingo Molnar + Cc: Borislav Petkov + Cc: Dave Hansen + Cc: Alexander Gordeev + Cc: Alex Williamson + Cc: Aneesh Kumar K.V + Cc: Catalin Marinas + Cc: Christian Borntraeger + Cc: David Hildenbrand + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Jason Gunthorpe + Cc: Matthew Wilcox + Cc: Niklas Schnelle + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Sven Schnelle + Cc: Vasily Gorbik + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit cbea8536d933d546ceb1005bf9c04f9d01da8092) + Signed-off-by: Jonathan Maple + +# Conflicts: +# arch/x86/mm/pat/memtype.c +diff --cc arch/x86/mm/pat/memtype.c +index 36b603d0cdde,f73b5ce270b3..000000000000 +--- a/arch/x86/mm/pat/memtype.c ++++ b/arch/x86/mm/pat/memtype.c +@@@ -947,6 -948,26 +947,29 @@@ static void free_pfn_range(u64 paddr, u + memtype_free(paddr, paddr + size); + } + +++<<<<<<< HEAD +++======= ++ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, ++ resource_size_t *phys) ++ { ++ struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start }; ++ ++ if (follow_pfnmap_start(&args)) ++ return -EINVAL; ++ ++ /* Never return PFNs of anon folios in COW mappings. */ ++ if (!args.special) { ++ follow_pfnmap_end(&args); ++ return -EINVAL; ++ } ++ ++ *prot = pgprot_val(args.pgprot); ++ *phys = (resource_size_t)args.pfn << PAGE_SHIFT; ++ follow_pfnmap_end(&args); ++ return 0; ++ } ++ +++>>>>>>> cbea8536d933 (mm/x86/pat: use the new follow_pfnmap API) + static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, + pgprot_t *pgprot) + { +* Unmerged path arch/x86/mm/pat/memtype.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/ef713ec3.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/ef713ec3.failed new file mode 100644 index 0000000000000..a094d940ee94c --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/ef713ec3.failed @@ -0,0 +1,118 @@ +mm: drop is_huge_zero_pud() + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Peter Xu +commit ef713ec3a566d3e5e011c5d6201eb661ebf94c1f +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/ef713ec3.failed + +It constantly returns false since 2017. One assertion is added in 2019 but +it should never have triggered, IOW it means what is checked should be +asserted instead. + +If it didn't exist for 7 years maybe it's good idea to remove it and only +add it when it comes. 
+ +Link: https://lkml.kernel.org/r/20240826204353.2228736-3-peterx@redhat.com + Signed-off-by: Peter Xu + Reviewed-by: Jason Gunthorpe + Acked-by: David Hildenbrand + Cc: Matthew Wilcox + Cc: Aneesh Kumar K.V + Cc: Alexander Gordeev + Cc: Alex Williamson + Cc: Borislav Petkov + Cc: Catalin Marinas + Cc: Christian Borntraeger + Cc: Dave Hansen + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Niklas Schnelle + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Sven Schnelle + Cc: Thomas Gleixner + Cc: Vasily Gorbik + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit ef713ec3a566d3e5e011c5d6201eb661ebf94c1f) + Signed-off-by: Jonathan Maple + +# Conflicts: +# include/linux/huge_mm.h +# mm/huge_memory.c +diff --cc include/linux/huge_mm.h +index fc789c0ac85b,ffca706bac81..000000000000 +--- a/include/linux/huge_mm.h ++++ b/include/linux/huge_mm.h +@@@ -256,13 -433,8 +256,18 @@@ static inline bool is_huge_zero_pmd(pmd + return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd); + } + +++<<<<<<< HEAD + +static inline bool is_huge_zero_pud(pud_t pud) + +{ + + return false; + +} + + + +struct page *mm_get_huge_zero_page(struct mm_struct *mm); + +void mm_put_huge_zero_page(struct mm_struct *mm); +++======= ++ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); ++ void mm_put_huge_zero_folio(struct mm_struct *mm); +++>>>>>>> ef713ec3a566 (mm: drop is_huge_zero_pud()) + + #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) + +@@@ -379,12 -573,7 +384,16 @@@ static inline bool is_huge_zero_pmd(pmd + return false; + } + +++<<<<<<< HEAD + +static inline bool is_huge_zero_pud(pud_t pud) + +{ + + return false; + +} + + + +static inline void mm_put_huge_zero_page(struct mm_struct *mm) +++======= ++ static inline void mm_put_huge_zero_folio(struct mm_struct *mm) +++>>>>>>> ef713ec3a566 (mm: drop is_huge_zero_pud()) + { + return; + } +diff --cc mm/huge_memory.c +index 20d9b3971dc8,a4a14b81e013..000000000000 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@@ -1236,17 -1703,8 +1234,22 @@@ int copy_huge_pud(struct mm_struct *dst + goto out_unlock; + + /* +++<<<<<<< HEAD + + * When page table lock is held, the huge zero pud should not be + + * under splitting since we don't split the page itself, only pud to + + * a page table. + + */ + + if (is_huge_zero_pud(pud)) { + + /* No huge zero pud yet */ + + } + + + + /* + + * TODO: once we support anonymous pages, use page_try_dup_anon_rmap() + + * and split if duplicating fails. +++======= ++ * TODO: once we support anonymous pages, use ++ * folio_try_dup_anon_rmap_*() and split if duplicating fails. +++>>>>>>> ef713ec3a566 (mm: drop is_huge_zero_pud()) + */ + pudp_set_wrprotect(src_mm, addr, src_pud); + pud = pud_mkold(pud_wrprotect(pud)); +* Unmerged path include/linux/huge_mm.h +* Unmerged path mm/huge_memory.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/f9e54c3a.failed b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/f9e54c3a.failed new file mode 100644 index 0000000000000..5c70b216c6943 --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/f9e54c3a.failed @@ -0,0 +1,229 @@ +vfio/pci: implement huge_fault support + +jira LE-3557 +Rebuild_History Non-Buildable kernel-5.14.0-570.26.1.el9_6 +commit-author Alex Williamson +commit f9e54c3a2f5b79ecc57c7bc7d0d3521e461a2101 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. 
Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/f9e54c3a.failed + +With the addition of pfnmap support in vmf_insert_pfn_{pmd,pud}() we can +take advantage of PMD and PUD faults to PCI BAR mmaps and create more +efficient mappings. PCI BARs are always a power of two and will typically +get at least PMD alignment without userspace even trying. Userspace +alignment for PUD mappings is also not too difficult. + +Consolidate faults through a single handler with a new wrapper for +standard single page faults. The pre-faulting behavior of commit +d71a989cf5d9 ("vfio/pci: Insert full vma on mmap'd MMIO fault") is removed +in this refactoring since huge_fault will cover the bulk of the faults and +results in more efficient page table usage. We also want to avoid that +pre-faulted single page mappings preempt huge page mappings. + +Link: https://lkml.kernel.org/r/20240826204353.2228736-20-peterx@redhat.com + Signed-off-by: Alex Williamson + Signed-off-by: Peter Xu + Cc: Alexander Gordeev + Cc: Aneesh Kumar K.V + Cc: Borislav Petkov + Cc: Catalin Marinas + Cc: Christian Borntraeger + Cc: Dave Hansen + Cc: David Hildenbrand + Cc: Gavin Shan + Cc: Gerald Schaefer + Cc: Heiko Carstens + Cc: Ingo Molnar + Cc: Jason Gunthorpe + Cc: Matthew Wilcox + Cc: Niklas Schnelle + Cc: Paolo Bonzini + Cc: Ryan Roberts + Cc: Sean Christopherson + Cc: Sven Schnelle + Cc: Thomas Gleixner + Cc: Vasily Gorbik + Cc: Will Deacon + Cc: Zi Yan + Signed-off-by: Andrew Morton +(cherry picked from commit f9e54c3a2f5b79ecc57c7bc7d0d3521e461a2101) + Signed-off-by: Jonathan Maple + +# Conflicts: +# drivers/vfio/pci/vfio_pci_core.c +diff --cc drivers/vfio/pci/vfio_pci_core.c +index ffda816e0119,2d7478e9a62d..000000000000 +--- a/drivers/vfio/pci/vfio_pci_core.c ++++ b/drivers/vfio/pci/vfio_pci_core.c +@@@ -1725,100 -1646,82 +1726,161 @@@ void vfio_pci_memory_unlock_and_restore + up_write(&vdev->memory_lock); + } + + -static unsigned long vma_to_pfn(struct vm_area_struct *vma) + +/* Caller holds vma_lock */ + +static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, + + struct vm_area_struct *vma) + { + - struct vfio_pci_core_device *vdev = vma->vm_private_data; + - int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + - u64 pgoff; + + struct vfio_pci_mmap_vma *mmap_vma; + + - pgoff = vma->vm_pgoff & + - ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT); + + if (!mmap_vma) + + return -ENOMEM; + + + + mmap_vma->vma = vma; + + list_add(&mmap_vma->vma_next, &vdev->vma_list); + + + + return 0; + +} + + + +/* + + * Zap mmaps on open so that we can fault them in on access and therefore + + * our vma_list only tracks mappings accessed since last zap. 
+ + */ + +static void vfio_pci_mmap_open(struct vm_area_struct *vma) + +{ + + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + +} + + + +static void vfio_pci_mmap_close(struct vm_area_struct *vma) + +{ + + struct vfio_pci_core_device *vdev = vma->vm_private_data; + + struct vfio_pci_mmap_vma *mmap_vma; + + - return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; + + mutex_lock(&vdev->vma_lock); + + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { + + if (mmap_vma->vma == vma) { + + list_del(&mmap_vma->vma_next); + + kfree(mmap_vma); + + break; + + } + + } + + mutex_unlock(&vdev->vma_lock); + } + +- static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) ++ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, ++ unsigned int order) + { + struct vm_area_struct *vma = vmf->vma; + struct vfio_pci_core_device *vdev = vma->vm_private_data; +++<<<<<<< HEAD + + struct vfio_pci_mmap_vma *mmap_vma; + + vm_fault_t ret = VM_FAULT_NOPAGE; +++======= ++ unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff; ++ vm_fault_t ret = VM_FAULT_SIGBUS; ++ ++ if (order && (vmf->address & ((PAGE_SIZE << order) - 1) || ++ vmf->address + (PAGE_SIZE << order) > vma->vm_end)) { ++ ret = VM_FAULT_FALLBACK; ++ goto out; ++ } ++ ++ pfn = vma_to_pfn(vma); +++>>>>>>> f9e54c3a2f5b (vfio/pci: implement huge_fault support) + + + mutex_lock(&vdev->vma_lock); + down_read(&vdev->memory_lock); + +++<<<<<<< HEAD + + /* + + * Memory region cannot be accessed if the low power feature is engaged + + * or memory access is disabled. + + */ + + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { + + ret = VM_FAULT_SIGBUS; + + goto up_out; + + } + + + + /* + + * We populate the whole vma on fault, so we need to test whether + + * the vma has already been mapped, such as for concurrent faults + + * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if + + * we ask it to fill the same range again. 
+ + */ + + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { + + if (mmap_vma->vma == vma) + + goto up_out; + + } +++======= ++ if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) ++ goto out_unlock; ++ ++ switch (order) { ++ case 0: ++ ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff); ++ break; ++ #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP ++ case PMD_ORDER: ++ ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff, ++ PFN_DEV), false); ++ break; ++ #endif ++ #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP ++ case PUD_ORDER: ++ ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff, ++ PFN_DEV), false); ++ break; ++ #endif ++ default: ++ ret = VM_FAULT_FALLBACK; ++ } ++ ++ out_unlock: ++ up_read(&vdev->memory_lock); ++ out: ++ dev_dbg_ratelimited(&vdev->pdev->dev, ++ "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", ++ __func__, order, ++ vma->vm_pgoff >> ++ (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT), ++ pgoff, (unsigned int)ret); +++>>>>>>> f9e54c3a2f5b (vfio/pci: implement huge_fault support) + + + + if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + + vma->vm_end - vma->vm_start, + + vma->vm_page_prot)) { + + ret = VM_FAULT_SIGBUS; + + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + + goto up_out; + + } + + + + if (__vfio_pci_add_vma(vdev, vma)) { + + ret = VM_FAULT_OOM; + + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + + } + + +up_out: + + up_read(&vdev->memory_lock); + + mutex_unlock(&vdev->vma_lock); + return ret; + } + ++ static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf) ++ { ++ return vfio_pci_mmap_huge_fault(vmf, 0); ++ } ++ + static const struct vm_operations_struct vfio_pci_mmap_ops = { +++<<<<<<< HEAD + + .open = vfio_pci_mmap_open, + + .close = vfio_pci_mmap_close, + + .fault = vfio_pci_mmap_fault, +++======= ++ .fault = vfio_pci_mmap_page_fault, ++ #ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP ++ .huge_fault = vfio_pci_mmap_huge_fault, ++ #endif +++>>>>>>> f9e54c3a2f5b (vfio/pci: implement huge_fault support) + }; + + int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) +* Unmerged path drivers/vfio/pci/vfio_pci_core.c diff --git a/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/rebuild.details.txt b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/rebuild.details.txt new file mode 100644 index 0000000000000..5dddd8579777c --- /dev/null +++ b/ciq/ciq_backports/kernel-5.14.0-570.26.1.el9_6/rebuild.details.txt @@ -0,0 +1,40 @@ +Rebuild_History BUILDABLE +Rebuilding Kernel from rpm changelog with Fuzz Limit: 87.50% +Number of commits in upstream range v5.14~1..kernel-mainline: 309912 +Number of commits in rpm: 41 +Number of commits matched with upstream: 39 (95.12%) +Number of commits in upstream but not in rpm: 309873 +Number of commits NOT found in upstream: 2 (4.88%) + +Rebuilding Kernel on Branch rocky9_6_rebuild_kernel-5.14.0-570.26.1.el9_6 for kernel-5.14.0-570.26.1.el9_6 +Clean Cherry Picks: 14 (35.90%) +Empty Cherry Picks: 22 (56.41%) +_______________________________ + +__EMPTY COMMITS__________________________ +6857be5fecaebd9773ff27b6d29b6fff3b1abbce mm: introduce ARCH_SUPPORTS_HUGE_PFNMAP and special bits to pmd/pud +ef713ec3a566d3e5e011c5d6201eb661ebf94c1f mm: drop is_huge_zero_pud() +10d83d7781a8a6ff02bafd172c1ab183b27f8d5a mm/pagewalk: check pfnmap for folio_walk_start() +cb10c28ac82c9b7a5e9b3b1dc7157036c20c36dd mm: remove follow_pfn +6da8e9634bb7e3fdad9ae0e4db873a05036c4343 mm: new follow_pfnmap API +b1b46751671be5a426982f037a47ae05f37ff80b mm: fix follow_pfnmap API 
lockdep assert +5b34b76cb0cd8a21dee5c7677eae98480b0d05cc mm: move follow_phys to arch/x86/mm/pat/memtype.c +29ae7d96d166fa08c7232daf8a314ef5ba1efd20 mm: pass VMA instead of MM to follow_pte() +5731aacd54a883dd2c1a5e8c85e1fe78fc728dc7 KVM: use follow_pfnmap API +bd8c2d18bf5cccd8842d00b17d6f222beb98b1b3 s390/pci_mmio: use follow_pfnmap API +cbea8536d933d546ceb1005bf9c04f9d01da8092 mm/x86/pat: use the new follow_pfnmap API +a77f9489f1d7873a56e1d6640cc0c4865f64176b vfio: use the new follow_pfnmap API +b17269a51cc7f046a6f2cf9a6c314a0de885e5a5 mm/access_process_vm: use the new follow_pfnmap API +c5541ba378e3d36ea88bf5839d5b23e33e7d1627 mm: follow_pte() improvements +b0a1c0d0edcd75a0f8ec5fd19dbd64b8d097f534 mm: remove follow_pte() +75182022a0439788415b2dd1db3086e07aa506f7 mm/x86: support large pfn mappings +3e509c9b03f9abc7804c80bed266a6cc4286a5a8 mm/arm64: support large pfn mappings +f9e54c3a2f5b79ecc57c7bc7d0d3521e461a2101 vfio/pci: implement huge_fault support +09dfc8a5f2ce897005a94bf66cca4f91e4e03700 vfio/pci: Fallback huge faults for unaligned pfn +62fb8adc43afad5fa1c9cadc6f3a8e9fb72af194 mm: Provide address mask in struct follow_pfnmap_args +0fd06844de5d063cb384384e06a11ec7141a35d5 vfio/type1: Use mapping page mask for pfnmaps +c1d9dac0db168198b6f63f460665256dedad9b6e vfio/pci: Align huge faults to order + +__CHANGES NOT IN UPSTREAM________________ +Porting to Rocky Linux 9, debranding and Rocky branding' +Ensure aarch64 kernel is not compressed' diff --git a/configs/kernel-5.14.0-aarch64-64k-debug.config b/configs/kernel-5.14.0-aarch64-64k-debug.config index 787b193b8bca9..227706bfe5415 100644 --- a/configs/kernel-5.14.0-aarch64-64k-debug.config +++ b/configs/kernel-5.14.0-aarch64-64k-debug.config @@ -1076,6 +1076,8 @@ CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/configs/kernel-5.14.0-aarch64-64k.config b/configs/kernel-5.14.0-aarch64-64k.config index 39c43faf4a1d1..8974bca2568f7 100644 --- a/configs/kernel-5.14.0-aarch64-64k.config +++ b/configs/kernel-5.14.0-aarch64-64k.config @@ -1072,6 +1072,8 @@ CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/configs/kernel-5.14.0-aarch64-debug.config b/configs/kernel-5.14.0-aarch64-debug.config index 8b374663a4658..6f63b063221ee 100644 --- a/configs/kernel-5.14.0-aarch64-debug.config +++ b/configs/kernel-5.14.0-aarch64-debug.config @@ -1079,6 +1079,8 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set CONFIG_THP_SWAP=y # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/configs/kernel-5.14.0-aarch64.config b/configs/kernel-5.14.0-aarch64.config index 4403dc2c2a26e..ab69d33420388 100644 --- a/configs/kernel-5.14.0-aarch64.config +++ b/configs/kernel-5.14.0-aarch64.config @@ -1075,6 +1075,8 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # 
CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set CONFIG_THP_SWAP=y # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/configs/kernel-5.14.0-x86_64-debug.config b/configs/kernel-5.14.0-x86_64-debug.config index 93ef181632d25..3403e9c20fb5b 100644 --- a/configs/kernel-5.14.0-x86_64-debug.config +++ b/configs/kernel-5.14.0-x86_64-debug.config @@ -1138,6 +1138,9 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set CONFIG_THP_SWAP=y # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PUD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/configs/kernel-5.14.0-x86_64.config b/configs/kernel-5.14.0-x86_64.config index 8f5eccfacea09..33d762cce77bd 100644 --- a/configs/kernel-5.14.0-x86_64.config +++ b/configs/kernel-5.14.0-x86_64.config @@ -1133,6 +1133,9 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set CONFIG_THP_SWAP=y # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PUD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 36442eeffb828..73aa4347a2be4 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -600,6 +600,9 @@ static bool turbo_is_disabled(void) { u64 misc_en; + if (!cpu_feature_enabled(X86_FEATURE_IDA)) + return true; + rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ffda816e01193..c9eaba2276365 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -57,11 +58,6 @@ struct vfio_pci_vf_token { int users; }; -struct vfio_pci_mmap_vma { - struct vm_area_struct *vma; - struct list_head vma_next; -}; - static inline bool vfio_vga_disabled(void) { #ifdef CONFIG_VFIO_PCI_VGA @@ -1610,100 +1606,20 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu } EXPORT_SYMBOL_GPL(vfio_pci_core_write); -/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ -static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) +static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev) { - struct vfio_pci_mmap_vma *mmap_vma, *tmp; - - /* - * Lock ordering: - * vma_lock is nested under mmap_lock for vm_ops callback paths. - * The memory_lock semaphore is used by both code paths calling - * into this function to zap vmas and the vm_ops.fault callback - * to protect the memory enable state of the device. - * - * When zapping vmas we need to maintain the mmap_lock => vma_lock - * ordering, which requires using vma_lock to walk vma_list to - * acquire an mm, then dropping vma_lock to get the mmap_lock and - * reacquiring vma_lock. This logic is derived from similar - * requirements in uverbs_user_mmap_disassociate(). - * - * mmap_lock must always be the top-level lock when it is taken. 
- * Therefore we can only hold the memory_lock write lock when - * vma_list is empty, as we'd need to take mmap_lock to clear - * entries. vma_list can only be guaranteed empty when holding - * vma_lock, thus memory_lock is nested under vma_lock. - * - * This enables the vm_ops.fault callback to acquire vma_lock, - * followed by memory_lock read lock, while already holding - * mmap_lock without risk of deadlock. - */ - while (1) { - struct mm_struct *mm = NULL; - - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) - return 0; - } else { - mutex_lock(&vdev->vma_lock); - } - while (!list_empty(&vdev->vma_list)) { - mmap_vma = list_first_entry(&vdev->vma_list, - struct vfio_pci_mmap_vma, - vma_next); - mm = mmap_vma->vma->vm_mm; - if (mmget_not_zero(mm)) - break; + struct vfio_device *core_vdev = &vdev->vdev; + loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX); + loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX); + loff_t len = end - start; - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - mm = NULL; - } - if (!mm) - return 1; - mutex_unlock(&vdev->vma_lock); - - if (try) { - if (!mmap_read_trylock(mm)) { - mmput(mm); - return 0; - } - } else { - mmap_read_lock(mm); - } - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) { - mmap_read_unlock(mm); - mmput(mm); - return 0; - } - } else { - mutex_lock(&vdev->vma_lock); - } - list_for_each_entry_safe(mmap_vma, tmp, - &vdev->vma_list, vma_next) { - struct vm_area_struct *vma = mmap_vma->vma; - - if (vma->vm_mm != mm) - continue; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - - zap_vma_ptes(vma, vma->vm_start, - vma->vm_end - vma->vm_start); - } - mutex_unlock(&vdev->vma_lock); - mmap_read_unlock(mm); - mmput(mm); - } + unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true); } void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) { - vfio_pci_zap_and_vma_lock(vdev, false); down_write(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); + vfio_pci_zap_bars(vdev); } u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) @@ -1725,100 +1641,83 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c up_write(&vdev->memory_lock); } -/* Caller holds vma_lock */ -static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, - struct vm_area_struct *vma) -{ - struct vfio_pci_mmap_vma *mmap_vma; - - mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT); - if (!mmap_vma) - return -ENOMEM; - - mmap_vma->vma = vma; - list_add(&mmap_vma->vma_next, &vdev->vma_list); - - return 0; -} - -/* - * Zap mmaps on open so that we can fault them in on access and therefore - * our vma_list only tracks mappings accessed since last zap. 
- */ -static void vfio_pci_mmap_open(struct vm_area_struct *vma) -{ - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); -} - -static void vfio_pci_mmap_close(struct vm_area_struct *vma) +static unsigned long vma_to_pfn(struct vm_area_struct *vma) { struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; + int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + u64 pgoff; - mutex_lock(&vdev->vma_lock); - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) { - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - break; - } - } - mutex_unlock(&vdev->vma_lock); + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } -static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) +static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, + unsigned int order) { struct vm_area_struct *vma = vmf->vma; struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; - vm_fault_t ret = VM_FAULT_NOPAGE; - - mutex_lock(&vdev->vma_lock); - down_read(&vdev->memory_lock); - - /* - * Memory region cannot be accessed if the low power feature is engaged - * or memory access is disabled. - */ - if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { - ret = VM_FAULT_SIGBUS; - goto up_out; + unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); + unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + unsigned long pfn = vma_to_pfn(vma) + pgoff; + vm_fault_t ret = VM_FAULT_SIGBUS; + + if (order && (addr < vma->vm_start || + addr + (PAGE_SIZE << order) > vma->vm_end || + pfn & ((1 << order) - 1))) { + ret = VM_FAULT_FALLBACK; + goto out; } - /* - * We populate the whole vma on fault, so we need to test whether - * the vma has already been mapped, such as for concurrent faults - * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if - * we ask it to fill the same range again. 
- */ - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) - goto up_out; - } + down_read(&vdev->memory_lock); - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) { - ret = VM_FAULT_SIGBUS; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - goto up_out; - } + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) + goto out_unlock; - if (__vfio_pci_add_vma(vdev, vma)) { - ret = VM_FAULT_OOM; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + switch (order) { + case 0: + ret = vmf_insert_pfn(vma, vmf->address, pfn); + break; +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP + case PMD_ORDER: + ret = vmf_insert_pfn_pmd(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); + break; +#endif +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP + case PUD_ORDER: + ret = vmf_insert_pfn_pud(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); + break; +#endif + default: + ret = VM_FAULT_FALLBACK; } -up_out: +out_unlock: up_read(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); +out: + dev_dbg_ratelimited(&vdev->pdev->dev, + "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", + __func__, order, + vma->vm_pgoff >> + (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT), + pgoff, (unsigned int)ret); + return ret; } +static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf) +{ + return vfio_pci_mmap_huge_fault(vmf, 0); +} + static const struct vm_operations_struct vfio_pci_mmap_ops = { - .open = vfio_pci_mmap_open, - .close = vfio_pci_mmap_close, - .fault = vfio_pci_mmap_fault, + .fault = vfio_pci_mmap_page_fault, +#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP + .huge_fault = vfio_pci_mmap_huge_fault, +#endif }; int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) @@ -1880,11 +1779,12 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); /* - * See remap_pfn_range(), called from vfio_pci_fault() but we can't - * change vm_flags within the fault handler. Set them now. + * Set vm_flags now, they should not be changed in the fault handler. + * We want the same flags and page protection (decrypted above) as + * io_remap_pfn_range() would set. * * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64, * allowing KVM stage 2 device mapping attributes to use Normal-NC @@ -2202,8 +2102,6 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) mutex_init(&vdev->ioeventfds_lock); INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); - mutex_init(&vdev->vma_lock); - INIT_LIST_HEAD(&vdev->vma_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2219,7 +2117,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) mutex_destroy(&vdev->igate); mutex_destroy(&vdev->ioeventfds_lock); - mutex_destroy(&vdev->vma_lock); kfree(vdev->region); kfree(vdev->pm_save); } @@ -2497,26 +2394,15 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set) return ret; } -/* - * We need to get memory_lock for each device, but devices can share mmap_lock, - * therefore we need to zap and hold the vma_lock for each device, and only then - * get each memory_lock. 
- */ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, struct vfio_pci_group_info *groups, struct iommufd_ctx *iommufd_ctx) { - struct vfio_pci_core_device *cur_mem; - struct vfio_pci_core_device *cur_vma; - struct vfio_pci_core_device *cur; + struct vfio_pci_core_device *vdev; struct pci_dev *pdev; - bool is_mem = true; int ret; mutex_lock(&dev_set->lock); - cur_mem = list_first_entry(&dev_set->device_list, - struct vfio_pci_core_device, - vdev.dev_set_list); pdev = vfio_pci_dev_set_resettable(dev_set); if (!pdev) { @@ -2533,7 +2419,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, if (ret) goto err_unlock; - list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) { bool owned; /* @@ -2557,38 +2443,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * Otherwise, reset is not allowed. */ if (iommufd_ctx) { - int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev, + int devid = vfio_iommufd_get_dev_id(&vdev->vdev, iommufd_ctx); owned = (devid > 0 || devid == -ENOENT); } else { - owned = vfio_dev_in_groups(&cur_vma->vdev, groups); + owned = vfio_dev_in_groups(&vdev->vdev, groups); } if (!owned) { ret = -EINVAL; - goto err_undo; + break; } /* - * Locking multiple devices is prone to deadlock, runaway and - * unwind if we hit contention. + * Take the memory write lock for each device and zap BAR + * mappings to prevent the user accessing the device while in + * reset. Locking multiple devices is prone to deadlock, + * runaway and unwind if we hit contention. */ - if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) { + if (!down_write_trylock(&vdev->memory_lock)) { ret = -EBUSY; - goto err_undo; + break; } + + vfio_pci_zap_bars(vdev); } - cur_vma = NULL; - list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) { - if (!down_write_trylock(&cur_mem->memory_lock)) { - ret = -EBUSY; - goto err_undo; - } - mutex_unlock(&cur_mem->vma_lock); + if (!list_entry_is_head(vdev, + &dev_set->device_list, vdev.dev_set_list)) { + vdev = list_prev_entry(vdev, vdev.dev_set_list); + goto err_undo; } - cur_mem = NULL; /* * The pci_reset_bus() will reset all the devices in the bus. @@ -2599,25 +2485,22 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * cause the PCI config space reset without restoring the original * state (saved locally in 'vdev->pm_save'). 
*/ - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - vfio_pci_set_power_state(cur, PCI_D0); + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + vfio_pci_set_power_state(vdev, PCI_D0); ret = pci_reset_bus(pdev); + vdev = list_last_entry(&dev_set->device_list, + struct vfio_pci_core_device, vdev.dev_set_list); + err_undo: - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { - if (cur == cur_mem) - is_mem = false; - if (cur == cur_vma) - break; - if (is_mem) - up_write(&cur->memory_lock); - else - mutex_unlock(&cur->vma_lock); - } + list_for_each_entry_from_reverse(vdev, &dev_set->device_list, + vdev.dev_set_list) + up_write(&vdev->memory_lock); + + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + pm_runtime_put(&vdev->pdev->dev); - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - pm_runtime_put(&cur->pdev->dev); err_unlock: mutex_unlock(&dev_set->lock); return ret; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6c6586af79532..f8b8f3bcc7803 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -105,9 +105,9 @@ struct vfio_dma { struct vfio_batch { struct page **pages; /* for pin_user_pages_remote */ struct page *fallback_page; /* if pages alloc fails */ - int capacity; /* length of pages array */ - int size; /* of batch currently */ - int offset; /* of next entry in pages */ + unsigned int capacity; /* length of pages array */ + unsigned int size; /* of batch currently */ + unsigned int offset; /* of next entry in pages */ }; struct vfio_iommu_group { @@ -474,12 +474,12 @@ static int put_pfn(unsigned long pfn, int prot) #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *)) -static void vfio_batch_init(struct vfio_batch *batch) +static void __vfio_batch_init(struct vfio_batch *batch, bool single) { batch->size = 0; batch->offset = 0; - if (unlikely(disable_hugepages)) + if (single || unlikely(disable_hugepages)) goto fallback; batch->pages = (struct page **) __get_free_page(GFP_KERNEL); @@ -494,6 +494,16 @@ static void vfio_batch_init(struct vfio_batch *batch) batch->capacity = 1; } +static void vfio_batch_init(struct vfio_batch *batch) +{ + __vfio_batch_init(batch, false); +} + +static void vfio_batch_init_single(struct vfio_batch *batch) +{ + __vfio_batch_init(batch, true); +} + static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma) { while (batch->size) { @@ -513,14 +523,12 @@ static void vfio_batch_fini(struct vfio_batch *batch) static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, unsigned long vaddr, unsigned long *pfn, - bool write_fault) + unsigned long *addr_mask, bool write_fault) { - pte_t *ptep; - pte_t pte; - spinlock_t *ptl; + struct follow_pfnmap_args args = { .vma = vma, .address = vaddr }; int ret; - ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); + ret = follow_pfnmap_start(&args); if (ret) { bool unlocked = false; @@ -534,43 +542,51 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, if (ret) return ret; - ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); + ret = follow_pfnmap_start(&args); if (ret) return ret; } - pte = ptep_get(ptep); - - if (write_fault && !pte_write(pte)) + if (write_fault && !args.writable) { ret = -EFAULT; - else - *pfn = pte_pfn(pte); + } else { + *pfn = args.pfn; + *addr_mask = args.addr_mask; + } - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); return ret; } /* * Returns the positive 
number of pfns successfully obtained or a negative - * error code. + * error code. The initial pfn is stored in the pfn arg. For page-backed + * pfns, the provided batch is also updated to indicate the filled pages and + * initial offset. For VM_PFNMAP pfns, only the returned number of pfns and + * returned initial pfn are provided; subsequent pfns are contiguous. */ -static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, - long npages, int prot, unsigned long *pfn, - struct page **pages) +static long vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, + unsigned long npages, int prot, unsigned long *pfn, + struct vfio_batch *batch) { + unsigned long pin_pages = min_t(unsigned long, npages, batch->capacity); struct vm_area_struct *vma; unsigned int flags = 0; - int ret; + long ret; if (prot & IOMMU_WRITE) flags |= FOLL_WRITE; mmap_read_lock(mm); - ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, - pages, NULL); + ret = pin_user_pages_remote(mm, vaddr, pin_pages, flags | FOLL_LONGTERM, + batch->pages, NULL); if (ret > 0) { - *pfn = page_to_pfn(pages[0]); + *pfn = page_to_pfn(batch->pages[0]); + batch->size = ret; + batch->offset = 0; goto done; + } else if (!ret) { + ret = -EFAULT; } vaddr = untagged_addr_remote(mm, vaddr); @@ -579,15 +595,22 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, vma = vma_lookup(mm, vaddr); if (vma && vma->vm_flags & VM_PFNMAP) { - ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE); + unsigned long addr_mask; + + ret = follow_fault_pfn(vma, mm, vaddr, pfn, &addr_mask, + prot & IOMMU_WRITE); if (ret == -EAGAIN) goto retry; if (!ret) { - if (is_invalid_reserved_pfn(*pfn)) - ret = 1; - else + if (is_invalid_reserved_pfn(*pfn)) { + unsigned long epfn; + + epfn = (*pfn | (~addr_mask >> PAGE_SHIFT)) + 1; + ret = min_t(long, npages, epfn - *pfn); + } else { ret = -EFAULT; + } } } done: @@ -601,7 +624,7 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, * first page and all consecutive pages with the same locking. */ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, - long npage, unsigned long *pfn_base, + unsigned long npage, unsigned long *pfn_base, unsigned long limit, struct vfio_batch *batch) { unsigned long pfn; @@ -623,32 +646,42 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, *pfn_base = 0; } + if (unlikely(disable_hugepages)) + npage = 1; + while (npage) { if (!batch->size) { /* Empty batch, so refill it. */ - long req_pages = min_t(long, npage, batch->capacity); - - ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot, - &pfn, batch->pages); + ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot, + &pfn, batch); if (ret < 0) goto unpin_out; - batch->size = ret; - batch->offset = 0; - if (!*pfn_base) { *pfn_base = pfn; rsvd = is_invalid_reserved_pfn(*pfn_base); } + + /* Handle pfnmap */ + if (!batch->size) { + if (pfn != *pfn_base + pinned || !rsvd) + goto out; + + pinned += ret; + npage -= ret; + vaddr += (PAGE_SIZE * ret); + iova += (PAGE_SIZE * ret); + continue; + } } /* - * pfn is preset for the first iteration of this inner loop and - * updated at the end to handle a VM_PFNMAP pfn. In that case, - * batch->pages isn't valid (there's no struct page), so allow - * batch->pages to be touched only when there's more than one - * pfn to check, which guarantees the pfns are from a - * !VM_PFNMAP vma. 
+ * pfn is preset for the first iteration of this inner loop + * due to the fact that vaddr_get_pfns() needs to provide the + * initial pfn for pfnmaps. Therefore to reduce redundancy, + * the next pfn is fetched at the end of the loop. + * A PageReserved() page could still qualify as page backed + * and rsvd here, and therefore continues to use the batch. */ while (true) { if (pfn != *pfn_base + pinned || @@ -683,21 +716,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, pfn = page_to_pfn(batch->pages[batch->offset]); } - - if (unlikely(disable_hugepages)) - break; } out: ret = vfio_lock_acct(dma, lock_acct, false); unpin_out: - if (batch->size == 1 && !batch->offset) { - /* May be a VM_PFNMAP pfn, which the batch can't remember. */ - put_pfn(pfn, dma->prot); - batch->size = 0; - } - if (ret < 0) { if (pinned && !rsvd) { for (pfn = *pfn_base ; pinned ; pfn++, pinned--) @@ -712,7 +736,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, } static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, - unsigned long pfn, long npage, + unsigned long pfn, unsigned long npage, bool do_accounting) { long unlocked = 0, locked = 0; @@ -735,7 +759,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, unsigned long *pfn_base, bool do_accounting) { - struct page *pages[1]; + struct vfio_batch batch; struct mm_struct *mm; int ret; @@ -743,7 +767,9 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, if (!mmget_not_zero(mm)) return -ENODEV; - ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages); + vfio_batch_init_single(&batch); + + ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, &batch); if (ret != 1) goto out; @@ -762,6 +788,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, } out: + vfio_batch_fini(&batch); mmput(mm); return ret; } diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 429529f7a484c..245a99afe9ce8 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -158,6 +158,8 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, struct cifsFileInfo **ret_file); +extern int cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, + struct file *file); extern unsigned int smbCalcSize(void *buf); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index dd6cb08fd4231..254347d16db68 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -685,15 +685,23 @@ int cifs_open(struct inode *inode, struct file *file) rc = cifs_get_readable_path(tcon, full_path, &cfile); } if (rc == 0) { - if (file->f_flags == cfile->f_flags) { + unsigned int oflags = file->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC); + unsigned int cflags = cfile->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC); + + if (cifs_convert_flags(oflags, 0) == cifs_convert_flags(cflags, 0) && + (oflags & (O_SYNC|O_DIRECT)) == (cflags & (O_SYNC|O_DIRECT))) { file->private_data = cfile; spin_lock(&CIFS_I(inode)->deferred_lock); cifs_del_deferred_close(cfile); spin_unlock(&CIFS_I(inode)->deferred_lock); goto use_cache; - } else { - _cifsFileInfo_put(cfile, true, false); } + 
_cifsFileInfo_put(cfile, true, false); + } else { + /* hard link on the defeered close file */ + rc = cifs_get_hardlink_path(tcon, inode, file); + if (rc) + cifs_close_deferred_file(CIFS_I(inode)); } if (server->oplocks) @@ -1754,6 +1762,29 @@ cifs_move_llist(struct list_head *source, struct list_head *dest) list_move(li, dest); } +int +cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, + struct file *file) +{ + struct cifsFileInfo *open_file = NULL; + struct cifsInodeInfo *cinode = CIFS_I(inode); + int rc = 0; + + spin_lock(&tcon->open_file_lock); + spin_lock(&cinode->open_file_lock); + + list_for_each_entry(open_file, &cinode->openFileList, flist) { + if (file->f_flags == open_file->f_flags) { + rc = -EINVAL; + break; + } + } + + spin_unlock(&cinode->open_file_lock); + spin_unlock(&tcon->open_file_lock); + return rc; +} + void cifs_free_llist(struct list_head *llist) { diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fc789c0ac85b8..eaba832e03575 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -256,11 +256,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd); } -static inline bool is_huge_zero_pud(pud_t pud) -{ - return false; -} - struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); @@ -379,11 +374,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return false; } -static inline bool is_huge_zero_pud(pud_t pud) -{ - return false; -} - static inline void mm_put_huge_zero_page(struct mm_struct *mm) { return; diff --git a/include/linux/mm.h b/include/linux/mm.h index 196c481ec1603..ab100f6bd25ad 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2427,15 +2427,42 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp); -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn); -int follow_phys(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); +struct follow_pfnmap_args { + /** + * Inputs: + * @vma: Pointer to @vm_area_struct struct + * @address: the virtual address to walk + */ + struct vm_area_struct *vma; + unsigned long address; + /** + * Internals: + * + * The caller shouldn't touch any of these. 
+ */ + spinlock_t *lock; + pte_t *ptep; + /** + * Outputs: + * + * @pfn: the PFN of the address + * @addr_mask: address mask covering pfn + * @pgprot: the pgprot_t of the mapping + * @writable: whether the mapping is writable + * @special: whether the mapping is a special mapping (real PFN maps) + */ + unsigned long pfn; + unsigned long addr_mask; + pgprot_t pgprot; + bool writable; + bool special; +}; +int follow_pfnmap_start(struct follow_pfnmap_args *args); +void follow_pfnmap_end(struct follow_pfnmap_args *args); + extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); @@ -2730,6 +2757,30 @@ static inline pte_t pte_mkspecial(pte_t pte) } #endif +#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return false; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd; +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return false; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud; +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f62a9c9f3ce10..73eca45d91a10 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1685,6 +1685,18 @@ typedef unsigned int pgtbl_mod_mask; #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif +#ifndef pte_pgprot +#define pte_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pmd_pgprot +#define pmd_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pud_pgprot +#define pud_pgprot(x) ((pgprot_t) {0}) +#endif + /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. 
The expected * behavior is in parens: diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 30c79194eecc8..fbb472dd99b36 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -93,8 +93,6 @@ struct vfio_pci_core_device { struct list_head sriov_pfs_item; struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; - struct mutex vma_lock; - struct list_head vma_list; struct rw_semaphore memory_lock; }; diff --git a/mm/Kconfig b/mm/Kconfig index a91823e31f45b..ab0e794e3fc03 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -897,6 +897,19 @@ config READ_ONLY_THP_FOR_FS endif # TRANSPARENT_HUGEPAGE +# TODO: Allow to be enabled without THP +config ARCH_SUPPORTS_HUGE_PFNMAP + def_bool n + depends on TRANSPARENT_HUGEPAGE + +config ARCH_SUPPORTS_PMD_PFNMAP + def_bool y + depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE + +config ARCH_SUPPORTS_PUD_PFNMAP + def_bool y + depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + # # UP and nommu archs use km based percpu allocator # diff --git a/mm/gup.c b/mm/gup.c index ad7345cfba91d..16cdddef91585 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2903,6 +2903,9 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; + if (pmd_special(orig)) + return 0; + if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; @@ -2947,6 +2950,9 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; + if (pud_special(orig)) + return 0; + if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 20d9b3971dc88..c1cdbd21dddea 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -860,6 +860,8 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pmd_mkdevmap(entry); + else + entry = pmd_mkspecial(entry); if (write) { entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); @@ -943,10 +945,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, ptl = pud_lock(mm, pud); if (!pud_none(*pud)) { if (write) { - if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) { - WARN_ON_ONCE(!is_huge_zero_pud(*pud)); + if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn))) goto out_unlock; - } entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); if (pudp_set_access_flags(vma, addr, pud, entry, 1)) @@ -958,6 +958,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, entry = pud_mkhuge(pfn_t_pud(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pud_mkdevmap(entry); + else + entry = pud_mkspecial(entry); if (write) { entry = pud_mkyoung(pud_mkdirty(entry)); entry = maybe_pud_mkwrite(entry, vma); @@ -1070,6 +1072,24 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgtable_t pgtable = NULL; int ret = -ENOMEM; + pmd = pmdp_get_lockless(src_pmd); + if (unlikely(pmd_present(pmd) && pmd_special(pmd))) { + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + /* + * No need to recheck the pmd, it can't change with write + * mmap lock held here. 
+ * + * Meanwhile, making sure it's not a CoW VMA with writable + * mapping, otherwise it means either the anon page wrongly + * applied special bit, or we made the PRIVATE mapping be + * able to wrongly write to the backend MMIO. + */ + VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); + goto set_pmd; + } + /* Skip if can be re-fill on fault */ if (!vma_is_anonymous(dst_vma)) return 0; @@ -1150,7 +1170,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); if (!userfaultfd_wp(dst_vma)) pmd = pmd_clear_uffd_wp(pmd); - pmd = pmd_mkold(pmd_wrprotect(pmd)); + pmd = pmd_wrprotect(pmd); +set_pmd: + pmd = pmd_mkold(pmd); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; @@ -1235,21 +1257,15 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) goto out_unlock; - /* - * When page table lock is held, the huge zero pud should not be - * under splitting since we don't split the page itself, only pud to - * a page table. - */ - if (is_huge_zero_pud(pud)) { - /* No huge zero pud yet */ - } - /* * TODO: once we support anonymous pages, use page_try_dup_anon_rmap() * and split if duplicating fails. */ - pudp_set_wrprotect(src_mm, addr, src_pud); - pud = pud_mkold(pud_wrprotect(pud)); + if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) { + pudp_set_wrprotect(src_mm, addr, src_pud); + pud = pud_wrprotect(pud); + } + pud = pud_mkold(pud); set_pud_at(dst_mm, addr, dst_pud, pud); ret = 0; diff --git a/mm/memory.c b/mm/memory.c index e2794e3b8919b..0338ced72b7df 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -659,11 +659,10 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, { unsigned long pfn = pmd_pfn(pmd); - /* - * There is no pmd_special() but there may be special pmds, e.g. - * in a direct-access (dax) mapping, so let's just replicate the - * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. - */ + /* Currently it's only used for huge pfnmaps */ + if (unlikely(pmd_special(pmd))) + return NULL; + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) @@ -5607,130 +5606,159 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ +static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, + spinlock_t *lock, pte_t *ptep, + pgprot_t pgprot, unsigned long pfn_base, + unsigned long addr_mask, bool writable, + bool special) +{ + args->lock = lock; + args->ptep = ptep; + args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); + args->addr_mask = addr_mask; + args->pgprot = pgprot; + args->writable = writable; + args->special = special; +} + +static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) +{ +#ifdef CONFIG_LOCKDEP + struct file *file = vma->vm_file; + struct address_space *mapping = file ? 
file->f_mapping : NULL; + + if (mapping) + lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) || + lockdep_is_held(&vma->vm_mm->mmap_lock)); + else + lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); +#endif +} + /** - * follow_pte - look up PTE at a user virtual address - * @mm: the mm_struct of the target address space - * @address: user virtual address - * @ptepp: location to store found PTE - * @ptlp: location to store the lock for the PTE + * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address + * @args: Pointer to struct @follow_pfnmap_args + * + * The caller needs to set up args->vma and args->address to point to the + * virtual address to be looked up. On a successful return, the results + * will be put into the other output fields. + * + * After the caller has finished using those fields, it must invoke + * follow_pfnmap_end() to properly release the locks and resources taken + * by the lookup. * - * On a successful return, the pointer to the PTE is stored in @ptepp; - * the corresponding lock is taken and its location is stored in @ptlp. - * The contents of the PTE are only stable until @ptlp is released; - * any further use, if any, must be protected against invalidation - * with MMU notifiers. + * Between the start() and end() calls, the results in @args are valid + * because the proper locks are held. After end() is called, all the + * fields in @follow_pfnmap_args become invalid and must not be accessed + * further. Any later use of that information requires the caller to + * synchronize against page table updates, otherwise it can create a + * security bug. + * + * If the PTE maps a refcounted page, callers are responsible for + * protecting against invalidation with MMU notifiers; otherwise access + * to the PFN at a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore - * should be taken for read. + * should be taken for read, and the mmap semaphore cannot be released + * before the end() is invoked. * - * KVM uses this function. While it is arguably less bad than ``follow_pfn``, - * it is not a good general-purpose API. + * This function must not be used to modify PTE content. * - * Return: zero on success, -ve otherwise. + * Return: zero on success, negative otherwise. 
*/ -int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +int follow_pfnmap_start(struct follow_pfnmap_args *args) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; + struct vm_area_struct *vma = args->vma; + unsigned long address = args->address; + struct mm_struct *mm = vma->vm_mm; + spinlock_t *lock; + pgd_t *pgdp; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + pfnmap_lockdep_assert(vma); + + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto out; - p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; +retry: + pgdp = pgd_offset(mm, address); + if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) goto out; - pud = pud_offset(p4d, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) + p4dp = p4d_offset(pgdp, address); + p4d = READ_ONCE(*p4dp); + if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; - pmd = pmd_offset(pud, address); - VM_BUG_ON(pmd_trans_huge(*pmd)); + pudp = pud_offset(p4dp, address); + pud = READ_ONCE(*pudp); + if (pud_none(pud)) + goto out; + if (pud_leaf(pud)) { + lock = pud_lock(mm, pudp); + if (!unlikely(pud_leaf(pud))) { + spin_unlock(lock); + goto retry; + } + pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), + pud_pfn(pud), PUD_MASK, pud_write(pud), + pud_special(pud)); + return 0; + } + + pmdp = pmd_offset(pudp, address); + pmd = pmdp_get_lockless(pmdp); + if (pmd_leaf(pmd)) { + lock = pmd_lock(mm, pmdp); + if (!unlikely(pmd_leaf(pmd))) { + spin_unlock(lock); + goto retry; + } + pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), + pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), + pmd_special(pmd)); + return 0; + } - ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + ptep = pte_offset_map_lock(mm, pmdp, address, &lock); if (!ptep) goto out; - if (!pte_present(ptep_get(ptep))) + pte = ptep_get(ptep); + if (!pte_present(pte)) goto unlock; - *ptepp = ptep; + pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), + pte_pfn(pte), PAGE_MASK, pte_write(pte), + pte_special(pte)); return 0; unlock: - pte_unmap_unlock(ptep, *ptlp); + pte_unmap_unlock(ptep, lock); out: return -EINVAL; } -EXPORT_SYMBOL_GPL(follow_pte); +EXPORT_SYMBOL_GPL(follow_pfnmap_start); /** - * follow_pfn - look up PFN at a user virtual address - * @vma: memory mapping - * @address: user virtual address - * @pfn: location to store found PFN - * - * Only IO mappings and raw PFN mappings are allowed. - * - * This function does not allow the caller to read the permissions - * of the PTE. Do not use it. + * follow_pfnmap_end(): End a follow_pfnmap_start() process + * @args: Pointer to struct @follow_pfnmap_args * - * Return: zero and the pfn at @pfn on success, -ve otherwise. + * Must be used in pair with follow_pfnmap_start(). See the start() function + * above for more information. 
*/ -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn) +void follow_pfnmap_end(struct follow_pfnmap_args *args) { - int ret = -EINVAL; - spinlock_t *ptl; - pte_t *ptep; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return ret; - - ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); - if (ret) - return ret; - *pfn = pte_pfn(ptep_get(ptep)); - pte_unmap_unlock(ptep, ptl); - return 0; + if (args->lock) + spin_unlock(args->lock); + if (args->ptep) + pte_unmap(args->ptep); } -EXPORT_SYMBOL(follow_pfn); +EXPORT_SYMBOL_GPL(follow_pfnmap_end); #ifdef CONFIG_HAVE_IOREMAP_PROT -int follow_phys(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned long *prot, resource_size_t *phys) -{ - int ret = -EINVAL; - pte_t *ptep, pte; - spinlock_t *ptl; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - goto out; - - if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) - goto out; - pte = ptep_get(ptep); - - /* Never return PFNs of anon folios in COW mappings. */ - if (vm_normal_folio(vma, address, pte)) - goto unlock; - - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - - *prot = pgprot_val(pte_pgprot(pte)); - *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - - ret = 0; -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return ret; -} - /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access @@ -5749,37 +5777,34 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, resource_size_t phys_addr; unsigned long prot = 0; void __iomem *maddr; - pte_t *ptep, pte; - spinlock_t *ptl; int offset = offset_in_page(addr); int ret = -EINVAL; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; + bool writable; + struct follow_pfnmap_args args = { .vma = vma, .address = addr }; retry: - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) return -EINVAL; - pte = ptep_get(ptep); - pte_unmap_unlock(ptep, ptl); + prot = pgprot_val(args.pgprot); + phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; + writable = args.writable; + follow_pfnmap_end(&args); - prot = pgprot_val(pte_pgprot(pte)); - phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - - if ((write & FOLL_WRITE) && !pte_write(pte)) + if ((write & FOLL_WRITE) && !writable) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) goto out_unmap; - if (!pte_same(pte, ptep_get(ptep))) { - pte_unmap_unlock(ptep, ptl); + if ((prot != pgprot_val(args.pgprot)) || + (phys_addr != (args.pfn << PAGE_SHIFT)) || + (writable != args.writable)) { + follow_pfnmap_end(&args); iounmap(maddr); - goto retry; } @@ -5788,7 +5813,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, else memcpy_fromio(buf, maddr + offset, len); ret = len; - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unmap: iounmap(maddr); diff --git a/mm/nommu.c b/mm/nommu.c index f3f6a7e976470..de9ecac05da5d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -110,27 +110,6 @@ unsigned int kobjsize(const void *objp) return page_size(page); } -/** - * follow_pfn - look up PFN at a user virtual address - * @vma: memory mapping - * @address: user virtual address - * @pfn: location to store found PFN - * - * Only IO mappings and raw PFN mappings are allowed. - * - * Returns zero and the pfn at @pfn on success, -ve otherwise. 
- */ -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn) -{ - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - - *pfn = address >> PAGE_SHIFT; - return 0; -} -EXPORT_SYMBOL(follow_pfn); - LIST_HEAD(vmap_area_list); void vfree(const void *addr) diff --git a/redhat/kernel.changelog-9.6 b/redhat/kernel.changelog-9.6 index 45aa1339d2d60..c750b086df2cd 100644 --- a/redhat/kernel.changelog-9.6 +++ b/redhat/kernel.changelog-9.6 @@ -1,3 +1,45 @@ +* Sat Jul 05 2025 CKI KWF Bot [5.14.0-570.26.1.el9_6] +- x86/microcode/AMD: Fix out-of-bounds on systems with CPU-less NUMA nodes (CKI Backport Bot) [RHEL-98996] {CVE-2025-21991} +- cpufreq: intel_pstate: Unchecked MSR aceess in legacy mode (David Arcari) [RHEL-90212] +- smb: client: fix perf regression with deferred closes (Paulo Alcantara) [RHEL-97482] +- smb3 client: fix open hardlink on deferred close file error (Paulo Alcantara) [RHEL-97482] +- Fix mmu notifiers for range-based invalidates (Jay Shin) [RHEL-93743] +- vfio/pci: Align huge faults to order (Alex Williamson) [RHEL-88275] +- vfio/type1: Use mapping page mask for pfnmaps (Alex Williamson) [RHEL-88275] +- mm: Provide address mask in struct follow_pfnmap_args (Alex Williamson) [RHEL-88275] +- vfio/type1: Use consistent types for page counts (Alex Williamson) [RHEL-88275] +- vfio/type1: Use vfio_batch for vaddr_get_pfns() (Alex Williamson) [RHEL-88275] +- vfio/type1: Convert all vaddr_get_pfns() callers to use vfio_batch (Alex Williamson) [RHEL-88275] +- vfio/type1: Catch zero from pin_user_pages_remote() (Alex Williamson) [RHEL-88275] +- vfio/pci: Fallback huge faults for unaligned pfn (Donald Dutile) [RHEL-85623] +- vfio/pci: implement huge_fault support (Donald Dutile) [RHEL-85623] +- vfio/pci: Remove unused struct 'vfio_pci_mmap_vma' (Donald Dutile) [RHEL-85623] +- vfio/pci: Insert full vma on mmap'd MMIO fault (Donald Dutile) [RHEL-85623] +- vfio/pci: Use unmap_mapping_range() (Donald Dutile) [RHEL-85623] +- mm/arm64: support large pfn mappings (Donald Dutile) [RHEL-85623] +- mm/x86: support large pfn mappings (Donald Dutile) [RHEL-85623] +- mm: remove follow_pte() (Donald Dutile) [RHEL-85623] +- mm: follow_pte() improvements (Donald Dutile) [RHEL-85623] +- mm/access_process_vm: use the new follow_pfnmap API (Donald Dutile) [RHEL-85623] +- vfio: use the new follow_pfnmap API (Donald Dutile) [RHEL-85623] +- mm/x86/pat: use the new follow_pfnmap API (Donald Dutile) [RHEL-85623] +- s390/pci_mmio: use follow_pfnmap API (Donald Dutile) [RHEL-85623] +- KVM: use follow_pfnmap API (Donald Dutile) [RHEL-85623] +- mm: pass VMA instead of MM to follow_pte() (Donald Dutile) [RHEL-85623] +- mm: move follow_phys to arch/x86/mm/pat/memtype.c (Donald Dutile) [RHEL-85623] +- mm: fix follow_pfnmap API lockdep assert (Donald Dutile) [RHEL-85623] +- mm: new follow_pfnmap API (Donald Dutile) [RHEL-85623] +- mm: remove follow_pfn (Donald Dutile) [RHEL-85623] +- mm: always define pxx_pgprot() (Donald Dutile) [RHEL-85623] +- mm/huge_memory: check pmd_special() only after pmd_present() (Donald Dutile) [RHEL-85623] +- mm/fork: accept huge pfnmap entries (Donald Dutile) [RHEL-85623] +- mm/pagewalk: check pfnmap for folio_walk_start() (Donald Dutile) [RHEL-85623] +- mm/gup: detect huge pfnmap entries in gup-fast (Donald Dutile) [RHEL-85623] +- mm: mark special bits for huge pfn mappings when inject (Donald Dutile) [RHEL-85623] +- mm: drop is_huge_zero_pud() (Donald Dutile) [RHEL-85623] +- mm: introduce ARCH_SUPPORTS_HUGE_PFNMAP and special bits to pmd/pud (Donald 
Dutile) [RHEL-85623] +Resolves: RHEL-85623, RHEL-88275, RHEL-90212, RHEL-93743, RHEL-97482, RHEL-98996 + * Sat Jun 28 2025 CKI KWF Bot [5.14.0-570.25.1.el9_6] - udf: Fix a slab-out-of-bounds write bug in udf_find_entry() (CKI Backport Bot) [RHEL-99124] {CVE-2022-49846} - vmxnet3: Fix malformed packet sizing in vmxnet3_process_xdp (CKI Backport Bot) [RHEL-97110] {CVE-2025-37799} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b163a079fe65e..279572294aba3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2878,13 +2878,11 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *p_pfn) { + struct follow_pfnmap_args args = { .vma = vma, .address = addr }; kvm_pfn_t pfn; - pte_t *ptep; - pte_t pte; - spinlock_t *ptl; int r; - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); + r = follow_pfnmap_start(&args); if (r) { /* * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does @@ -2899,21 +2897,19 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, if (r) return r; - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); + r = follow_pfnmap_start(&args); if (r) return r; } - pte = ptep_get(ptep); - - if (write_fault && !pte_write(pte)) { + if (write_fault && !args.writable) { pfn = KVM_PFN_ERR_RO_FAULT; goto out; } if (writable) - *writable = pte_write(pte); - pfn = pte_pfn(pte); + *writable = args.writable; + pfn = args.pfn; /* * Get a reference here because callers of *hva_to_pfn* and @@ -2934,9 +2930,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, */ if (!kvm_try_get_pfn(pfn)) r = -EFAULT; - out: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); *p_pfn = pfn; return r;
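
The backport above replaces follow_pte()/follow_pfn() with the follow_pfnmap_start()/follow_pfnmap_end() pair, which callers use in a start/end pattern while holding the mmap lock. The following is a minimal sketch of such a caller, mirroring the generic_access_phys() and KVM conversions in this series; it is illustrative only and not part of the patch, and the helper name lookup_mmio_pfn() is hypothetical.

#include <linux/mm.h>

/*
 * Sketch of a follow_pfnmap user: resolve the PFN backing an IO/PFN
 * mapping at @addr in @vma.  Assumes the caller holds the mmap lock for
 * read, as required by follow_pfnmap_start().
 */
static int lookup_mmio_pfn(struct vm_area_struct *vma, unsigned long addr,
			   bool need_write, unsigned long *pfn)
{
	struct follow_pfnmap_args args = { .vma = vma, .address = addr };
	int ret;

	ret = follow_pfnmap_start(&args);
	if (ret)
		return ret;	/* not a present IO/PFN mapping */

	if (need_write && !args.writable) {
		follow_pfnmap_end(&args);
		return -EFAULT;
	}

	/* args.pfn/pgprot/addr_mask are only stable until the end() call. */
	*pfn = args.pfn;
	follow_pfnmap_end(&args);
	return 0;
}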