
Commit b36f556

Audra Mitchell committed
x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range()
JIRA: https://issues.redhat.com/browse/RHEL-104908
CVE: CVE-2025-22090

This patch is a backport of the following upstream commit:

commit dc84bc2
Author: David Hildenbrand <[email protected]>
Date: Fri Mar 21 12:23:23 2025 +0100

    x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range()

    If track_pfn_copy() fails, we already added the dst VMA to the maple
    tree. As fork() fails, we'll clean up the maple tree and stumble over
    the dst VMA, for which we neither performed any reservation nor copied
    any page tables.

    Consequently, untrack_pfn() will see VM_PAT and try obtaining the PAT
    information from the page table -- which fails because the page table
    was not copied.

    The easiest fix would be to simply clear the VM_PAT flag of the dst VMA
    if track_pfn_copy() fails. However, the whole thing about "simply"
    clearing the VM_PAT flag is shaky as well: if we passed
    track_pfn_copy() and performed a reservation, but copying the page
    tables fails, we'll simply clear the VM_PAT flag, not properly undoing
    the reservation ... which is also wrong.

    So let's fix it properly: set the VM_PAT flag only if the reservation
    succeeded (leaving it clear initially), and undo the reservation if
    anything goes wrong while copying the page tables, clearing the VM_PAT
    flag after undoing the reservation.

    Note that any copied page table entries will get zapped when the VMA is
    removed later, after copy_page_range() succeeded; as VM_PAT is not set
    then, we won't try cleaning up VM_PAT once more, and untrack_pfn()
    will be happy. Note that leaving these page tables in place without a
    reservation is not a problem, as we are aborting fork(); this process
    will never run.

    A reproducer can trigger this usually at the first try:

      https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/reproducers/pat_fork.c

      WARNING: CPU: 26 PID: 11650 at arch/x86/mm/pat/memtype.c:983 get_pat_info+0xf6/0x110
      Modules linked in: ...
      CPU: 26 UID: 0 PID: 11650 Comm: repro3 Not tainted 6.12.0-rc5+ #92
      Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
      RIP: 0010:get_pat_info+0xf6/0x110
      ...
      Call Trace:
       <TASK>
       ...
       untrack_pfn+0x52/0x110
       unmap_single_vma+0xa6/0xe0
       unmap_vmas+0x105/0x1f0
       exit_mmap+0xf6/0x460
       __mmput+0x4b/0x120
       copy_process+0x1bf6/0x2aa0
       kernel_clone+0xab/0x440
       __do_sys_clone+0x66/0x90
       do_syscall_64+0x95/0x180

    Likely this case was missed in:

      d155df5 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")

    ... and instead of undoing the reservation we simply cleared the
    VM_PAT flag.

    Keep the documentation of these functions in include/linux/pgtable.h;
    one place is more than sufficient -- we should clean that up for the
    other functions like track_pfn_remap/untrack_pfn separately.

    Fixes: d155df5 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")
    Fixes: 2ab6403 ("x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3")
    Reported-by: xingwei lee <[email protected]>
    Reported-by: yuxin wang <[email protected]>
    Reported-by: Marius Fleischer <[email protected]>
    Signed-off-by: David Hildenbrand <[email protected]>
    Signed-off-by: Ingo Molnar <[email protected]>
    Cc: Andy Lutomirski <[email protected]>
    Cc: Peter Zijlstra <[email protected]>
    Cc: Rik van Riel <[email protected]>
    Cc: "H. Peter Anvin" <[email protected]>
    Cc: Linus Torvalds <[email protected]>
    Cc: Andrew Morton <[email protected]>
    Cc: [email protected]
    Link: https://lore.kernel.org/r/[email protected]
    Closes: https://lore.kernel.org/lkml/CABOYnLx_dnqzpCW99G81DmOr+2UzdmZMk=T3uxwNxwz+R1RAwg@mail.gmail.com/
    Closes: https://lore.kernel.org/lkml/CAJg=8jwijTP5fre8woS4JVJQ8iUA6v+iNcsOgtj9Zfpc3obDOQ@mail.gmail.com/

Signed-off-by: Audra Mitchell <[email protected]>
1 parent b6e317f commit b36f556

File tree

4 files changed (+58 -37 lines):
arch/x86/mm/pat/memtype.c
include/linux/pgtable.h
kernel/fork.c
mm/memory.c


arch/x86/mm/pat/memtype.c

Lines changed: 28 additions & 24 deletions
@@ -1000,29 +1000,42 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
 		return -EINVAL;
 	}
 
-/*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
- *
- * If the vma has a linear pfn mapping for the entire range, we get the prot
- * from pte and reserve the entire vma range with single reserve_pfn_range call.
- */
-int track_pfn_copy(struct vm_area_struct *vma)
+int track_pfn_copy(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long *pfn)
 {
+	const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
 	resource_size_t paddr;
-	unsigned long vma_size = vma->vm_end - vma->vm_start;
 	pgprot_t pgprot;
+	int rc;
 
-	if (vma->vm_flags & VM_PAT) {
-		if (get_pat_info(vma, &paddr, &pgprot))
-			return -EINVAL;
-		/* reserve the whole chunk covered by vma. */
-		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
-	}
+	if (!(src_vma->vm_flags & VM_PAT))
+		return 0;
+
+	/*
+	 * Duplicate the PAT information for the dst VMA based on the src
+	 * VMA.
+	 */
+	if (get_pat_info(src_vma, &paddr, &pgprot))
+		return -EINVAL;
+	rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+	if (rc)
+		return rc;
 
+	/* Reservation for the destination VMA succeeded. */
+	vm_flags_set(dst_vma, VM_PAT);
+	*pfn = PHYS_PFN(paddr);
 	return 0;
 }
 
+void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
+{
+	untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
+	/*
+	 * Reservation was freed, any copied page tables will get cleaned
+	 * up later, but without getting PAT involved again.
+	 */
+}
+
 /*
  * prot is passed in as a parameter for the new mapping. If the vma has
  * a linear pfn mapping for the entire range, or no vma is provided,
@@ -1111,15 +1124,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 	}
 }
 
-/*
- * untrack_pfn_clear is called if the following situation fits:
- *
- * 1) while mremapping a pfnmap for a new region, with the old vma after
- * its pfnmap page table has been removed. The new vma has a new pfnmap
- * to the same pfn & cache type with VM_PAT set.
- * 2) while duplicating vm area, the new vma fails to copy the pgtable from
- * old vma.
- */
 void untrack_pfn_clear(struct vm_area_struct *vma)
 {
 	vm_flags_clear(vma, VM_PAT);

include/linux/pgtable.h

Lines changed: 22 additions & 6 deletions
@@ -1277,14 +1277,25 @@ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 }
 
 /*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
+ * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page
+ * tables copied during copy_page_range(). On success, stores the pfn to be
+ * passed to untrack_pfn_copy().
  */
-static inline int track_pfn_copy(struct vm_area_struct *vma)
+static inline int track_pfn_copy(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long *pfn)
 {
 	return 0;
 }
 
+/*
+ * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during
+ * copy_page_range(), but after track_pfn_copy() was already called.
+ */
+static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma,
+		unsigned long pfn)
+{
+}
+
 /*
  * untrack_pfn is called while unmapping a pfnmap for a region.
  * untrack can be called for a specific region indicated by pfn and size or
@@ -1297,8 +1308,10 @@ static inline void untrack_pfn(struct vm_area_struct *vma,
 }
 
 /*
- * untrack_pfn_clear is called while mremapping a pfnmap for a new region
- * or fails to copy pgtable during duplicate vm area.
+ * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA:
+ *
+ * 1) During mremap() on the src VMA after the page tables were moved.
+ * 2) During fork() on the dst VMA, immediately after duplicating the src VMA.
  */
 static inline void untrack_pfn_clear(struct vm_area_struct *vma)
 {
@@ -1309,7 +1322,10 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 		unsigned long size);
 extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 		pfn_t pfn);
-extern int track_pfn_copy(struct vm_area_struct *vma);
+extern int track_pfn_copy(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long *pfn);
+extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
+		unsigned long pfn);
 extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 		unsigned long size, bool mm_wr_locked);
 extern void untrack_pfn_clear(struct vm_area_struct *vma);

kernel/fork.c

Lines changed: 4 additions & 0 deletions
@@ -522,6 +522,10 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	vma_numab_state_init(new);
 	dup_anon_vma_name(orig, new);
 
+	/* track_pfn_copy() will later take care of copying internal state. */
+	if (unlikely(new->vm_flags & VM_PFNMAP))
+		untrack_pfn_clear(new);
+
 	return new;
 }
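This hunk matters because vm_area_dup() starts from a wholesale copy of the source VMA, so the new VMA would otherwise carry VM_PAT before any reservation exists for it. A small userspace model of that pattern, where the struct layout and flag values are simplified stand-ins rather than the kernel's:

/*
 * Simplified model, not kernel code: duplicating a struct copies its
 * flag bits, so VM_PAT must be cleared until track_pfn_copy() sets it
 * again once a reservation for the new VMA succeeded.
 */
#include <assert.h>
#include <string.h>

#define VM_PFNMAP 0x1UL	/* stand-in values, not the kernel's */
#define VM_PAT    0x2UL

struct vma { unsigned long vm_flags; };

static struct vma vma_dup(const struct vma *orig)
{
	struct vma new_vma;

	memcpy(&new_vma, orig, sizeof(new_vma));	/* inherits VM_PAT */
	if (new_vma.vm_flags & VM_PFNMAP)
		new_vma.vm_flags &= ~VM_PAT;		/* untrack_pfn_clear() */
	return new_vma;
}

int main(void)
{
	struct vma src = { .vm_flags = VM_PFNMAP | VM_PAT };
	struct vma dst = vma_dup(&src);

	assert(!(dst.vm_flags & VM_PAT));	/* set only after reservation */
	return 0;
}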

mm/memory.c

Lines changed: 4 additions & 7 deletions
@@ -1279,12 +1279,12 @@ int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
 	pgd_t *src_pgd, *dst_pgd;
-	unsigned long next;
 	unsigned long addr = src_vma->vm_start;
 	unsigned long end = src_vma->vm_end;
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
 	struct mm_struct *src_mm = src_vma->vm_mm;
 	struct mmu_notifier_range range;
+	unsigned long next, pfn;
 	bool is_cow;
 	int ret;
 
@@ -1295,11 +1295,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 		return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
 
 	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
-		/*
-		 * We do not free on error cases below as remove_vma
-		 * gets called on error from higher level routine
-		 */
-		ret = track_pfn_copy(src_vma);
+		ret = track_pfn_copy(dst_vma, src_vma, &pfn);
 		if (ret)
 			return ret;
 	}
@@ -1336,7 +1332,6 @@
 			continue;
 		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
 					    addr, next))) {
-			untrack_pfn_clear(dst_vma);
 			ret = -ENOMEM;
 			break;
 		}
@@ -1346,6 +1341,8 @@
 		raw_write_seqcount_end(&src_mm->write_protect_seq);
 		mmu_notifier_invalidate_range_end(&range);
 	}
+	if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
+		untrack_pfn_copy(dst_vma, pfn);
 	return ret;
 }
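Taken together, the new flow pairs track_pfn_copy() and untrack_pfn_copy() like an acquire/release: reserve first, set VM_PAT on the dst VMA only once the reservation succeeded, and on a later copy failure drop the reservation before clearing the flag. A compact userspace model of that ordering, with the flag value and the reservation bookkeeping as simplified stand-ins:

/* Simplified model, not kernel code: the ordering fixed by this commit. */
#include <assert.h>
#include <stdbool.h>

#define VM_PAT 0x2UL	/* stand-in value */

static bool pat_reserved;	/* models the PAT memtype reservation */

static int track_pfn_copy_model(unsigned long *dst_flags, unsigned long *pfn)
{
	pat_reserved = true;	/* reserve_pfn_range() succeeded */
	*dst_flags |= VM_PAT;	/* flag set only after the reservation */
	*pfn = 0x1234;		/* handed back for a possible undo */
	return 0;
}

static void untrack_pfn_copy_model(unsigned long *dst_flags, unsigned long pfn)
{
	(void)pfn;
	pat_reserved = false;	/* undo the reservation first ... */
	*dst_flags &= ~VM_PAT;	/* ... then clear the flag */
}

int main(void)
{
	unsigned long dst_flags = 0, pfn;

	if (track_pfn_copy_model(&dst_flags, &pfn))
		return 1;
	/* Pretend copy_p4d_range() failed: undo, then nothing dangles. */
	untrack_pfn_copy_model(&dst_flags, pfn);
	assert(!pat_reserved && !(dst_flags & VM_PAT));
	return 0;
}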
