
Commit 24d7275

yang-shi authored and torvalds committed
fs/proc: task_mmu.c: don't read mapcount for migration entry
The syzbot reported the below BUG:

  kernel BUG at include/linux/page-flags.h:785!
  invalid opcode: 0000 [#1] PREEMPT SMP KASAN
  CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
  RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
  Call Trace:
   page_mapcount include/linux/mm.h:837 [inline]
   smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466
   smaps_pte_entry fs/proc/task_mmu.c:538 [inline]
   smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601
   walk_pmd_range mm/pagewalk.c:128 [inline]
   walk_pud_range mm/pagewalk.c:205 [inline]
   walk_p4d_range mm/pagewalk.c:240 [inline]
   walk_pgd_range mm/pagewalk.c:277 [inline]
   __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379
   walk_page_vma+0x277/0x350 mm/pagewalk.c:530
   smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768
   smap_gather_stats fs/proc/task_mmu.c:741 [inline]
   show_smap+0xc6/0x440 fs/proc/task_mmu.c:822
   seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272
   seq_read+0x3e0/0x5b0 fs/seq_file.c:162
   vfs_read+0x1b5/0x600 fs/read_write.c:479
   ksys_read+0x12d/0x250 fs/read_write.c:619
   do_syscall_x64 arch/x86/entry/common.c:50 [inline]
   do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
   entry_SYSCALL_64_after_hwframe+0x44/0xae

The reproducer was reading /proc/$PID/smaps while calling MADV_FREE at the
same time.  MADV_FREE may split THPs if it is called on a partial THP.
This may trigger the race below:

  CPU A                              CPU B
  -----                              -----
  smaps walk:                        MADV_FREE:
  page_mapcount()
    PageCompound()
                                     split_huge_page()
    page = compound_head(page)
    PageDoubleMap(page)

By the time PageDoubleMap() is called, the page is no longer a tail page
of a THP, so the BUG is triggered.

This could be fixed by elevating the refcount of the page before reading
the mapcount, but that would prevent it from counting migration entries,
and it seems like overkill because the race can only happen when the PMD
is split, in which case all PTE entries of the tail pages are actually
migration entries, and smaps_account() already treats migration entries
as mapcount == 1, as Kirill pointed out.

Add a new parameter to smaps_account() to indicate that the entry is a
migration entry, and skip calling page_mapcount() in that case.  Don't
skip reading the mapcount for device private entries, since they do track
references with the mapcount.

Pagemap has a similar issue, although it was not reported.  Fix it as
well.

[[email protected]: v4]
  Link: https://lkml.kernel.org/r/[email protected]
[[email protected]: avoid unused variable warning in pagemap_pmd_range()]
  Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Fixes: e9b61f1 ("thp: reintroduce split_huge_page()")
Signed-off-by: Yang Shi <[email protected]>
Signed-off-by: Nathan Chancellor <[email protected]>
Reported-by: [email protected]
Acked-by: David Hildenbrand <[email protected]>
Cc: "Kirill A. Shutemov" <[email protected]>
Cc: Jann Horn <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Alexey Dobriyan <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 925346c commit 24d7275
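
For illustration, below is a minimal userspace sketch of the racing pattern the commit message describes: one thread repeatedly reads /proc/self/smaps while another repeatedly applies MADV_FREE to half of a THP-backed mapping, forcing PMD splits. This is a hypothetical reconstruction, not the actual syzkaller reproducer; the 2MB THP size and the loop structure are assumptions.

/*
 * Hypothetical illustration of the race described above -- NOT the actual
 * syzkaller reproducer.  One thread repeatedly reads /proc/self/smaps while
 * another MADV_FREEs half of a (hopefully) THP-backed region, which forces
 * PMD splits.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define THP_SIZE (2UL << 20)	/* assumed PMD-sized THP: 2MB */

static void *smaps_reader(void *arg)
{
	char buf[4096];

	(void)arg;
	for (;;) {
		int fd = open("/proc/self/smaps", O_RDONLY);

		if (fd < 0)
			continue;
		while (read(fd, buf, sizeof(buf)) > 0)
			;	/* walk every VMA, including the THP one */
		close(fd);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;
	char *region = mmap(NULL, THP_SIZE, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (region == MAP_FAILED)
		return 1;
	madvise(region, THP_SIZE, MADV_HUGEPAGE);
	memset(region, 1, THP_SIZE);		/* fault the region in */

	pthread_create(&tid, NULL, smaps_reader, NULL);

	for (;;) {
		/* MADV_FREE on a partial THP splits the PMD */
		madvise(region, THP_SIZE / 2, MADV_FREE);
		memset(region, 1, THP_SIZE);	/* re-fault, ideally as a THP */
	}
}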

File tree

1 file changed: +31, -9 lines


fs/proc/task_mmu.c

Lines changed: 31 additions & 9 deletions
@@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
 }
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
-		bool compound, bool young, bool dirty, bool locked)
+		bool compound, bool young, bool dirty, bool locked,
+		bool migration)
 {
 	int i, nr = compound ? compound_nr(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
@@ -467,8 +468,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 	 * page_count(page) == 1 guarantees the page is mapped exactly once.
 	 * If any subpage of the compound page mapped with PTE it would elevate
 	 * page_count().
+	 *
+	 * The page_mapcount() is called to get a snapshot of the mapcount.
+	 * Without holding the page lock this snapshot can be slightly wrong as
+	 * we cannot always read the mapcount atomically.  It is not safe to
+	 * call page_mapcount() even with PTL held if the page is not mapped,
+	 * especially for migration entries.  Treat regular migration entries
+	 * as mapcount == 1.
 	 */
-	if (page_count(page) == 1) {
+	if ((page_count(page) == 1) || migration) {
 		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
 			locked, true);
 		return;
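
For context on why the mapcount matters here at all: smaps charges each process a proportional share of every mapped page, computed in fixed point. Below is a standalone sketch of that arithmetic, assuming a PAGE_SIZE of 4096 and the PSS_SHIFT of 12 used in fs/proc/task_mmu.c; it is an illustration of the accounting, not kernel code.

/*
 * Standalone sketch of the PSS (proportional set size) arithmetic that
 * page_mapcount() feeds in smaps_account()/smaps_page_accumulate().
 * PSS_SHIFT mirrors fs/proc/task_mmu.c; PAGE_SIZE of 4096 is an assumption.
 */
#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define PSS_SHIFT 12

int main(void)
{
	/* Page mapped by three processes: each is charged a third of it. */
	int mapcount = 3;
	unsigned long long pss_shared = (PAGE_SIZE << PSS_SHIFT) / mapcount;

	/*
	 * An exclusive page (page_count() == 1) -- and, with this fix, a
	 * page behind a regular migration entry -- is charged in full,
	 * without reading page_mapcount() at all.
	 */
	unsigned long long pss_excl = PAGE_SIZE << PSS_SHIFT;

	printf("shared:    %llu bytes\n", pss_shared >> PSS_SHIFT);  /* 1365 */
	printf("exclusive: %llu bytes\n", pss_excl >> PSS_SHIFT);    /* 4096 */
	return 0;
}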
@@ -517,6 +525,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pte_present(*pte)) {
 		page = vm_normal_page(vma, addr, *pte);
@@ -536,8 +545,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 			} else {
 				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
 			}
-		} else if (is_pfn_swap_entry(swpent))
+		} else if (is_pfn_swap_entry(swpent)) {
+			if (is_migration_entry(swpent))
+				migration = true;
 			page = pfn_swap_entry_to_page(swpent);
+		}
 	} else {
 		smaps_pte_hole_lookup(addr, walk);
 		return;
@@ -546,7 +558,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	if (!page)
 		return;
 
-	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
+		      locked, migration);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -557,15 +570,18 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pmd_present(*pmd)) {
 		/* FOLL_DUMP will return -EFAULT on huge zero page */
 		page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
 	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
 
-		if (is_migration_entry(entry))
+		if (is_migration_entry(entry)) {
+			migration = true;
 			page = pfn_swap_entry_to_page(entry);
+		}
 	}
 	if (IS_ERR_OR_NULL(page))
 		return;
@@ -577,7 +593,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 		/* pass */;
 	else
 		mss->file_thp += HPAGE_PMD_SIZE;
-	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+
+	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+		      locked, migration);
 }
 #else
 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -1378,6 +1396,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 {
 	u64 frame = 0, flags = 0;
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pte_present(pte)) {
 		if (pm->show_pfn)
@@ -1399,13 +1418,14 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		frame = swp_type(entry) |
 			(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
 		flags |= PM_SWAP;
+		migration = is_migration_entry(entry);
 		if (is_pfn_swap_entry(entry))
 			page = pfn_swap_entry_to_page(entry);
 	}
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
-	if (page && page_mapcount(page) == 1)
+	if (page && !migration && page_mapcount(page) == 1)
 		flags |= PM_MMAP_EXCLUSIVE;
 	if (vma->vm_flags & VM_SOFTDIRTY)
 		flags |= PM_SOFT_DIRTY;
@@ -1421,8 +1441,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 	spinlock_t *ptl;
 	pte_t *pte, *orig_pte;
 	int err = 0;
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	bool migration = false;
+
 	ptl = pmd_trans_huge_lock(pmdp, vma);
 	if (ptl) {
 		u64 flags = 0, frame = 0;
@@ -1461,11 +1482,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 			if (pmd_swp_uffd_wp(pmd))
 				flags |= PM_UFFD_WP;
 			VM_BUG_ON(!is_pmd_migration_entry(pmd));
+			migration = is_migration_entry(entry);
 			page = pfn_swap_entry_to_page(entry);
 		}
 #endif
 
-		if (page && page_mapcount(page) == 1)
+		if (page && !migration && page_mapcount(page) == 1)
 			flags |= PM_MMAP_EXCLUSIVE;
 
 		for (; addr != end; addr += PAGE_SIZE) {
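
On the pagemap side, the user-visible effect of the fix is that PM_MMAP_EXCLUSIVE is simply left clear for a page under migration, rather than the kernel reading page_mapcount() on a migration entry. Below is a minimal sketch of consuming that bit from userspace, based on the documented pagemap layout (bit 63 = present, bit 56 = page exclusively mapped); it is an illustration, with a single stack-independent probe page as the assumed target.

/*
 * Minimal sketch of reading PM_MMAP_EXCLUSIVE from /proc/self/pagemap.
 * Per Documentation/admin-guide/mm/pagemap.rst: bit 63 = present,
 * bit 56 = page exclusively mapped.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	static char probe;		/* one touched page to inspect */
	uint64_t entry;
	int fd;

	probe = 1;			/* make sure the page is mapped */
	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;

	/* One 8-byte entry per virtual page, indexed by vaddr / page_size. */
	if (pread(fd, &entry, sizeof(entry),
		  ((uintptr_t)&probe / page_size) * sizeof(entry)) != sizeof(entry))
		return 1;
	close(fd);

	printf("present:   %d\n", (int)((entry >> 63) & 1));
	printf("exclusive: %d\n", (int)((entry >> 56) & 1));
	return 0;
}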
