From 2c5a144cef102a0501105a89898bc28f5ca6fbb7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Mar 2024 22:58:27 +0000 Subject: [PATCH 01/17] mm: correct page_mapped_in_vma() for large folios Patch series "Unify vma_address and vma_pgoff_address". The current vma_address() pretends that the ambiguity between head & tail page is an advantage. If you pass a head page to vma_address(), it will operate on all pages in the folio, while if you pass a tail page, it will operate on a single page. That's not what any of the callers actually want, so first convert all callers to use vma_pgoff_address() and then rename vma_pgoff_address() to vma_address(). This patch (of 3): If 'page' is the first page of a large folio then vma_address() will scan for any page in the entire folio. This can lead to page_mapped_in_vma() returning true if some of the tail pages are mapped and the head page is not. This could lead to memory failure choosing to kill a task unnecessarily. Link: https://lkml.kernel.org/r/20240328225831.1765286-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240328225831.1765286-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton (cherry picked from commit 7e8347413e5bc4d54712942dad43bfcf2501ab3b) --- mm/page_vma_mapped.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index dcc1ee3d059e9..b78765235db22 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -329,6 +329,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) */ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { + struct folio *folio = page_folio(page); + pgoff_t pgoff = folio->index + folio_page_idx(folio, page); struct page_vma_mapped_walk pvmw = { .pfn = page_to_pfn(page), .nr_pages = 1, @@ -336,7 +338,7 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) .flags = PVMW_SYNC, }; - pvmw.address = vma_address(page, vma); + pvmw.address = vma_pgoff_address(pgoff, 1, vma); if (pvmw.address == -EFAULT) return 0; if (!page_vma_mapped_walk(&pvmw)) From 3b64cad13960700ce6662dbfb12437fce576c411 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Mar 2024 22:58:28 +0000 Subject: [PATCH 02/17] mm: remove vma_address() Convert the three remaining callers to call vma_pgoff_address() directly. This removes an ambiguity where we'd check just one page if passed a tail page and all N pages if passed a head page. Also add better kernel-doc for vma_pgoff_address(). Link: https://lkml.kernel.org/r/20240328225831.1765286-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton (cherry picked from commit 412ad5fbe9285fd8066d3b977db0cd7fb39f671d) --- mm/internal.h | 23 ++++++++--------------- mm/rmap.c | 12 +++++++++--- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 12009d9e13636..a6958a9bfc3e1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -754,9 +754,14 @@ void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); -/* - * Return the start of user virtual address at the specific offset within - * a vma. +/** + * vma_pgoff_address - Find the virtual address a page range is mapped at + * @pgoff: The page offset within its object. + * @nr_pages: The number of pages to consider. + * @vma: The vma which maps this object. 
+ * + * If any page in this range is mapped by this VMA, return the first address + * where any of these pages appear. Otherwise, return -EFAULT. */ static inline unsigned long vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, @@ -779,18 +784,6 @@ vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, return address; } -/* - * Return the start of user virtual address of a page within a vma. - * Returns -EFAULT if all of the page is outside the range of vma. - * If page is a compound head, the entire compound page is considered. - */ -static inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) -{ - VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ - return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma); -} - /* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. diff --git a/mm/rmap.c b/mm/rmap.c index 8c10e55d911aa..4ade1f8a8d6d0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -745,6 +745,8 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { struct folio *folio = page_folio(page); + pgoff_t pgoff; + if (folio_test_anon(folio)) { struct anon_vma *page__anon_vma = folio_anon_vma(folio); /* @@ -760,7 +762,9 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } - return vma_address(page, vma); + /* The !page__anon_vma above handles KSM folios */ + pgoff = folio->index + folio_page_idx(folio, page); + return vma_pgoff_address(pgoff, 1, vma); } /* @@ -2475,7 +2479,8 @@ static void rmap_walk_anon(struct folio *folio, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(&folio->page, vma); + unsigned long address = vma_pgoff_address(pgoff_start, + folio_nr_pages(folio), vma); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2536,7 +2541,8 @@ static void rmap_walk_file(struct folio *folio, lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { - unsigned long address = vma_address(&folio->page, vma); + unsigned long address = vma_pgoff_address(pgoff_start, + folio_nr_pages(folio), vma); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); From 99d18a9943bef7ba3ad4355b9de3e1b8f2f2a54a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Mar 2024 22:58:29 +0000 Subject: [PATCH 03/17] mm: rename vma_pgoff_address back to vma_address With all callers converted, we can use the nice shorter name. Take this opportunity to reorder the arguments to the logical order (larger object first). 
Link: https://lkml.kernel.org/r/20240328225831.1765286-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton (cherry picked from commit e0abfbb67142448d57d7841b749d35981a0b92c7) --- mm/internal.h | 9 ++++----- mm/memory-failure.c | 2 +- mm/page_vma_mapped.c | 2 +- mm/rmap.c | 12 ++++++------ 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index a6958a9bfc3e1..fb481284374dc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -755,17 +755,16 @@ void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /** - * vma_pgoff_address - Find the virtual address a page range is mapped at + * vma_address - Find the virtual address a page range is mapped at + * @vma: The vma which maps this object. * @pgoff: The page offset within its object. * @nr_pages: The number of pages to consider. - * @vma: The vma which maps this object. * * If any page in this range is mapped by this VMA, return the first address * where any of these pages appear. Otherwise, return -EFAULT. */ -static inline unsigned long -vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, - struct vm_area_struct *vma) +static inline unsigned long vma_address(struct vm_area_struct *vma, + pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e02593464b063..c396f38ba5688 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -455,7 +455,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma); if (is_zone_device_page(p)) { if (fsdax_pgoff != FSDAX_INVALID_PGOFF) - tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma); + tk->addr = vma_address(vma, fsdax_pgoff, 1); tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); } else tk->size_shift = page_shift(compound_head(p)); diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index b78765235db22..f9f847dc97510 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -338,7 +338,7 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) .flags = PVMW_SYNC, }; - pvmw.address = vma_pgoff_address(pgoff, 1, vma); + pvmw.address = vma_address(vma, pgoff, 1); if (pvmw.address == -EFAULT) return 0; if (!page_vma_mapped_walk(&pvmw)) diff --git a/mm/rmap.c b/mm/rmap.c index 4ade1f8a8d6d0..bcd44172e853e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -764,7 +764,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) /* The !page__anon_vma above handles KSM folios */ pgoff = folio->index + folio_page_idx(folio, page); - return vma_pgoff_address(pgoff, 1, vma); + return vma_address(vma, pgoff, 1); } /* @@ -1102,7 +1102,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, if (invalid_mkclean_vma(vma, NULL)) return 0; - pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma); + pvmw.address = vma_address(vma, pgoff, nr_pages); VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma); return page_vma_mkclean_one(&pvmw); @@ -2479,8 +2479,8 @@ static void rmap_walk_anon(struct folio *folio, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_pgoff_address(pgoff_start, - folio_nr_pages(folio), vma); + unsigned long address = vma_address(vma, pgoff_start, + folio_nr_pages(folio)); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2541,8 +2541,8 @@ static void 
rmap_walk_file(struct folio *folio, lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { - unsigned long address = vma_pgoff_address(pgoff_start, - folio_nr_pages(folio), vma); + unsigned long address = vma_address(vma, pgoff_start, + folio_nr_pages(folio)); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); From cacbf6d55e849a1620093acde37fb5b2cafea03d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:34:58 +0100 Subject: [PATCH 04/17] mm/memory-failure: remove fsdax_pgoff argument from __add_to_kill Patch series "Some cleanups for memory-failure", v3. A lot of folio conversions, plus some other simplifications. This patch (of 11): Unify the KSM and DAX codepaths by calculating the addr in add_to_kill_fsdax() instead of telling __add_to_kill() to calculate it. Link: https://lkml.kernel.org/r/20240412193510.2356957-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240412193510.2356957-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Miaohe Lin Reviewed-by: Jane Chu Reviewed-by: Dan Williams Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit 1c0501e8315c0713c3fbc7a2df7fbbf151fb214b) --- mm/memory-failure.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c396f38ba5688..4bd43628ae123 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -428,21 +428,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * not much we can do. We just print a message and ignore otherwise. */ -#define FSDAX_INVALID_PGOFF ULONG_MAX - /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. - * - * Note: @fsdax_pgoff is used only when @p is a fsdax page and a - * filesystem with a memory failure handler has claimed the - * memory_failure event. In all other cases, page->index and - * page->mapping are sufficient for mapping the page back to its - * corresponding user virtual address. */ static void __add_to_kill(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr, pgoff_t fsdax_pgoff) + unsigned long addr) { struct to_kill *tk; @@ -452,12 +444,10 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, return; } - tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma); - if (is_zone_device_page(p)) { - if (fsdax_pgoff != FSDAX_INVALID_PGOFF) - tk->addr = vma_address(vma, fsdax_pgoff, 1); + tk->addr = addr ? 
addr : page_address_in_vma(p, vma); + if (is_zone_device_page(p)) tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); - } else + else tk->size_shift = page_shift(compound_head(p)); /* @@ -487,7 +477,7 @@ static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill) { - __add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF); + __add_to_kill(tsk, p, vma, to_kill, 0); } #ifdef CONFIG_KSM @@ -505,10 +495,10 @@ static bool task_in_to_kill_list(struct list_head *to_kill, } void add_to_kill_ksm(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr) + unsigned long addr) { if (!task_in_to_kill_list(to_kill, tsk)) - __add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF); + __add_to_kill(tsk, p, vma, to_kill, addr); } #endif /* @@ -682,7 +672,8 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, pgoff_t pgoff) { - __add_to_kill(tsk, p, vma, to_kill, 0, pgoff); + unsigned long addr = vma_address(vma, pgoff, 1); + __add_to_kill(tsk, p, vma, to_kill, addr); } /* From 28b62846a9e79fb4c68c994c22f97ee068171d71 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:34:59 +0100 Subject: [PATCH 05/17] mm/memory-failure: pass addr to __add_to_kill() Handle anon/file folios the same way as KSM & DAX folios by passing in the address. Link: https://lkml.kernel.org/r/20240412193510.2356957-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Miaohe Lin Reviewed-by: Jane Chu Reviewed-by: Oscar Salvador Cc: Dan Williams Signed-off-by: Andrew Morton (cherry picked from commit f2b37197c267206979213592923b418039d232dd) --- mm/memory-failure.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4bd43628ae123..4912809020f26 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -444,7 +444,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, return; } - tk->addr = addr ? addr : page_address_in_vma(p, vma); + tk->addr = addr; if (is_zone_device_page(p)) tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); else @@ -477,7 +477,8 @@ static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill) { - __add_to_kill(tsk, p, vma, to_kill, 0); + unsigned long addr = page_address_in_vma(p, vma); + __add_to_kill(tsk, p, vma, to_kill, addr); } #ifdef CONFIG_KSM @@ -493,6 +494,7 @@ static bool task_in_to_kill_list(struct list_head *to_kill, return false; } + void add_to_kill_ksm(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long addr) From ff362c139f902dd07eb5b88e23feade5b86c8b19 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:00 +0100 Subject: [PATCH 06/17] mm: return the address from page_mapped_in_vma() The only user of this function calls page_address_in_vma() immediately after page_mapped_in_vma() calculates it and uses it to return true/false. Return the address instead, allowing memory-failure to skip the call to page_address_in_vma(). 
Link: https://lkml.kernel.org/r/20240412193510.2356957-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Miaohe Lin Reviewed-by: Jane Chu Cc: Dan Williams Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit 37bc2ff506b184411e4cc80f111c638b2b4c83d4) --- include/linux/rmap.h | 2 +- mm/memory-failure.c | 22 +++++++++++++--------- mm/page_vma_mapped.c | 16 +++++++++------- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b1fb58b435a98..ffd0a4295dc1b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -441,7 +441,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); +unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* * rmap_walk_control: To control rmap traversing for specific needs diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4912809020f26..e52b27592b6ff 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -474,10 +474,11 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, } static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, - struct list_head *to_kill) + struct vm_area_struct *vma, struct list_head *to_kill, + unsigned long addr) { - unsigned long addr = page_address_in_vma(p, vma); + if (addr == -EFAULT) + return; __add_to_kill(tsk, p, vma, to_kill, addr); } @@ -602,7 +603,6 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) static void collect_procs_anon(struct folio *folio, struct page *page, struct list_head *to_kill, int force_early) { - struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; pgoff_t pgoff; @@ -614,8 +614,10 @@ static void collect_procs_anon(struct folio *folio, struct page *page, pgoff = page_to_pgoff(page); rcu_read_lock(); for_each_process(tsk) { + struct vm_area_struct *vma; struct anon_vma_chain *vmac; struct task_struct *t = task_early_kill(tsk, force_early); + unsigned long addr; if (!t) continue; @@ -624,9 +626,8 @@ static void collect_procs_anon(struct folio *folio, struct page *page, vma = vmac->vma; if (vma->vm_mm != t->mm) continue; - if (!page_mapped_in_vma(page, vma)) - continue; - add_to_kill_anon_file(t, page, vma, to_kill); + addr = page_mapped_in_vma(page, vma); + add_to_kill_anon_file(t, page, vma, to_kill, addr); } } rcu_read_unlock(); @@ -649,6 +650,7 @@ static void collect_procs_file(struct folio *folio, struct page *page, pgoff = page_to_pgoff(page); for_each_process(tsk) { struct task_struct *t = task_early_kill(tsk, force_early); + unsigned long addr; if (!t) continue; @@ -661,8 +663,10 @@ static void collect_procs_file(struct folio *folio, struct page *page, * Assume applications who requested early kill want * to be informed of all such data corruptions. 
*/ - if (vma->vm_mm == t->mm) - add_to_kill_anon_file(t, page, vma, to_kill); + if (vma->vm_mm != t->mm) + continue; + addr = page_address_in_vma(page, vma); + add_to_kill_anon_file(t, page, vma, to_kill, addr); } } rcu_read_unlock(); diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index f9f847dc97510..2592dcdb1e866 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -323,11 +323,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) * @page: the page to test * @vma: the VMA to test * - * Returns 1 if the page is mapped into the page tables of the VMA, 0 - * if the page is not mapped into the page tables of this VMA. Only - * valid for normal file or anonymous VMAs. + * Return: The address the page is mapped at if the page is in the range + * covered by the VMA and present in the page table. If the page is + * outside the VMA or not present, returns -EFAULT. + * Only valid for normal file or anonymous VMAs. */ -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { struct folio *folio = page_folio(page); pgoff_t pgoff = folio->index + folio_page_idx(folio, page); @@ -340,9 +341,10 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) pvmw.address = vma_address(vma, pgoff, 1); if (pvmw.address == -EFAULT) - return 0; + goto out; if (!page_vma_mapped_walk(&pvmw)) - return 0; + return -EFAULT; page_vma_mapped_walk_done(&pvmw); - return 1; +out: + return pvmw.address; } From cd7e0c72a9d0eaaeb45989428ea7076d821ce09b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:01 +0100 Subject: [PATCH 07/17] mm: make page_mapped_in_vma conditional on CONFIG_MEMORY_FAILURE This function is only currently used by the memory-failure code, so we can omit it if we're not compiling in the memory-failure code. Link: https://lkml.kernel.org/r/20240412193510.2356957-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Suggested-by: Miaohe Lin Reviewed-by: Jane Chu Reviewed-by: Oscar Salvador Acked-by: Miaohe Lin Cc: Dan Williams Signed-off-by: Andrew Morton (cherry picked from commit b87f978dc77519943b659dc8b753f544772e7c85) --- mm/page_vma_mapped.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 2592dcdb1e866..774c662c76939 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -318,6 +318,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) return false; } +#ifdef CONFIG_MEMORY_FAILURE /** * page_mapped_in_vma - check whether a page is really mapped in a VMA * @page: the page to test @@ -348,3 +349,4 @@ unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) out: return pvmw.address; } +#endif From 2d1e6f1c814b22e951fe787dd07ba9f5e19af4f8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:02 +0100 Subject: [PATCH 08/17] mm/memory-failure: convert shake_page() to shake_folio() Removes two calls to compound_head(). Move the prototype to internal.h; we definitely don't want code outside mm using it. 
Link: https://lkml.kernel.org/r/20240412193510.2356957-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jane Chu Acked-by: Miaohe Lin Cc: Dan Williams Cc: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit fed5348ee2b136c84c5a27d6fceef14066beeb66) --- include/linux/mm.h | 1 - mm/hwpoison-inject.c | 11 ++++++----- mm/internal.h | 1 + mm/memory-failure.c | 15 ++++++++++----- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dea152b606cb2..6078af58f4864 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3913,7 +3913,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index d0548e382b6ba..c9d653f51e45b 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -15,7 +15,7 @@ static int hwpoison_inject(void *data, u64 val) { unsigned long pfn = val; struct page *p; - struct page *hpage; + struct folio *folio; int err; if (!capable(CAP_SYS_ADMIN)) @@ -25,16 +25,17 @@ static int hwpoison_inject(void *data, u64 val) return -ENXIO; p = pfn_to_page(pfn); - hpage = compound_head(p); + folio = page_folio(p); if (!hwpoison_filter_enable) goto inject; - shake_page(hpage); + shake_folio(folio); /* * This implies unable to support non-LRU pages except free page. */ - if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p)) + if (!folio_test_lru(folio) && !folio_test_hugetlb(folio) && + !is_free_buddy_page(p)) return 0; /* @@ -42,7 +43,7 @@ static int hwpoison_inject(void *data, u64 val) * the targeted owner (or on a free page). * memory_failure() will redo the check reliably inside page lock. */ - err = hwpoison_filter(hpage); + err = hwpoison_filter(&folio->page); if (err) return 0; diff --git a/mm/internal.h b/mm/internal.h index fb481284374dc..eae810a7fa496 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -904,6 +904,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) /* * mm/memory-failure.c */ +void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e52b27592b6ff..464173fa15b07 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -370,20 +370,25 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) * Unknown page type encountered. Try to check whether it can turn PageLRU by * lru_add_drain_all. */ -void shake_page(struct page *p) +void shake_folio(struct folio *folio) { - if (PageHuge(p)) + if (folio_test_hugetlb(folio)) return; /* * TODO: Could shrink slab caches here if a lightweight range-based * shrinker will be available. 
*/ - if (PageSlab(p)) + if (folio_test_slab(folio)) return; lru_add_drain_all(); } -EXPORT_SYMBOL_GPL(shake_page); +EXPORT_SYMBOL_GPL(shake_folio); + +static void shake_page(struct page *page) +{ + shake_folio(page_folio(page)); +} static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, unsigned long address) @@ -1656,7 +1661,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * shake_page() again to ensure that it's flushed. */ if (mlocked) - shake_page(hpage); + shake_folio(folio); /* * Now that the dirty bit has been propagated to the From d3f368d9e556b51228f591fd606c118204ad061e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:03 +0100 Subject: [PATCH 09/17] mm: convert hugetlb_page_mapping_lock_write to folio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The page is only used to get the mapping, so the folio will do just as well. Both callers already have a folio available, so this saves a call to compound_head(). Link: https://lkml.kernel.org/r/20240412193510.2356957-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jane Chu  Reviewed-by: Oscar Salvador Acked-by: Miaohe Lin Cc: Dan Williams Signed-off-by: Andrew Morton (cherry picked from commit 6e8cda4c2c87b2a44828e651a10705647a6fd542) --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 6 +++--- mm/memory-failure.c | 2 +- mm/migrate.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4034912ada51a..3047dd6ba8747 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -175,7 +175,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud); -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; @@ -298,8 +298,8 @@ static inline unsigned long hugetlb_total_pages(void) return 0; } -static inline struct address_space *hugetlb_page_mapping_lock_write( - struct page *hpage) +static inline struct address_space *hugetlb_folio_mapping_lock_write( + struct folio *folio) { return NULL; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f25f119de64a8..56ac890f42c9e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2087,13 +2087,13 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, /* * Find and lock address space (mapping) in write mode. * - * Upon entry, the page is locked which means that page_mapping() is + * Upon entry, the folio is locked which means that folio_mapping() is * stable. Due to locking order, we can only trylock_write. If we can * not get the lock, simply return NULL to caller. 
*/ -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) { - struct address_space *mapping = page_mapping(hpage); + struct address_space *mapping = folio_mapping(folio); if (!mapping) return mapping; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 464173fa15b07..74d95b33d7391 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1641,7 +1641,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * TTU_RMAP_LOCKED to indicate we have taken the lock * at this higher level. */ - mapping = hugetlb_page_mapping_lock_write(hpage); + mapping = hugetlb_folio_mapping_lock_write(folio); if (mapping) { try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); i_mmap_unlock_write(mapping); diff --git a/mm/migrate.c b/mm/migrate.c index c9eaacd5d6b92..5cfe98bf20fe3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1419,7 +1419,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, * semaphore in write mode here and set TTU_RMAP_LOCKED * to let lower levels know we have taken the lock. */ - mapping = hugetlb_page_mapping_lock_write(&src->page); + mapping = hugetlb_folio_mapping_lock_write(src); if (unlikely(!mapping)) goto unlock_put_anon; From 9dcba1afa6054d40053a0853c5a8b35fb0bfc521 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:04 +0100 Subject: [PATCH 10/17] mm/memory-failure: convert memory_failure() to use a folio Saves dozens of calls to compound_head(). Link: https://lkml.kernel.org/r/20240412193510.2356957-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Miaohe Lin Cc: Dan Williams Cc: Jane Chu Cc: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit 5dba5c356ab3bbf6b00a42632f3e14728f327553) --- mm/memory-failure.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 74d95b33d7391..24ea4ba1c8d18 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2200,7 +2200,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, int memory_failure(unsigned long pfn, int flags) { struct page *p; - struct page *hpage; + struct folio *folio; struct dev_pagemap *pgmap; int res = 0; unsigned long page_flags; @@ -2288,8 +2288,8 @@ int memory_failure(unsigned long pfn, int flags) } } - hpage = compound_head(p); - if (PageTransHuge(hpage)) { + folio = page_folio(p); + if (folio_test_large(folio)) { /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. @@ -2303,12 +2303,13 @@ int memory_failure(unsigned long pfn, int flags) * or unhandlable page. The refcount is bumped iff the * page is a valid handlable page. */ - SetPageHasHWPoisoned(hpage); + folio_set_has_hwpoisoned(folio); if (try_to_split_thp_page(p) < 0) { res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); goto unlock_mutex; } VM_BUG_ON_PAGE(!page_count(p), p); + folio = page_folio(p); } /* @@ -2319,9 +2320,9 @@ int memory_failure(unsigned long pfn, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - shake_page(p); + shake_folio(folio); - lock_page(p); + folio_lock(folio); /* * We're only intended to deal with the non-Compound page here. @@ -2329,11 +2330,11 @@ int memory_failure(unsigned long pfn, int flags) * race window. 
If this happens, we could try again to hopefully * handle the page next round. */ - if (PageCompound(p)) { + if (folio_test_large(folio)) { if (retry) { ClearPageHWPoison(p); - unlock_page(p); - put_page(p); + folio_unlock(folio); + folio_put(folio); flags &= ~MF_COUNT_INCREASED; retry = false; goto try_again; @@ -2349,29 +2350,29 @@ int memory_failure(unsigned long pfn, int flags) * page_remove_rmap() in try_to_unmap_one(). So to determine page status * correctly, we save a copy of the page flags at this time. */ - page_flags = p->flags; + page_flags = folio->flags; if (hwpoison_filter(p)) { ClearPageHWPoison(p); - unlock_page(p); - put_page(p); + folio_unlock(folio); + folio_put(folio); res = -EOPNOTSUPP; goto unlock_mutex; } /* - * __munlock_folio() may clear a writeback page's LRU flag without - * page_lock. We need wait writeback completion for this page or it - * may trigger vfs BUG while evict inode. + * __munlock_folio() may clear a writeback folio's LRU flag without + * the folio lock. We need to wait for writeback completion for this + * folio or it may trigger a vfs BUG while evicting inode. */ - if (!PageLRU(p) && !PageWriteback(p)) + if (!folio_test_lru(folio) && !folio_test_writeback(folio)) goto identify_page_state; /* * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - wait_on_page_writeback(p); + folio_wait_writeback(folio); /* * Now take care of user space mappings. @@ -2385,7 +2386,8 @@ int memory_failure(unsigned long pfn, int flags) /* * Torn down by someone else? */ - if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + if (folio_test_lru(folio) && !folio_test_swapcache(folio) && + folio->mapping == NULL) { res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); goto unlock_page; } @@ -2395,7 +2397,7 @@ int memory_failure(unsigned long pfn, int flags) mutex_unlock(&mf_mutex); return res; unlock_page: - unlock_page(p); + folio_unlock(folio); unlock_mutex: mutex_unlock(&mf_mutex); return res; From 205e68b3dd14cfb8af10e974bcf13256eff35378 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:05 +0100 Subject: [PATCH 11/17] mm/memory-failure: convert hwpoison_user_mappings to take a folio Pass the folio from the callers, and use it throughout instead of hpage. Saves dozens of calls to compound_head(). Link: https://lkml.kernel.org/r/20240412193510.2356957-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Miaohe Lin Reviewed-by: Jane Chu Cc: Dan Williams Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit 03468a0f52893b8dea4a96677ad9ff78bf55d765) --- mm/memory-failure.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 24ea4ba1c8d18..835a1a39b4679 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1576,24 +1576,24 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. 
*/ -static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, - int flags, struct page *hpage) +static bool hwpoison_user_mappings(struct folio *folio, struct page *p, + unsigned long pfn, int flags) { - struct folio *folio = page_folio(hpage); enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; int forcekill; - bool mlocked = PageMlocked(hpage); + bool mlocked = folio_test_mlocked(folio); /* * Here we are interested only in user-mapped pages, so skip any * other types of pages. */ - if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p)) + if (folio_test_reserved(folio) || folio_test_slab(folio) || + folio_test_pgtable(folio) || folio_test_offline(folio)) return true; - if (!(PageLRU(hpage) || PageHuge(p))) + if (!(folio_test_lru(folio) || folio_test_hugetlb(folio))) return true; /* @@ -1603,7 +1603,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (!page_mapped(p)) return true; - if (PageSwapCache(p)) { + if (folio_test_swapcache(folio)) { pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); ttu &= ~TTU_HWPOISON; } @@ -1614,11 +1614,11 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * XXX: the dirty test could be racy: set_page_dirty() may not always * be called inside page lock (it's recommended but not enforced). */ - mapping = page_mapping(hpage); - if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && + mapping = folio_mapping(folio); + if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping && mapping_can_writeback(mapping)) { - if (page_mkclean(hpage)) { - SetPageDirty(hpage); + if (folio_mkclean(folio)) { + folio_set_dirty(folio); } else { ttu &= ~TTU_HWPOISON; pr_info("%#lx: corrupted page was clean: dropped without side effects\n", @@ -1633,7 +1633,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, */ collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - if (PageHuge(hpage) && !PageAnon(hpage)) { + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * For hugetlb pages in shared mappings, try_to_unmap * could potentially call huge_pmd_unshare. Because of @@ -1673,7 +1673,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) || + forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) || !unmap_success; kill_procs(&tokill, forcekill, !unmap_success, pfn, flags); @@ -2111,7 +2111,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb page_flags = folio->flags; - if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) { + if (!hwpoison_user_mappings(folio, p, pfn, flags)) { folio_unlock(folio); return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); } @@ -2378,7 +2378,7 @@ int memory_failure(unsigned long pfn, int flags) * Now take care of user space mappings. * Abort on fail: __filemap_remove_folio() assumes unmapped page. 
*/ - if (!hwpoison_user_mappings(p, pfn, flags, p)) { + if (!hwpoison_user_mappings(folio, p, pfn, flags)) { res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); goto unlock_page; } From 253e8eab1ffe66cc2a93a74f624504e5c1892ad6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:06 +0100 Subject: [PATCH 12/17] mm/memory-failure: add some folio conversions to unpoison_memory Some of these folio APIs didn't exist when the unpoison_memory() conversion was done originally. Link: https://lkml.kernel.org/r/20240412193510.2356957-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Miaohe Lin Reviewed-by: Jane Chu Cc: Dan Williams Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit ee299e9849736f60e6e01f7a5dcb258de7c7d1b9) --- mm/memory-failure.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 835a1a39b4679..43c49c5e42819 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2575,8 +2575,8 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (folio_test_slab(folio) || PageTable(&folio->page) || - folio_test_reserved(folio) || PageOffline(&folio->page)) + if (folio_test_slab(folio) || folio_test_pgtable(folio) || + folio_test_reserved(folio) || folio_test_offline(folio)) goto unlock_mutex; /* @@ -2597,7 +2597,7 @@ int unpoison_memory(unsigned long pfn) ghp = get_hwpoison_page(p, MF_UNPOISON); if (!ghp) { - if (PageHuge(p)) { + if (folio_test_hugetlb(folio)) { huge = true; count = folio_free_raw_hwp(folio, false); if (count == 0) @@ -2613,7 +2613,7 @@ int unpoison_memory(unsigned long pfn) pfn, &unpoison_rs); } } else { - if (PageHuge(p)) { + if (folio_test_hugetlb(folio)) { huge = true; count = folio_free_raw_hwp(folio, false); if (count == 0) { From dccef487577b94918be2faf06769489cf636fc76 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:07 +0100 Subject: [PATCH 13/17] mm/memory-failure: use folio functions throughout collect_procs() Saves a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20240412193510.2356957-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jane Chu Acked-by: Miaohe Lin Cc: Dan Williams Cc: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit 0edb5b282ac5a4f9b1bdc22120c9b145be315622) --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 43c49c5e42819..c8ac581f59239 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -722,9 +722,9 @@ static void collect_procs(struct folio *folio, struct page *page, { if (!folio->mapping) return; - if (unlikely(PageKsm(page))) + if (unlikely(folio_test_ksm(folio))) collect_procs_ksm(page, tokill, force_early); - else if (PageAnon(page)) + else if (folio_test_anon(folio)) collect_procs_anon(folio, page, tokill, force_early); else collect_procs_file(folio, page, tokill, force_early); From 8116253c31d4f2bb0e7d6217da0c57adbc48a51d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:08 +0100 Subject: [PATCH 14/17] mm/memory-failure: pass the folio to collect_procs_ksm() We've already calculated it, so pass it in instead of recalculating it in collect_procs_ksm(). 
Link: https://lkml.kernel.org/r/20240412193510.2356957-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jane Chu Reviewed-by: Miaohe Lin Cc: Dan Williams Cc: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit b650e1d2aefbbb31e7578ad60a0a71bf5e5c5346) --- include/linux/ksm.h | 14 +++----------- mm/ksm.c | 5 ++--- mm/memory-failure.c | 2 +- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 165aabddb7234..73fe1538d525a 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -91,15 +91,9 @@ struct page *ksm_might_need_to_copy(struct page *page, void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); - -#ifdef CONFIG_MEMORY_FAILURE -void collect_procs_ksm(struct page *page, struct list_head *to_kill, - int force_early); -#endif - -#ifdef CONFIG_PROC_FS +void collect_procs_ksm(struct folio *folio, struct page *page, + struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); -#endif /* CONFIG_PROC_FS */ #else /* !CONFIG_KSM */ @@ -129,12 +123,10 @@ static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } -#ifdef CONFIG_MEMORY_FAILURE -static inline void collect_procs_ksm(struct page *page, +static inline void collect_procs_ksm(struct folio *folio, struct page *page, struct list_head *to_kill, int force_early) { } -#endif #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, diff --git a/mm/ksm.c b/mm/ksm.c index 2e4cd681622de..8d7d200769c02 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2906,12 +2906,11 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) /* * Collect processes when the error hit an ksm page. */ -void collect_procs_ksm(struct page *page, struct list_head *to_kill, - int force_early) +void collect_procs_ksm(struct folio *folio, struct page *page, + struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; - struct folio *folio = page_folio(page); struct vm_area_struct *vma; struct task_struct *tsk; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c8ac581f59239..4a9d7c9c58249 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -723,7 +723,7 @@ static void collect_procs(struct folio *folio, struct page *page, if (!folio->mapping) return; if (unlikely(folio_test_ksm(folio))) - collect_procs_ksm(page, tokill, force_early); + collect_procs_ksm(folio, page, tokill, force_early); else if (folio_test_anon(folio)) collect_procs_anon(folio, page, tokill, force_early); else From 323b28550cbf4117488650e4a8151d8409786387 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 27 Aug 2024 19:47:24 +0800 Subject: [PATCH 15/17] mm: memory_hotplug: remove head variable in do_migrate_range() Patch series "mm: memory_hotplug: improve do_migrate_range()", v3. Unify hwpoisoned page handling and isolation of HugeTLB/LRU/non-LRU movable page, also convert to use folios in do_migrate_range(). This patch (of 5): Directly use a folio for HugeTLB and THP when calculate the next pfn, then remove unused head variable. 
Link: https://lkml.kernel.org/r/20240827114728.3212578-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240827114728.3212578-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand Cc: Dan Carpenter Cc: Jonathan Cameron Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit b62b51d2d1593e6590669e781768c3c5a7394eb5) --- mm/memory_hotplug.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aab1669054520..0a1f5d59a27fe 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1703,7 +1703,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end, static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; - struct page *page, *head; + struct page *page; LIST_HEAD(source); static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1716,14 +1716,20 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) continue; page = pfn_to_page(pfn); folio = page_folio(page); - head = &folio->page; - if (PageHuge(page)) { - pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_hugetlb(folio, &source); - continue; - } else if (PageTransHuge(page)) - pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; + /* + * No reference or lock is held on the folio, so it might + * be modified concurrently (e.g. split). As such, + * folio_nr_pages() may read garbage. This is fine as the outer + * loop will revisit the split folio later. + */ + if (folio_test_large(folio)) { + pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; + if (folio_test_hugetlb(folio)) { + isolate_hugetlb(folio, &source); + continue; + } + } /* * HWPoison pages have elevated reference counts so the migration would From 37da28d0eeb04a05aae9260ae46bf5f1107c143b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 27 Aug 2024 19:47:25 +0800 Subject: [PATCH 16/17] mm: memory-failure: add unmap_poisoned_folio() Add unmap_poisoned_folio() helper which will be reused by do_migrate_range() from memory hotplug soon. 
[akpm@linux-foundation.org: whitespace tweak, per Miaohe Lin] Link: https://lkml.kernel.org/r/1f80c7e3-c30d-1ac1-6a36-d1a5f5907f7c@huawei.com Link: https://lkml.kernel.org/r/20240827114728.3212578-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Acked-by: Miaohe Lin Cc: Dan Carpenter Cc: Jonathan Cameron Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit 16038c4fffd802cf5bc7391fedfe8228dd0adf53) --- mm/internal.h | 8 ++++++++ mm/memory-failure.c | 44 +++++++++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index eae810a7fa496..9b4df4228d9ed 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -904,6 +904,8 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) /* * mm/memory-failure.c */ +#ifdef CONFIG_MEMORY_FAILURE +void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu); void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); @@ -914,6 +916,12 @@ extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; +#else +static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +{ +} +#endif + extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4a9d7c9c58249..285d8b4d383ef 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1572,6 +1572,32 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) return ret; } +void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +{ + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { + struct address_space *mapping; + + /* + * For hugetlb folios in shared mappings, try_to_unmap + * could potentially call huge_pmd_unshare. Because of + * this, take semaphore in write mode here and set + * TTU_RMAP_LOCKED to indicate we have taken the lock + * at this higher level. + */ + mapping = hugetlb_folio_mapping_lock_write(folio); + if (!mapping) { + pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n", + folio_pfn(folio)); + return; + } + + try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); + } else { + try_to_unmap(folio, ttu); + } +} + /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -1633,23 +1659,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, */ collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { - /* - * For hugetlb pages in shared mappings, try_to_unmap - * could potentially call huge_pmd_unshare. Because of - * this, take semaphore in write mode here and set - * TTU_RMAP_LOCKED to indicate we have taken the lock - * at this higher level. 
- */ - mapping = hugetlb_folio_mapping_lock_write(folio); - if (mapping) { - try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); - } else - pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn); - } else { - try_to_unmap(folio, ttu); - } + unmap_poisoned_folio(folio, ttu); unmap_success = !page_mapped(p); if (!unmap_success) From 84a1e3b4cf08c91fb55edc625493779e7d59b524 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Tue, 18 Mar 2025 16:39:39 +0800 Subject: [PATCH 17/17] mm/vmscan: don't try to reclaim hwpoison folio mainline inclusion from mainline-v6.15-rc1 category: bugfix commit 1b0449544c6482179ac84530b61fc192a6527bfd upstream. Syzkaller reports a bug as follows: Injecting memory failure for pfn 0x18b00e at process virtual address 0x20ffd000 Memory failure: 0x18b00e: dirty swapcache page still referenced by 2 users Memory failure: 0x18b00e: recovery action for dirty swapcache page: Failed page: refcount:2 mapcount:0 mapping:0000000000000000 index:0x20ffd pfn:0x18b00e memcg:ffff0000dd6d9000 anon flags: 0x5ffffe00482011(locked|dirty|arch_1|swapbacked|hwpoison|node=0|zone=2|lastcpupid=0xfffff) raw: 005ffffe00482011 dead000000000100 dead000000000122 ffff0000e232a7c9 raw: 0000000000020ffd 0000000000000000 00000002ffffffff ffff0000dd6d9000 page dumped because: VM_BUG_ON_FOLIO(!folio_test_uptodate(folio)) ------------[ cut here ]------------ kernel BUG at mm/swap_state.c:184! Internal error: Oops - BUG: 00000000f2000800 [#1] SMP Modules linked in: CPU: 0 PID: 60 Comm: kswapd0 Not tainted 6.6.0-gcb097e7de84e #3 Hardware name: linux,dummy-virt (DT) pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : add_to_swap+0xbc/0x158 lr : add_to_swap+0xbc/0x158 sp : ffff800087f37340 x29: ffff800087f37340 x28: fffffc00052c0380 x27: ffff800087f37780 x26: ffff800087f37490 x25: ffff800087f37c78 x24: ffff800087f377a0 x23: ffff800087f37c50 x22: 0000000000000000 x21: fffffc00052c03b4 x20: 0000000000000000 x19: fffffc00052c0380 x18: 0000000000000000 x17: 296f696c6f662865 x16: 7461646f7470755f x15: 747365745f6f696c x14: 6f6621284f494c4f x13: 0000000000000001 x12: ffff600036d8b97b x11: 1fffe00036d8b97a x10: ffff600036d8b97a x9 : dfff800000000000 x8 : 00009fffc9274686 x7 : ffff0001b6c5cbd3 x6 : 0000000000000001 x5 : ffff0000c25896c0 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000000000000000 x1 : ffff0000c25896c0 x0 : 0000000000000000 Call trace: add_to_swap+0xbc/0x158 shrink_folio_list+0x12ac/0x2648 shrink_inactive_list+0x318/0x948 shrink_lruvec+0x450/0x720 shrink_node_memcgs+0x280/0x4a8 shrink_node+0x128/0x978 balance_pgdat+0x4f0/0xb20 kswapd+0x228/0x438 kthread+0x214/0x230 ret_from_fork+0x10/0x20 I can reproduce this issue with the following steps: 1) When a dirty swapcache page is isolated by reclaim process and the page isn't locked, inject memory failure for the page. me_swapcache_dirty() clears uptodate flag and tries to delete from lru, but fails. Reclaim process will put the hwpoisoned page back to lru. 2) The process that maps the hwpoisoned page exits, the page is deleted the page will never be freed and will be in the lru forever. 3) If we trigger a reclaim again and tries to reclaim the page, add_to_swap() will trigger VM_BUG_ON_FOLIO due to the uptodate flag is cleared. To fix it, skip the hwpoisoned page in shrink_folio_list(). 
Besides, the hwpoison folio may not be unmapped by hwpoison_user_mappings()
yet; unmap it in shrink_folio_list(), otherwise the folio will fail to be
unmapped by hwpoison_user_mappings() since the folio isn't in the lru list.

Link: https://lkml.kernel.org/r/20250318083939.987651-3-tujinjiang@huawei.com
Signed-off-by: Jinjiang Tu
Acked-by: Miaohe Lin
Cc: David Hildenbrand
Cc: Kefeng Wang
Cc: Nanyong Sun
Cc: Naoya Horiguchi
Cc:
Signed-off-by: Andrew Morton
(cherry picked from commit 1b0449544c6482179ac84530b61fc192a6527bfd)
[Guan Wentao: add helper from commit ("mm/hwpoison: introduce folio_contain_hwpoisoned_page() helper")]
Signed-off-by: Wentao Guan
---
 include/linux/page-flags.h | 6 ++++++
 mm/vmscan.c                | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index c8f5438c6fc79..92f4af8020c7d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1046,6 +1046,12 @@ static inline bool is_page_hwpoison(struct page *page)
 	return PageHuge(page) && PageHWPoison(compound_head(page));
 }
 
+static inline bool folio_contain_hwpoisoned_page(struct folio *folio)
+{
+	return folio_test_hwpoison(folio) ||
+		(folio_test_large(folio) && folio_test_has_hwpoisoned(folio));
+}
+
 extern bool is_free_buddy_page(struct page *page);
 
 PAGEFLAG(Isolated, isolated, PF_ANY);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7bde085549564..c7325ecccf888 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1741,6 +1741,13 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 		if (!folio_trylock(folio))
 			goto keep;
 
+		if (folio_contain_hwpoisoned_page(folio)) {
+			unmap_poisoned_folio(folio, folio_pfn(folio), false);
+			folio_unlock(folio);
+			folio_put(folio);
+			continue;
+		}
+
 		VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
 
 		nr_pages = folio_nr_pages(folio);
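
A note on the final hunk: patch 16 of this series backports unmap_poisoned_folio()
in its original two-argument form, void unmap_poisoned_folio(struct folio *folio,
enum ttu_flags ttu), while the call added above uses the later three-argument
mainline form unmap_poisoned_folio(folio, folio_pfn(folio), false), which only
exists after a subsequent upstream signature change that this series does not
include. Against the helper actually present in this tree, the hunk would need a
small adaptation; a minimal sketch is below. The TTU flag choice is borrowed from
hwpoison_user_mappings() and is an assumption, not something taken from the
upstream commit.

	/*
	 * Sketch only, assuming the two-argument unmap_poisoned_folio()
	 * backported in patch 16.  The flags mirror what
	 * hwpoison_user_mappings() passes for a poisoned folio; whether
	 * TTU_HWPOISON is wanted on this path is an assumption.
	 */
	if (folio_contain_hwpoisoned_page(folio)) {
		unmap_poisoned_folio(folio,
				TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON);
		folio_unlock(folio);
		folio_put(folio);
		continue;
	}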