Skip to content

Commit 32e8656

Browse files
author
Audra Mitchell
committed
mm: multi-gen LRU: use {ptep,pmdp}_clear_young_notify()
JIRA: https://issues.redhat.com/browse/RHEL-104908

Conflicts:
    Context conflict in include/linux/mmzone.h due to missing commits:
        745b13e ("mm/mglru: remove CONFIG_MEMCG")
        7eb2d01 ("mm/mglru: remove CONFIG_TRANSPARENT_HUGEPAGE")
    Context conflicts in mm/rmap.c due to downstream RHEL-only commit:
        7535c3f ("Revert "mm: add vma_has_recency()"")
    Context conflicts in mm/vmscan.c due to missing commits:
        7eb2d01 ("mm/mglru: remove CONFIG_TRANSPARENT_HUGEPAGE")
        61dd3f2 ("mm/mglru: add CONFIG_LRU_GEN_WALKS_MMU")

This patch is a backport of the following upstream commit:

commit 1d4832b
Author: Yu Zhao <[email protected]>
Date: Sat Oct 19 01:29:39 2024 +0000

    mm: multi-gen LRU: use {ptep,pmdp}_clear_young_notify()

    When the MM_WALK capability is enabled, memory that is mostly accessed by a VM appears younger than it really is, therefore this memory will be less likely to be evicted. Therefore, the presence of a running VM can significantly increase swap-outs for non-VM memory, regressing the performance for the rest of the system.

    Fix this regression by always calling {ptep,pmdp}_clear_young_notify() whenever we clear the young bits on PMDs/PTEs.

    [[email protected]: fix link-time error]
    Link: https://lkml.kernel.org/r/[email protected]
    Fixes: bd74fda ("mm: multi-gen LRU: support page table walks")
    Signed-off-by: Yu Zhao <[email protected]>
    Signed-off-by: James Houghton <[email protected]>
    Reported-by: David Stevens <[email protected]>
    Cc: Axel Rasmussen <[email protected]>
    Cc: David Matlack <[email protected]>
    Cc: David Rientjes <[email protected]>
    Cc: Oliver Upton <[email protected]>
    Cc: Paolo Bonzini <[email protected]>
    Cc: Sean Christopherson <[email protected]>
    Cc: Wei Xu <[email protected]>
    Cc: <[email protected]>
    Cc: kernel test robot <[email protected]>
    Signed-off-by: Andrew Morton <[email protected]>

Signed-off-by: Audra Mitchell <[email protected]>
1 parent 6c24349 commit 32e8656

File tree

3 files changed

+56
-47
lines changed

3 files changed

+56
-47
lines changed

include/linux/mmzone.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -498,7 +498,7 @@ struct lru_gen_mm_walk {
498498
};
499499

500500
void lru_gen_init_lruvec(struct lruvec *lruvec);
501-
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
501+
bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
502502

503503
#ifdef CONFIG_MEMCG
504504

@@ -590,8 +590,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
590590
{
591591
}
592592

593-
static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
593+
static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
594594
{
595+
return false;
595596
}
596597

597598
#ifdef CONFIG_MEMCG

mm/rmap.c

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -820,13 +820,11 @@ static bool folio_referenced_one(struct folio *folio,
820820
return false; /* To break the loop */
821821
}
822822

823-
if (pvmw.pte) {
824-
if (lru_gen_enabled() && pte_young(ptep_get(pvmw.pte)) &&
825-
!(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
826-
lru_gen_look_around(&pvmw);
823+
if ((lru_gen_enabled() && pvmw.pte) &&
824+
!(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
825+
if (lru_gen_look_around(&pvmw))
827826
referenced++;
828-
}
829-
827+
} else if (pvmw.pte) {
830828
if (ptep_clear_flush_young_notify(vma, address,
831829
pvmw.pte)) {
832830
/*

mm/vmscan.c

Lines changed: 49 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
#include <linux/khugepaged.h>
5858
#include <linux/rculist_nulls.h>
5959
#include <linux/random.h>
60+
#include <linux/mmu_notifier.h>
6061

6162
#include <asm/tlbflush.h>
6263
#include <asm/div64.h>
@@ -3214,7 +3215,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk
32143215
return false;
32153216
}
32163217

3217-
static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
3218+
static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr,
3219+
struct pglist_data *pgdat)
32183220
{
32193221
unsigned long pfn = pte_pfn(pte);
32203222

@@ -3226,14 +3228,21 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
32263228
if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
32273229
return -1;
32283230

3231+
if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
3232+
return -1;
3233+
32293234
if (WARN_ON_ONCE(!pfn_valid(pfn)))
32303235
return -1;
32313236

3237+
if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3238+
return -1;
3239+
32323240
return pfn;
32333241
}
32343242

32353243
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
3236-
static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
3244+
static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr,
3245+
struct pglist_data *pgdat)
32373246
{
32383247
unsigned long pfn = pmd_pfn(pmd);
32393248

@@ -3245,9 +3254,15 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
32453254
if (WARN_ON_ONCE(pmd_devmap(pmd)))
32463255
return -1;
32473256

3257+
if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
3258+
return -1;
3259+
32483260
if (WARN_ON_ONCE(!pfn_valid(pfn)))
32493261
return -1;
32503262

3263+
if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3264+
return -1;
3265+
32513266
return pfn;
32523267
}
32533268
#endif
@@ -3257,10 +3272,6 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
32573272
{
32583273
struct folio *folio;
32593274

3260-
/* try to avoid unnecessary memory loads */
3261-
if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3262-
return NULL;
3263-
32643275
folio = pfn_folio(pfn);
32653276
if (folio_nid(folio) != pgdat->node_id)
32663277
return NULL;
@@ -3315,20 +3326,16 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
33153326
total++;
33163327
walk->mm_stats[MM_LEAF_TOTAL]++;
33173328

3318-
pfn = get_pte_pfn(ptent, args->vma, addr);
3329+
pfn = get_pte_pfn(ptent, args->vma, addr, pgdat);
33193330
if (pfn == -1)
33203331
continue;
33213332

3322-
if (!pte_young(ptent)) {
3323-
continue;
3324-
}
3325-
33263333
folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
33273334
if (!folio)
33283335
continue;
33293336

3330-
if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
3331-
VM_WARN_ON_ONCE(true);
3337+
if (!ptep_clear_young_notify(args->vma, addr, pte + i))
3338+
continue;
33323339

33333340
young++;
33343341
walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3394,21 +3401,25 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
33943401
/* don't round down the first address */
33953402
addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
33963403

3397-
pfn = get_pmd_pfn(pmd[i], vma, addr);
3398-
if (pfn == -1)
3404+
if (!pmd_present(pmd[i]))
33993405
goto next;
34003406

34013407
if (!pmd_trans_huge(pmd[i])) {
3402-
if (!walk->force_scan && should_clear_pmd_young())
3408+
if (!walk->force_scan && should_clear_pmd_young() &&
3409+
!mm_has_notifiers(args->mm))
34033410
pmdp_test_and_clear_young(vma, addr, pmd + i);
34043411
goto next;
34053412
}
34063413

3414+
pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat);
3415+
if (pfn == -1)
3416+
goto next;
3417+
34073418
folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
34083419
if (!folio)
34093420
goto next;
34103421

3411-
if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
3422+
if (!pmdp_clear_young_notify(vma, addr, pmd + i))
34123423
goto next;
34133424

34143425
walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3472,25 +3483,19 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
34723483

34733484
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
34743485
if (pmd_trans_huge(val)) {
3475-
unsigned long pfn = pmd_pfn(val);
34763486
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3487+
unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat);
34773488

34783489
walk->mm_stats[MM_LEAF_TOTAL]++;
34793490

3480-
if (!pmd_young(val)) {
3481-
continue;
3482-
}
3483-
3484-
/* try to avoid unnecessary memory loads */
3485-
if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3486-
continue;
3487-
3488-
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
3491+
if (pfn != -1)
3492+
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
34893493
continue;
34903494
}
34913495
#endif
34923496

3493-
if (!walk->force_scan && should_clear_pmd_young()) {
3497+
if (!walk->force_scan && should_clear_pmd_young() &&
3498+
!mm_has_notifiers(args->mm)) {
34943499
if (!pmd_young(val))
34953500
continue;
34963501

@@ -3948,13 +3953,13 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
39483953
* the PTE table to the Bloom filter. This forms a feedback loop between the
39493954
* eviction and the aging.
39503955
*/
3951-
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
3956+
bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
39523957
{
39533958
int i;
39543959
unsigned long start;
39553960
unsigned long end;
39563961
struct lru_gen_mm_walk *walk;
3957-
int young = 0;
3962+
int young = 1;
39583963
pte_t *pte = pvmw->pte;
39593964
unsigned long addr = pvmw->address;
39603965
struct vm_area_struct *vma = pvmw->vma;
@@ -3969,19 +3974,25 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
39693974
lockdep_assert_held(pvmw->ptl);
39703975
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
39713976

3977+
if (!ptep_clear_young_notify(vma, addr, pte))
3978+
return false;
3979+
39723980
if (spin_is_contended(pvmw->ptl))
3973-
return;
3981+
return true;
39743982

39753983
/* exclude special VMAs containing anon pages from COW */
39763984
if (vma->vm_flags & VM_SPECIAL)
3977-
return;
3985+
return true;
39783986

39793987
/* avoid taking the LRU lock under the PTL when possible */
39803988
walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
39813989

39823990
start = max(addr & PMD_MASK, vma->vm_start);
39833991
end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
39843992

3993+
if (end - start == PAGE_SIZE)
3994+
return true;
3995+
39853996
if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
39863997
if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
39873998
end = start + MIN_LRU_BATCH * PAGE_SIZE;
@@ -3995,7 +4006,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
39954006

39964007
/* folio_update_gen() requires stable folio_memcg() */
39974008
if (!mem_cgroup_trylock_pages(memcg))
3998-
return;
4009+
return true;
39994010

40004011
arch_enter_lazy_mmu_mode();
40014012

@@ -4005,19 +4016,16 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
40054016
unsigned long pfn;
40064017
pte_t ptent = ptep_get(pte + i);
40074018

4008-
pfn = get_pte_pfn(ptent, vma, addr);
4019+
pfn = get_pte_pfn(ptent, vma, addr, pgdat);
40094020
if (pfn == -1)
40104021
continue;
40114022

4012-
if (!pte_young(ptent))
4013-
continue;
4014-
40154023
folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
40164024
if (!folio)
40174025
continue;
40184026

4019-
if (!ptep_test_and_clear_young(vma, addr, pte + i))
4020-
VM_WARN_ON_ONCE(true);
4027+
if (!ptep_clear_young_notify(vma, addr, pte + i))
4028+
continue;
40214029

40224030
young++;
40234031

@@ -4049,6 +4057,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
40494057
/* feedback from rmap walkers to page table walkers */
40504058
if (suitable_to_scan(i, young))
40514059
update_bloom_filter(lruvec, max_seq, pvmw->pmd);
4060+
4061+
return true;
40524062
}
40534063

40544064
/******************************************************************************

0 commit comments

Comments
 (0)