Commit 30d77b7

Yu Zhao authored and akpm00 committed
mm/mglru: fix ineffective protection calculation
mem_cgroup_calculate_protection() is not stateless and should only be
used as part of a top-down tree traversal. shrink_one() traverses the
per-node memcg LRU instead of the root_mem_cgroup tree, and therefore
it should not call mem_cgroup_calculate_protection().

The existing misuse in shrink_one() can cause ineffective protection
of sub-trees that are grandchildren of root_mem_cgroup. Fix it by
reusing lru_gen_age_node(), which already traverses the
root_mem_cgroup tree, to calculate the protection.

Previously lru_gen_age_node() opportunistically skipped the first
pass, i.e., when scan_control->priority is DEF_PRIORITY. On the
second pass, lruvec_is_sizable() used the appropriate
scan_control->priority, set by set_initial_priority() from
lru_gen_shrink_node(), to decide whether a memcg is too small to
reclaim from.

Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup
tree. So it should call set_initial_priority() upfront, to make sure
lruvec_is_sizable() uses the appropriate scan_control->priority on
the first pass. Otherwise, lruvec_is_reclaimable() can return false
negatives and result in premature OOM kills when min_ttl_ms is used.

Link: https://lkml.kernel.org/r/[email protected]
Fixes: e4dde56 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <[email protected]>
Reported-by: T.J. Mercier <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
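Why traversal order matters here: below is a minimal schematic model of the protection calculation. It is an illustration only, under the assumption that a memcg's effective protection is derived from its parent's; the struct and helper are hypothetical stand-ins, not the kernel's actual code.

#include <stdio.h>

/* Hypothetical model: each cgroup's effective protection (elow) is
 * capped by its parent's, so a node's value is only meaningful after
 * its parent's has been computed, i.e., during a top-down traversal. */
struct cgroup {
	struct cgroup *parent;
	unsigned long low;	/* configured memory.low */
	unsigned long elow;	/* derived effective protection */
};

static void calculate_protection(struct cgroup *c)
{
	unsigned long cap = c->parent ? c->parent->elow : c->low;

	c->elow = c->low < cap ? c->low : cap;	/* depends on parent state */
}

int main(void)
{
	struct cgroup root = { .parent = NULL, .low = 100 };
	struct cgroup child = { .parent = &root, .low = 40 };
	struct cgroup grandchild = { .parent = &child, .low = 80 };

	/* pre-order: parents before children, as mem_cgroup_iter() walks */
	calculate_protection(&root);
	calculate_protection(&child);
	calculate_protection(&grandchild);

	printf("grandchild elow = %lu\n", grandchild.elow);	/* 40 */
	return 0;
}

Visiting a grandchild before its parent, as the per-node memcg LRU order can, would read stale parent state; the pre-order walk done by mem_cgroup_iter() in lru_gen_age_node() avoids that.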
1 parent b749cb0 commit 30d77b7

File tree

1 file changed: +38 -44 lines changed


mm/vmscan.c

Lines changed: 38 additions & 44 deletions
@@ -3915,6 +3915,32 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
  * working set protection
  ******************************************************************************/
 
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+	int priority;
+	unsigned long reclaimable;
+
+	if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+		return;
+	/*
+	 * Determine the initial priority based on
+	 * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+	 * where reclaimed_to_scanned_ratio = inactive / total.
+	 */
+	reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+	if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
+		reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+	/* round down reclaimable and round up sc->nr_to_reclaim */
+	priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+	/*
+	 * The estimation is based on LRU pages only, so cap it to prevent
+	 * overshoots of shrinker objects by large margins.
+	 */
+	sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
+}
+
 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
 {
 	int gen, type, zone;
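A quick sanity check of the formula in the hunk above, as a hedged userspace sketch: fls_long() is re-implemented with a compiler builtin (assuming a 64-bit unsigned long), and the sample workload numbers are illustrative rather than taken from the patch. With 2^18 reclaimable pages and a 512-page target, the priority works out to 18 - 9 = 9, inside the clamp range [DEF_PRIORITY / 2, DEF_PRIORITY] = [6, 12], so roughly total >> 9 pages would be scanned on the first pass.

#include <stdio.h>

#define DEF_PRIORITY 12

/* userspace stand-in for the kernel's fls_long(): 1-based index of
 * the most significant set bit, 0 when the input is 0 */
static int fls_long(unsigned long x)
{
	return x ? 64 - __builtin_clzl(x) : 0;
}

int main(void)
{
	unsigned long reclaimable = 1UL << 18;	/* sample: 2^18 inactive pages */
	unsigned long nr_to_reclaim = 512;	/* sample: 2^9 pages */

	/* round down reclaimable and round up nr_to_reclaim */
	int priority = fls_long(reclaimable) - 1 - fls_long(nr_to_reclaim - 1);

	/* clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY) */
	if (priority < DEF_PRIORITY / 2)
		priority = DEF_PRIORITY / 2;
	if (priority > DEF_PRIORITY)
		priority = DEF_PRIORITY;

	printf("initial priority = %d\n", priority);	/* prints 9 */
	return 0;
}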
@@ -3948,19 +3974,17 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	DEFINE_MIN_SEQ(lruvec);
 
-	/* see the comment on lru_gen_folio */
-	gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
-	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
-	if (time_is_after_jiffies(birth + min_ttl))
+	if (mem_cgroup_below_min(NULL, memcg))
 		return false;
 
 	if (!lruvec_is_sizable(lruvec, sc))
 		return false;
 
-	mem_cgroup_calculate_protection(NULL, memcg);
+	/* see the comment on lru_gen_folio */
+	gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
 
-	return !mem_cgroup_below_min(NULL, memcg);
+	return time_is_before_jiffies(birth + min_ttl);
 }
 
 /* to protect the working set of the last N jiffies */
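On the inverted predicate in this hunk: lruvec_is_reclaimable() now reports reclaimability instead of protection, and in include/linux/jiffies.h time_is_before_jiffies(a) is defined as time_after(jiffies, a), so the new return is true once the min_seq generation's birth is more than min_ttl jiffies in the past. A minimal userspace model of the wrap-safe comparison (sample values are illustrative):

#include <stdio.h>

/* model of the kernel's wrap-safe time_after(a, b): true when a is
 * later than b, correct across counter wraparound */
static int time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

int main(void)
{
	unsigned long jiffies = 1000;		/* illustrative clock value */
	unsigned long birth = 200, min_ttl = 500;

	/* time_is_before_jiffies(birth + min_ttl) */
	printf("%d\n", time_after(jiffies, birth + min_ttl));	/* 1: TTL expired */
	return 0;
}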
@@ -3970,31 +3994,28 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc
 {
 	struct mem_cgroup *memcg;
 	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+	bool reclaimable = !min_ttl;
 
 	VM_WARN_ON_ONCE(!current_is_kswapd());
 
-	/* check the order to exclude compaction-induced reclaim */
-	if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
-		return;
+	set_initial_priority(pgdat, sc);
 
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
-		if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
-			mem_cgroup_iter_break(NULL, memcg);
-			return;
-		}
+		mem_cgroup_calculate_protection(NULL, memcg);
 
-		cond_resched();
+		if (!reclaimable)
+			reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
 
 	/*
 	 * The main goal is to OOM kill if every generation from all memcgs is
 	 * younger than min_ttl. However, another possibility is all memcgs are
 	 * either too small or below min.
 	 */
-	if (mutex_trylock(&oom_lock)) {
+	if (!reclaimable && mutex_trylock(&oom_lock)) {
 		struct oom_control oc = {
 			.gfp_mask = sc->gfp_mask,
 		};
@@ -4786,8 +4807,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
-	mem_cgroup_calculate_protection(NULL, memcg);
-
+	/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
 	if (mem_cgroup_below_min(NULL, memcg))
 		return MEMCG_LRU_YOUNG;
 
@@ -4911,32 +4931,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 	blk_finish_plug(&plug);
 }
 
-static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
-{
-	int priority;
-	unsigned long reclaimable;
-
-	if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
-		return;
-	/*
-	 * Determine the initial priority based on
-	 * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
-	 * where reclaimed_to_scanned_ratio = inactive / total.
-	 */
-	reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
-	if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
-		reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
-
-	/* round down reclaimable and round up sc->nr_to_reclaim */
-	priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
-
-	/*
-	 * The estimation is based on LRU pages only, so cap it to prevent
-	 * overshoots of shrinker objects by large margins.
-	 */
-	sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
-}
-
 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
 	struct blk_plug plug;
