
Commit 7cf111b

hnaz authored and torvalds committed
mm: vmscan: determine anon/file pressure balance at the reclaim root
We split the LRU lists into anon and file, and we rebalance the scan pressure between them when one of them begins thrashing: if the file cache experiences workingset refaults, we increase the pressure on anonymous pages; if the workload is stalled on swapins, we increase the pressure on the file cache instead.

With cgroups and their nested LRU lists, we currently don't do this correctly. While recursive cgroup reclaim establishes a relative LRU order among the pages of all involved cgroups, LRU pressure balancing is done on an individual cgroup LRU level. As a result, when one cgroup is thrashing on the filesystem cache while a sibling may have cold anonymous pages, pressure doesn't get equalized between them.

This patch moves the LRU balancing decision to the root of reclaim - the same level where the LRU order is established.

It does this by tracking LRU cost recursively, so that every level of the cgroup tree knows the aggregate LRU cost of all memory within its domain. When the page scanner calculates the scan balance for any given individual cgroup's LRU list, it uses the values from the ancestor cgroup that initiated the reclaim cycle.

If one sibling is then thrashing on the cache, it will tip the pressure balance inside its ancestors, and the next hierarchical reclaim iteration will go more after the anon pages in the tree.

Signed-off-by: Johannes Weiner <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Cc: Joonsoo Kim <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Rik van Riel <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 314b57f commit 7cf111b
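To see how the rebalancing works arithmetically, here is a small stand-alone sketch (not part of the patch; the helper name scan_fractions() and the sample numbers are purely illustrative) that mirrors the ap/fp computation get_scan_count() performs with the root-level costs carried in struct scan_control:

#include <stdio.h>

/*
 * Illustrative user-space model of the pressure-balance formula.
 * anon_cost/file_cost stand in for sc->anon_cost/sc->file_cost, which
 * this patch snapshots from the reclaim root's lruvec in shrink_node().
 */
static void scan_fractions(unsigned long anon_cost, unsigned long file_cost,
			   unsigned long swappiness)
{
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - anon_prio;
	unsigned long totalcost = anon_cost + file_cost;
	unsigned long ap, fp;

	/* The +1 terms keep the divisions safe when no cost was recorded. */
	ap = anon_prio * (totalcost + 1) / (anon_cost + 1);
	fp = file_prio * (totalcost + 1) / (file_cost + 1);

	printf("anon weight %lu, file weight %lu\n", ap, fp);
}

int main(void)
{
	/* A sibling thrashing on the cache inflates file_cost at the root... */
	scan_fractions(100, 900, 60);
	/* ...so anon gets the larger scan weight on the next reclaim cycle. */
	scan_fractions(900, 100, 60);
	return 0;
}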

3 files changed: +57 −29 lines


include/linux/memcontrol.h

Lines changed: 13 additions & 0 deletions
@@ -1242,6 +1242,19 @@ static inline void dec_lruvec_page_state(struct page *page,
 	mod_lruvec_page_state(page, idx, -1);
 }
 
+static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = lruvec_memcg(lruvec);
+	if (!memcg)
+		return NULL;
+	memcg = parent_mem_cgroup(memcg);
+	if (!memcg)
+		return NULL;
+	return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
+}
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);

mm/swap.c

Lines changed: 27 additions & 5 deletions

@@ -282,11 +282,33 @@ void lru_note_cost(struct page *page)
 {
 	struct lruvec *lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
 
-	/* Record new data point */
-	if (page_is_file_lru(page))
-		lruvec->file_cost++;
-	else
-		lruvec->anon_cost++;
+	do {
+		unsigned long lrusize;
+
+		/* Record cost event */
+		if (page_is_file_lru(page))
+			lruvec->file_cost++;
+		else
+			lruvec->anon_cost++;
+
+		/*
+		 * Decay previous events
+		 *
+		 * Because workloads change over time (and to avoid
+		 * overflow) we keep these statistics as a floating
+		 * average, which ends up weighing recent refaults
+		 * more than old ones.
+		 */
+		lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
+			  lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
+			  lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
+			  lruvec_page_state(lruvec, NR_ACTIVE_FILE);
+
+		if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
+			lruvec->file_cost /= 2;
+			lruvec->anon_cost /= 2;
+		}
+	} while ((lruvec = parent_lruvec(lruvec)));
 }
 
 static void __activate_page(struct page *page, struct lruvec *lruvec,
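To get a feel for the decay above, here is a small stand-alone simulation (not kernel code; the LRU size and event stream are made up) of how the quarter-of-LRU cap keeps the cost counters a floating average rather than an ever-growing sum:

#include <stdio.h>

/*
 * Toy model of the decay in lru_note_cost(): once the accumulated cost
 * exceeds a quarter of the LRU size, both counters are halved, so older
 * events fade out and recent refaults dominate the balance.
 */
int main(void)
{
	unsigned long anon_cost = 0, file_cost = 0;
	unsigned long lrusize = 1000;	/* pretend anon + file LRU pages */
	int i;

	for (i = 0; i < 600; i++) {
		file_cost++;	/* a steady stream of file refaults */

		if (file_cost + anon_cost > lrusize / 4) {
			file_cost /= 2;
			anon_cost /= 2;
		}
	}
	/* file_cost oscillates in a bounded band instead of growing forever */
	printf("after 600 file events: anon_cost=%lu file_cost=%lu\n",
	       anon_cost, file_cost);
	return 0;
}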

mm/vmscan.c

Lines changed: 17 additions & 24 deletions

@@ -79,6 +79,12 @@ struct scan_control {
 	 */
 	struct mem_cgroup *target_mem_cgroup;
 
+	/*
+	 * Scan pressure balancing between anon and file LRUs
+	 */
+	unsigned long	anon_cost;
+	unsigned long	file_cost;
+
 	/* Can active pages be deactivated as part of reclaim? */
 #define DEACTIVATE_ANON 1
 #define DEACTIVATE_FILE 2
@@ -2231,10 +2237,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	int swappiness = mem_cgroup_swappiness(memcg);
 	u64 fraction[2];
 	u64 denominator = 0;	/* gcc */
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	unsigned long anon_prio, file_prio;
 	enum scan_balance scan_balance;
-	unsigned long anon, file;
 	unsigned long totalcost;
 	unsigned long ap, fp;
 	enum lru_list lru;
@@ -2285,7 +2289,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	}
 
 	scan_balance = SCAN_FRACT;
-
 	/*
 	 * Calculate the pressure balance between anon and file pages.
 	 *
@@ -2300,30 +2303,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	anon_prio = swappiness;
 	file_prio = 200 - anon_prio;
 
-	/*
-	 * Because workloads change over time (and to avoid overflow)
-	 * we keep these statistics as a floating average, which ends
-	 * up weighing recent refaults more than old ones.
-	 */
-
-	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
-	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
-
-	spin_lock_irq(&pgdat->lru_lock);
-	totalcost = lruvec->anon_cost + lruvec->file_cost;
-	if (unlikely(totalcost > (anon + file) / 4)) {
-		lruvec->anon_cost /= 2;
-		lruvec->file_cost /= 2;
-		totalcost /= 2;
-	}
+	totalcost = sc->anon_cost + sc->file_cost;
 	ap = anon_prio * (totalcost + 1);
-	ap /= lruvec->anon_cost + 1;
+	ap /= sc->anon_cost + 1;
 
 	fp = file_prio * (totalcost + 1);
-	fp /= lruvec->file_cost + 1;
-	spin_unlock_irq(&pgdat->lru_lock);
+	fp /= sc->file_cost + 1;
 
 	fraction[0] = ap;
 	fraction[1] = fp;
@@ -2687,6 +2672,14 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	nr_reclaimed = sc->nr_reclaimed;
 	nr_scanned = sc->nr_scanned;
 
+	/*
+	 * Determine the scan balance between anon and file LRUs.
+	 */
+	spin_lock_irq(&pgdat->lru_lock);
+	sc->anon_cost = target_lruvec->anon_cost;
+	sc->file_cost = target_lruvec->file_cost;
+	spin_unlock_irq(&pgdat->lru_lock);
+
 	/*
 	 * Target desirable inactive:active list ratios for the anon
 	 * and file LRU lists.