
Commit 314b57f

hnaz authored and torvalds committed
mm: balance LRU lists based on relative thrashing
Since the LRUs were split into anon and file lists, the VM has been balancing between page cache and anonymous pages based on per-list ratios of scanned vs. rotated pages. In most cases that tips page reclaim towards the list that is easier to reclaim and has the fewest actively used pages, but there are a few problems with it:

1. Refaults and LRU rotations are weighted the same way, even though one costs IO and the other costs a bit of CPU.

2. The less we scan an LRU list based on already observed rotations, the more we increase the sampling interval for new references, and rotations become even more likely on that list. This can enter a death spiral in which we stop looking at one list completely until the other one is all but annihilated by page reclaim.

Since commit a528910 ("mm: thrash detection-based file cache sizing") we have refault detection for the page cache. Along with swapin events, they are good indicators of when the file or anon list, respectively, is too small for its workingset and needs to grow.

For example, if the page cache is thrashing, the cache pages need more time in memory, while there may be colder pages on the anonymous list. Likewise, if swapped pages are faulting back in, it indicates that we reclaim anonymous pages too aggressively and should back off.

Replace LRU rotations with refaults and swapins as the basis for relative reclaim cost of the two LRUs. This will have the VM target list balances that incur the least amount of IO on aggregate.

Signed-off-by: Johannes Weiner <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Cc: Joonsoo Kim <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Rik van Riel <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 264e90c commit 314b57f

5 files changed: 27 additions, 35 deletions

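Conceptually the change described above has two halves: refault and swapin events charge a cost to the list they belong to (the new lru_note_cost()), and reclaim then splits scan pressure between the lists inversely proportional to those costs, weighted by swappiness (get_scan_count()). The standalone C sketch below models only that arithmetic; it is not the kernel code, and the names costs, note_cost and scan_fractions are invented for illustration.

#include <stdio.h>

struct costs {
        unsigned long anon_cost;        /* swapin events seen on the anon list */
        unsigned long file_cost;        /* refault events seen on the file list */
};

/* Charge one IO event to the list a refaulting or swapped-in page belongs to. */
static void note_cost(struct costs *c, int is_file)
{
        if (is_file)
                c->file_cost++;
        else
                c->anon_cost++;
}

/*
 * Split scan pressure between the two lists: pressure on a list is
 * inversely proportional to its observed cost, weighted by the relative
 * IO cost encoded in swappiness (anon_prio = swappiness,
 * file_prio = 200 - swappiness), following the shape of get_scan_count().
 */
static void scan_fractions(const struct costs *c, unsigned long swappiness,
                           unsigned long *ap, unsigned long *fp)
{
        unsigned long total = c->anon_cost + c->file_cost;

        *ap = swappiness * (total + 1) / (c->anon_cost + 1);
        *fp = (200 - swappiness) * (total + 1) / (c->file_cost + 1);
}

int main(void)
{
        struct costs c = { 0, 0 };
        unsigned long ap, fp;
        int i;

        /* Pretend the file list refaults three times as hard as anon swaps in. */
        for (i = 0; i < 100; i++)
                note_cost(&c, 0);       /* swapins */
        for (i = 0; i < 300; i++)
                note_cost(&c, 1);       /* refaults */

        scan_fractions(&c, 60, &ap, &fp);
        printf("anon share of scan pressure: %lu%%\n", 100 * ap / (ap + fp));
        printf("file share of scan pressure: %lu%%\n", 100 * fp / (ap + fp));
        return 0;
}

With the file list refaulting three times as hard as the anon list is swapping in, the sketch reports an anon scan share of about 56% even at swappiness 60, which is the intended behaviour: pressure moves away from the list whose pages keep coming back at IO cost.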

include/linux/swap.h

Lines changed: 1 addition & 2 deletions
@@ -334,8 +334,7 @@ extern unsigned long nr_free_pagecache_pages(void);
 
 
 /* linux/mm/swap.c */
-extern void lru_note_cost(struct lruvec *lruvec, bool file,
-                          unsigned int nr_pages);
+extern void lru_note_cost(struct page *);
 extern void lru_cache_add(struct page *);
 extern void lru_add_page_tail(struct page *page, struct page *page_tail,
                               struct lruvec *lruvec, struct list_head *head);

mm/swap.c

Lines changed: 7 additions & 4 deletions
@@ -278,12 +278,15 @@ void rotate_reclaimable_page(struct page *page)
         }
 }
 
-void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
+void lru_note_cost(struct page *page)
 {
-        if (file)
-                lruvec->file_cost += nr_pages;
+        struct lruvec *lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+
+        /* Record new data point */
+        if (page_is_file_lru(page))
+                lruvec->file_cost++;
         else
-                lruvec->anon_cost += nr_pages;
+                lruvec->anon_cost++;
 }
 
 static void __activate_page(struct page *page, struct lruvec *lruvec,

mm/swap_state.c

Lines changed: 5 additions & 0 deletions
@@ -440,6 +440,11 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                 goto fail_unlock;
         }
 
+        /* XXX: Move to lru_cache_add() when it supports new vs putback */
+        spin_lock_irq(&page_pgdat(page)->lru_lock);
+        lru_note_cost(page);
+        spin_unlock_irq(&page_pgdat(page)->lru_lock);
+
         /* Caller will initiate read into locked page */
         SetPageWorkingset(page);
         lru_cache_add(page);

mm/vmscan.c

Lines changed: 10 additions & 29 deletions
@@ -1958,12 +1958,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         move_pages_to_lru(lruvec, &page_list);
 
         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-        /*
-         * Rotating pages costs CPU without actually
-         * progressing toward the reclaim goal.
-         */
-        lru_note_cost(lruvec, 0, stat.nr_activate[0]);
-        lru_note_cost(lruvec, 1, stat.nr_activate[1]);
         item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
         if (!cgroup_reclaim(sc))
                 __count_vm_events(item, nr_reclaimed);

@@ -2079,11 +2073,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
          * Move pages back to the lru list.
          */
         spin_lock_irq(&pgdat->lru_lock);
-        /*
-         * Rotating pages costs CPU without actually
-         * progressing toward the reclaim goal.
-         */
-        lru_note_cost(lruvec, file, nr_rotated);
 
         nr_activate = move_pages_to_lru(lruvec, &l_active);
         nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);

@@ -2298,22 +2287,23 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         scan_balance = SCAN_FRACT;
 
         /*
-         * With swappiness at 100, anonymous and file have the same priority.
-         * This scanning priority is essentially the inverse of IO cost.
+         * Calculate the pressure balance between anon and file pages.
+         *
+         * The amount of pressure we put on each LRU is inversely
+         * proportional to the cost of reclaiming each list, as
+         * determined by the share of pages that are refaulting, times
+         * the relative IO cost of bringing back a swapped out
+         * anonymous page vs reloading a filesystem page (swappiness).
+         *
+         * With swappiness at 100, anon and file have equal IO cost.
          */
         anon_prio = swappiness;
         file_prio = 200 - anon_prio;
 
         /*
-         * OK, so we have swap space and a fair amount of page cache
-         * pages. We use the recently rotated / recently scanned
-         * ratios to determine how valuable each cache is.
-         *
          * Because workloads change over time (and to avoid overflow)
          * we keep these statistics as a floating average, which ends
-         * up weighing recent references more than old ones.
-         *
-         * anon in [0], file in [1]
+         * up weighing recent refaults more than old ones.
          */
 
         anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +

@@ -2328,15 +2318,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                 lruvec->file_cost /= 2;
                 totalcost /= 2;
         }
-
-        /*
-         * The amount of pressure on anon vs file pages is inversely
-         * proportional to the assumed cost of reclaiming each list,
-         * as determined by the share of pages that are likely going
-         * to refault or rotate on each list (recently referenced),
-         * times the relative IO cost of bringing back a swapped out
-         * anonymous page vs reloading a filesystem page (swappiness).
-         */
         ap = anon_prio * (totalcost + 1);
         ap /= lruvec->anon_cost + 1;

mm/workingset.c

Lines changed: 4 additions & 0 deletions
@@ -365,6 +365,10 @@ void workingset_refault(struct page *page, void *shadow)
         /* Page was active prior to eviction */
         if (workingset) {
                 SetPageWorkingset(page);
+                /* XXX: Move to lru_cache_add() when it supports new vs putback */
+                spin_lock_irq(&page_pgdat(page)->lru_lock);
+                lru_note_cost(page);
+                spin_unlock_irq(&page_pgdat(page)->lru_lock);
                 inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
         }
 out:
