Skip to content

Commit 1b4e3f2

Browse files
gormanm authored and torvalds committed
mm: vmscan: Reduce throttling due to a failure to make progress
Mike Galbraith, Alexey Avramov and Darrick Wong all reported similar problems due to reclaim throttling for excessive lengths of time. In Alexey's case, a memory hog that should go OOM quickly stalls for several minutes before going OOM. In Mike and Darrick's cases, a small memcg environment stalled excessively even though the system had enough memory overall.

Commit 69392a4 ("mm/vmscan: throttle reclaim when no progress is being made") introduced the problem although commit a19594c ("mm/vmscan: increase the timeout if page reclaim is not making progress") made it worse. Systems at or near an OOM state that cannot be recovered must reach OOM quickly and memcg should kill tasks if a memcg is near OOM.

To address this, only stall for the first zone in the zonelist, reduce the timeout to 1 tick for VMSCAN_THROTTLE_NOPROGRESS and only stall if the scan control nr_reclaimed is 0, kswapd is still active and there were excessive pages pending for writeback. If kswapd has stopped reclaiming due to excessive failures, do not stall at all so that OOM triggers relatively quickly. Similarly, if an LRU is simply congested, only lightly throttle similar to NOPROGRESS.

Alexey's original case was the most straightforward:

    for i in {1..3}; do tail /dev/zero; done

On vanilla 5.16-rc1, this test stalled heavily; after the patch the test completes in a few seconds, similar to 5.15.

Alexey's second test case added watching a YouTube video while tail runs 10 times. On 5.15, playback only jitters slightly; 5.16-rc1 stalls a lot with lots of frames missing and numerous audio glitches. With this patch applied, the video plays similarly to 5.15.
[[email protected]: Fix W=1 build warning]
Link: https://lore.kernel.org/r/[email protected]
Link: https://lore.kernel.org/r/[email protected]
Link: https://lore.kernel.org/r/[email protected]
Link: https://lore.kernel.org/r/[email protected]
Link: https://linux-regtracking.leemhuis.info/regzbot/regression/[email protected]/
Reported-and-tested-by: Alexey Avramov <[email protected]>
Reported-and-tested-by: Mike Galbraith <[email protected]>
Reported-and-tested-by: Darrick J. Wong <[email protected]>
Reported-by: kernel test robot <[email protected]>
Acked-by: Hugh Dickins <[email protected]>
Tracked-by: Thorsten Leemhuis <[email protected]>
Fixes: 69392a4 ("mm/vmscan: throttle reclaim when no progress is being made")
Signed-off-by: Mel Gorman <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent f87bcc8 commit 1b4e3f2

File tree

3 files changed

+59
-10
lines changed

3 files changed

+59
-10
lines changed

include/linux/mmzone.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -277,6 +277,7 @@ enum vmscan_throttle_state {
277277
VMSCAN_THROTTLE_WRITEBACK,
278278
VMSCAN_THROTTLE_ISOLATED,
279279
VMSCAN_THROTTLE_NOPROGRESS,
280+
VMSCAN_THROTTLE_CONGESTED,
280281
NR_VMSCAN_THROTTLE,
281282
};
282283

include/trace/events/vmscan.h

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,12 +30,14 @@
3030
#define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK)
3131
#define _VMSCAN_THROTTLE_ISOLATED (1 << VMSCAN_THROTTLE_ISOLATED)
3232
#define _VMSCAN_THROTTLE_NOPROGRESS (1 << VMSCAN_THROTTLE_NOPROGRESS)
33+
#define _VMSCAN_THROTTLE_CONGESTED (1 << VMSCAN_THROTTLE_CONGESTED)
3334

3435
#define show_throttle_flags(flags) \
3536
(flags) ? __print_flags(flags, "|", \
3637
{_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \
3738
{_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"}, \
38-
{_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"} \
39+
{_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"}, \
40+
{_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \
3941
) : "VMSCAN_THROTTLE_NONE"
4042

4143

mm/vmscan.c

Lines changed: 55 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1021,6 +1021,39 @@ static void handle_write_error(struct address_space *mapping,
10211021
unlock_page(page);
10221022
}
10231023

1024+
static bool skip_throttle_noprogress(pg_data_t *pgdat)
1025+
{
1026+
int reclaimable = 0, write_pending = 0;
1027+
int i;
1028+
1029+
/*
1030+
* If kswapd is disabled, reschedule if necessary but do not
1031+
* throttle as the system is likely near OOM.
1032+
*/
1033+
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
1034+
return true;
1035+
1036+
/*
1037+
* If there are a lot of dirty/writeback pages then do not
1038+
* throttle as throttling will occur when the pages cycle
1039+
* towards the end of the LRU if still under writeback.
1040+
*/
1041+
for (i = 0; i < MAX_NR_ZONES; i++) {
1042+
struct zone *zone = pgdat->node_zones + i;
1043+
1044+
if (!populated_zone(zone))
1045+
continue;
1046+
1047+
reclaimable += zone_reclaimable_pages(zone);
1048+
write_pending += zone_page_state_snapshot(zone,
1049+
NR_ZONE_WRITE_PENDING);
1050+
}
1051+
if (2 * write_pending <= reclaimable)
1052+
return true;
1053+
1054+
return false;
1055+
}
1056+
10241057
void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
10251058
{
10261059
wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
@@ -1056,8 +1089,16 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
10561089
}
10571090

10581091
break;
1092+
case VMSCAN_THROTTLE_CONGESTED:
1093+
fallthrough;
10591094
case VMSCAN_THROTTLE_NOPROGRESS:
1060-
timeout = HZ/2;
1095+
if (skip_throttle_noprogress(pgdat)) {
1096+
cond_resched();
1097+
return;
1098+
}
1099+
1100+
timeout = 1;
1101+
10611102
break;
10621103
case VMSCAN_THROTTLE_ISOLATED:
10631104
timeout = HZ/50;
@@ -3321,7 +3362,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
33213362
if (!current_is_kswapd() && current_may_throttle() &&
33223363
!sc->hibernation_mode &&
33233364
test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
3324-
reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
3365+
reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
33253366

33263367
if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
33273368
sc))
@@ -3386,16 +3427,16 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
33863427
}
33873428

33883429
/*
3389-
* Do not throttle kswapd on NOPROGRESS as it will throttle on
3390-
* VMSCAN_THROTTLE_WRITEBACK if there are too many pages under
3391-
* writeback and marked for immediate reclaim at the tail of
3392-
* the LRU.
3430+
* Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
3431+
* throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
3432+
* under writeback and marked for immediate reclaim at the tail of the
3433+
* LRU.
33933434
*/
3394-
if (current_is_kswapd())
3435+
if (current_is_kswapd() || cgroup_reclaim(sc))
33953436
return;
33963437

33973438
/* Throttle if making no progress at high priorities. */
3398-
if (sc->priority < DEF_PRIORITY - 2)
3439+
if (sc->priority == 1 && !sc->nr_reclaimed)
33993440
reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
34003441
}
34013442

@@ -3415,6 +3456,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
34153456
unsigned long nr_soft_scanned;
34163457
gfp_t orig_mask;
34173458
pg_data_t *last_pgdat = NULL;
3459+
pg_data_t *first_pgdat = NULL;
34183460

34193461
/*
34203462
* If the number of buffer_heads in the machine exceeds the maximum
@@ -3478,14 +3520,18 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
34783520
/* need some check for avoid more shrink_zone() */
34793521
}
34803522

3523+
if (!first_pgdat)
3524+
first_pgdat = zone->zone_pgdat;
3525+
34813526
/* See comment about same check for global reclaim above */
34823527
if (zone->zone_pgdat == last_pgdat)
34833528
continue;
34843529
last_pgdat = zone->zone_pgdat;
34853530
shrink_node(zone->zone_pgdat, sc);
3486-
consider_reclaim_throttle(zone->zone_pgdat, sc);
34873531
}
34883532

3533+
consider_reclaim_throttle(first_pgdat, sc);
3534+
34893535
/*
34903536
* Restore to original mask to avoid the impact on the caller if we
34913537
* promoted it to __GFP_HIGHMEM.

0 commit comments

Comments
 (0)