
Commit 5168a68

ryncsn authored and akpm00 committed
mm, swap: avoid over reclaim of full clusters
When running low on usable slots, the cluster allocator will try to reclaim full clusters aggressively to free HAS_CACHE slots. This guarantees that as long as there are any usable slots, HAS_CACHE or not, the swap device stays usable and the workload won't go OOM early.

Before the cluster allocator, the swap allocator failed easily if the device was filled up with reclaimable HAS_CACHE slots, which can be reproduced with the following simple program:

    #include <stdio.h>
    #include <string.h>
    #include <linux/mman.h>
    #include <sys/mman.h>
    #define SIZE 8192UL * 1024UL * 1024UL
    int main(int argc, char **argv) {
            long tmp;
            char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            memset(p, 0, SIZE);
            madvise(p, SIZE, MADV_PAGEOUT);
            for (unsigned long i = 0; i < SIZE; ++i)
                    tmp += p[i];
            getchar(); /* Pause */
            return 0;
    }

Set up an 8G non-ramdisk swap; the first run of the program will swap out 8G of RAM successfully. But if the same program is run again while the first run is paused, the second run can't swap out all 8G of memory, as half of the swap device is now pinned by HAS_CACHE. There was a random scan in the old allocator that could reclaim part of the HAS_CACHE by luck, but it was unreliable.

The new allocator added a reclaim of full clusters when the device is low on usable slots. But when multiple CPUs see that the device is low on usable slots at the same time, they run into a thundering herd problem. This is an observable problem on a large machine with a massively parallel workload, as full cluster reclaim is slower on a large swap device and a higher number of CPUs makes things worse.

Testing with a 128G ZRAM on a 48c96t system: when the swap device is very close to full (e.g. 124G / 128G), running a Linux kernel build with make -j96 in a 1G memory cgroup hangs (not a softlockup, though), spinning in full cluster reclaim for about 5 min before going OOM.

To solve this, split the full reclaim into two parts:

- Instead of doing a synchronous, aggressive reclaim when the device is low, do only one aggressive reclaim, via a kworker, when the device is strictly full. This still ensures that in the worst case the device won't become unusable because of HAS_CACHE slots.

- To avoid allocations (especially higher order) suffering from HAS_CACHE filling up clusters while the kworker is not responsive enough, do one synchronous scan every time the free list is drained, and only scan one cluster. This is similar to the random reclaim before, keeps the full clusters rotated, and has minimal latency. This should provide a fair reclaim strategy suitable for most workloads.

Link: https://lkml.kernel.org/r/[email protected]
Fixes: 2cacbdf ("mm: swap: add a adaptive full cluster cache reclaim")
Signed-off-by: Kairui Song <[email protected]>
Cc: Barry Song <[email protected]>
Cc: Chris Li <[email protected]>
Cc: "Huang, Ying" <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Kalesh Singh <[email protected]>
Cc: Ryan Roberts <[email protected]>
Cc: Yosry Ahmed <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
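Not part of the patch: as a rough way to observe the effect the commit message describes, a minimal userspace sketch that prints total and free swap via sysinfo(2). The file name swapstat.c and the MiB formatting are illustrative assumptions, not from the commit; run it before and after pausing the first reproducer instance to see how much of the device stays pinned by HAS_CACHE.

    /* swapstat.c - hypothetical helper, not part of this patch.
     * Prints total and free swap so the HAS_CACHE pinning described
     * above can be observed while the reproducer is paused. */
    #include <stdio.h>
    #include <sys/sysinfo.h>

    int main(void)
    {
            struct sysinfo info;

            if (sysinfo(&info) != 0) {
                    perror("sysinfo");
                    return 1;
            }
            /* All size fields are in units of info.mem_unit bytes. */
            printf("SwapTotal: %llu MiB\n",
                   (unsigned long long)info.totalswap * info.mem_unit >> 20);
            printf("SwapFree:  %llu MiB\n",
                   (unsigned long long)info.freeswap * info.mem_unit >> 20);
            return 0;
    }

Compile with "cc -o swapstat swapstat.c" and compare the output between the first and second runs of the reproducer.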
1 parent b54e1bf commit 5168a68

File tree

2 files changed: +31 −19 lines


include/linux/swap.h

Lines changed: 1 addition & 0 deletions
@@ -335,6 +335,7 @@ struct swap_info_struct {
 					 * list.
 					 */
 	struct work_struct discard_work; /* discard worker */
+	struct work_struct reclaim_work; /* reclaim worker */
 	struct list_head discard_clusters; /* discard clusters list */
 	struct plist_node avail_lists[]; /*
 					   * entries in swap_avail_heads, one

mm/swapfile.c

Lines changed: 30 additions & 19 deletions
@@ -731,15 +731,16 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne
 	return offset;
 }
 
-static void swap_reclaim_full_clusters(struct swap_info_struct *si)
+/* Return true if reclaimed a whole cluster */
+static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
 {
 	long to_scan = 1;
 	unsigned long offset, end;
 	struct swap_cluster_info *ci;
 	unsigned char *map = si->swap_map;
-	int nr_reclaim, total_reclaimed = 0;
+	int nr_reclaim;
 
-	if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER)
+	if (force)
 		to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
 
 	while (!list_empty(&si->full_clusters)) {
@@ -749,28 +750,36 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si)
 		end = min(si->max, offset + SWAPFILE_CLUSTER);
 		to_scan--;
 
+		spin_unlock(&si->lock);
 		while (offset < end) {
 			if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) {
-				spin_unlock(&si->lock);
 				nr_reclaim = __try_to_reclaim_swap(si, offset,
 								   TTRS_ANYWAY | TTRS_DIRECT);
-				spin_lock(&si->lock);
-				if (nr_reclaim > 0) {
-					offset += nr_reclaim;
-					total_reclaimed += nr_reclaim;
-					continue;
-				} else if (nr_reclaim < 0) {
-					offset += -nr_reclaim;
+				if (nr_reclaim) {
+					offset += abs(nr_reclaim);
 					continue;
 				}
 			}
 			offset++;
 		}
-		if (to_scan <= 0 || total_reclaimed)
+		spin_lock(&si->lock);
+
+		if (to_scan <= 0)
 			break;
 	}
 }
 
+static void swap_reclaim_work(struct work_struct *work)
+{
+	struct swap_info_struct *si;
+
+	si = container_of(work, struct swap_info_struct, reclaim_work);
+
+	spin_lock(&si->lock);
+	swap_reclaim_full_clusters(si, true);
+	spin_unlock(&si->lock);
+}
+
 /*
  * Try to get swap entries with specified order from current cpu's swap entry
  * pool (a cluster). This might involve allocating a new cluster for current CPU
@@ -800,6 +809,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 		goto done;
 	}
 
+	/* Try reclaim from full clusters if free clusters list is drained */
+	if (vm_swap_full())
+		swap_reclaim_full_clusters(si, false);
+
 	if (order < PMD_ORDER) {
 		unsigned int frags = 0;
 
@@ -881,13 +894,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 	}
 
 done:
-	/* Try reclaim from full clusters if device is nearfull */
-	if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) {
-		swap_reclaim_full_clusters(si);
-		if (!found && !order && si->pages != si->inuse_pages)
-			goto new_cluster;
-	}
-
 	cluster->next[order] = offset;
 	return found;
 }
@@ -922,6 +928,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
 		si->lowest_bit = si->max;
 		si->highest_bit = 0;
 		del_from_avail_list(si);
+
+		if (vm_swap_full())
+			schedule_work(&si->reclaim_work);
 	}
 }
 
@@ -2816,6 +2825,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	wait_for_completion(&p->comp);
 
 	flush_work(&p->discard_work);
+	flush_work(&p->reclaim_work);
 
 	destroy_swap_extents(p);
 	if (p->flags & SWP_CONTINUED)
@@ -3376,6 +3386,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		return PTR_ERR(si);
 
 	INIT_WORK(&si->discard_work, swap_discard_work);
+	INIT_WORK(&si->reclaim_work, swap_reclaim_work);
 
 	name = getname(specialfile);
 	if (IS_ERR(name)) {
