Skip to content

Commit cf11e85

Browse files
rgushchin authored and torvalds committed
mm: hugetlb: optionally allocate gigantic hugepages using cma
Commit 944d9fe ("hugetlb: add support for gigantic page allocation at runtime") has added the run-time allocation of gigantic pages. However it actually works only at early stages of the system loading, when the majority of memory is free. After some time the memory gets fragmented by non-movable pages, so the chances to find a contiguous 1GB block are getting close to zero. Even dropping caches manually doesn't help a lot. At large scale rebooting servers in order to allocate gigantic hugepages is quite expensive and complex. At the same time keeping some constant percentage of memory in reserved hugepages even if the workload isn't using it is a big waste: not all workloads can benefit from using 1 GB pages. The following solution can solve the problem: 1) On boot time a dedicated cma area* is reserved. The size is passed as a kernel argument. 2) Run-time allocations of gigantic hugepages are performed using the cma allocator and the dedicated cma area In this case gigantic hugepages can be allocated successfully with a high probability, however the memory isn't completely wasted if nobody is using 1GB hugepages: it can be used for pagecache, anon memory, THPs, etc. * On a multi-node machine a per-node cma area is allocated on each node. Following gigantic hugetlb allocation are using the first available numa node if the mask isn't specified by a user. Usage: 1) configure the kernel to allocate a cma area for hugetlb allocations: pass hugetlb_cma=10G as a kernel argument 2) allocate hugetlb pages as usual, e.g. echo 10 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages If the option isn't enabled or the allocation of the cma area failed, the current behavior of the system is preserved. x86 and arm-64 are covered by this patch, other architectures can be trivially added later. The patch contains clean-ups and fixes proposed and implemented by Aslan Bakirov and Randy Dunlap. 
It also contains ideas and suggestions proposed by Rik van Riel, Michal Hocko and Mike Kravetz. Thanks! Signed-off-by: Roman Gushchin <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Tested-by: Andreas Schaufler <[email protected]> Acked-by: Mike Kravetz <[email protected]> Acked-by: Michal Hocko <[email protected]> Cc: Aslan Bakirov <[email protected]> Cc: Randy Dunlap <[email protected]> Cc: Rik van Riel <[email protected]> Cc: Joonsoo Kim <[email protected]> Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Linus Torvalds <[email protected]>
1 parent 8676af1 commit cf11e85

File tree

5 files changed

+139
-0
lines changed

5 files changed

+139
-0
lines changed

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1475,6 +1475,14 @@
14751475
hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET
14761476
registers. Default set by CONFIG_HPET_MMAP_DEFAULT.
14771477

1478+
hugetlb_cma= [HW] The size of a cma area used for allocation
1479+
of gigantic hugepages.
1480+
Format: nn[KMGTPE]
1481+
1482+
Reserve a cma area of given size and allocate gigantic
1483+
hugepages using the cma allocator. If enabled, the
1484+
boot-time allocation of gigantic hugepages is skipped.
1485+
14781486
hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
14791487
hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
14801488
On x86-64 and powerpc, this option can be specified

arch/arm64/mm/init.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <linux/mm.h>
3030
#include <linux/kexec.h>
3131
#include <linux/crash_dump.h>
32+
#include <linux/hugetlb.h>
3233

3334
#include <asm/boot.h>
3435
#include <asm/fixmap.h>
@@ -457,6 +458,11 @@ void __init arm64_memblock_init(void)
457458
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
458459

459460
dma_contiguous_reserve(arm64_dma32_phys_limit);
461+
462+
#ifdef CONFIG_ARM64_4K_PAGES
463+
hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
464+
#endif
465+
460466
}
461467

462468
void __init bootmem_init(void)

arch/x86/kernel/setup.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <linux/pci.h>
1717
#include <linux/root_dev.h>
1818
#include <linux/sfi.h>
19+
#include <linux/hugetlb.h>
1920
#include <linux/tboot.h>
2021
#include <linux/usb/xhci-dbgp.h>
2122

@@ -1157,6 +1158,9 @@ void __init setup_arch(char **cmdline_p)
11571158
initmem_init();
11581159
dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
11591160

1161+
if (boot_cpu_has(X86_FEATURE_GBPAGES))
1162+
hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
1163+
11601164
/*
11611165
* Reserve memory for crash kernel after SRAT is parsed so that it
11621166
* won't consume hotpluggable memory.

include/linux/hugetlb.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -895,4 +895,16 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
895895
return ptl;
896896
}
897897

898+
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
899+
extern void __init hugetlb_cma_reserve(int order);
900+
extern void __init hugetlb_cma_check(void);
901+
#else
902+
static inline __init void hugetlb_cma_reserve(int order)
903+
{
904+
}
905+
static inline __init void hugetlb_cma_check(void)
906+
{
907+
}
908+
#endif
909+
898910
#endif /* _LINUX_HUGETLB_H */

mm/hugetlb.c

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <linux/jhash.h>
2929
#include <linux/numa.h>
3030
#include <linux/llist.h>
31+
#include <linux/cma.h>
3132

3233
#include <asm/page.h>
3334
#include <asm/pgtable.h>
@@ -44,6 +45,9 @@
4445
int hugetlb_max_hstate __read_mostly;
4546
unsigned int default_hstate_idx;
4647
struct hstate hstates[HUGE_MAX_HSTATE];
48+
49+
static struct cma *hugetlb_cma[MAX_NUMNODES];
50+
4751
/*
4852
* Minimum page order among possible hugepage sizes, set to a proper value
4953
* at boot time.
@@ -1228,6 +1232,14 @@ static void destroy_compound_gigantic_page(struct page *page,
12281232

12291233
static void free_gigantic_page(struct page *page, unsigned int order)
12301234
{
1235+
/*
1236+
* If the page isn't allocated using the cma allocator,
1237+
* cma_release() returns false.
1238+
*/
1239+
if (IS_ENABLED(CONFIG_CMA) &&
1240+
cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
1241+
return;
1242+
12311243
free_contig_range(page_to_pfn(page), 1 << order);
12321244
}
12331245

@@ -1237,6 +1249,21 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
12371249
{
12381250
unsigned long nr_pages = 1UL << huge_page_order(h);
12391251

1252+
if (IS_ENABLED(CONFIG_CMA)) {
1253+
struct page *page;
1254+
int node;
1255+
1256+
for_each_node_mask(node, *nodemask) {
1257+
if (!hugetlb_cma[node])
1258+
continue;
1259+
1260+
page = cma_alloc(hugetlb_cma[node], nr_pages,
1261+
huge_page_order(h), true);
1262+
if (page)
1263+
return page;
1264+
}
1265+
}
1266+
12401267
return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
12411268
}
12421269

@@ -1281,8 +1308,14 @@ static void update_and_free_page(struct hstate *h, struct page *page)
12811308
set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
12821309
set_page_refcounted(page);
12831310
if (hstate_is_gigantic(h)) {
1311+
/*
1312+
* Temporarily drop the hugetlb_lock, because
1313+
* we might block in free_gigantic_page().
1314+
*/
1315+
spin_unlock(&hugetlb_lock);
12841316
destroy_compound_gigantic_page(page, huge_page_order(h));
12851317
free_gigantic_page(page, huge_page_order(h));
1318+
spin_lock(&hugetlb_lock);
12861319
} else {
12871320
__free_pages(page, huge_page_order(h));
12881321
}
@@ -2539,6 +2572,10 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
25392572

25402573
for (i = 0; i < h->max_huge_pages; ++i) {
25412574
if (hstate_is_gigantic(h)) {
2575+
if (IS_ENABLED(CONFIG_CMA) && hugetlb_cma[0]) {
2576+
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
2577+
break;
2578+
}
25422579
if (!alloc_bootmem_huge_page(h))
25432580
break;
25442581
} else if (!alloc_pool_huge_page(h,
@@ -3194,6 +3231,7 @@ static int __init hugetlb_init(void)
31943231
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
31953232
}
31963233

3234+
hugetlb_cma_check();
31973235
hugetlb_init_hstates();
31983236
gather_bootmem_prealloc();
31993237
report_hugepages();
@@ -5506,3 +5544,74 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
55065544
spin_unlock(&hugetlb_lock);
55075545
}
55085546
}
5547+
5548+
#ifdef CONFIG_CMA
/* Total CMA size requested via the "hugetlb_cma=" boot parameter (bytes). */
static unsigned long hugetlb_cma_size __initdata;
/* Set by hugetlb_cma_reserve() so hugetlb_cma_check() can detect arches
 * that never call it and warn that the option is unsupported there. */
static bool cma_reserve_called __initdata;

/*
 * Parse the "hugetlb_cma=nn[KMGTPE]" early parameter into
 * hugetlb_cma_size. The actual reservation happens later, from
 * arch code, via hugetlb_cma_reserve().
 */
static int __init cmdline_parse_hugetlb_cma(char *p)
{
	hugetlb_cma_size = memparse(p, &p);
	return 0;
}

early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
5559+
5560+
/*
 * hugetlb_cma_reserve - reserve CMA areas for run-time gigantic page
 * allocation, spread across all online NUMA nodes.
 * @order: order of a gigantic page on this architecture
 *         (e.g. PUD_SHIFT - PAGE_SHIFT for 1 GB pages).
 *
 * Called from arch early boot code, after memblock is up but before the
 * page allocator takes over. Does nothing unless "hugetlb_cma=" was
 * given on the command line. Failures are logged and non-fatal: the
 * system simply falls back to the non-CMA gigantic-page paths.
 */
void __init hugetlb_cma_reserve(int order)
{
	unsigned long size, reserved, per_node;
	int nid;

	/* Record the call even if nothing is reserved, for hugetlb_cma_check(). */
	cma_reserve_called = true;

	if (!hugetlb_cma_size)
		return;

	/* The area must hold at least one gigantic page. */
	if (hugetlb_cma_size < (PAGE_SIZE << order)) {
		pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
			(PAGE_SIZE << order) / SZ_1M);
		return;
	}

	/*
	 * If 3 GB area is requested on a machine with 4 numa nodes,
	 * let's allocate 1 GB on first three nodes and ignore the last one.
	 */
	per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
	pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
		hugetlb_cma_size / SZ_1M, per_node / SZ_1M);

	reserved = 0;
	for_each_node_state(nid, N_ONLINE) {
		int res;

		size = min(per_node, hugetlb_cma_size - reserved);
		/* cma_alloc() hands out whole gigantic pages; align up. */
		size = round_up(size, PAGE_SIZE << order);

		res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
						 0, false, "hugetlb",
						 &hugetlb_cma[nid], nid);
		if (res) {
			/*
			 * Fix: the message lacked a trailing newline, so
			 * the next printk line was appended to it.
			 */
			pr_warn("hugetlb_cma: reservation failed: err %d, node %d\n",
				res, nid);
			continue;
		}

		reserved += size;
		pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
			size / SZ_1M, nid);

		/* Rounding up per-node sizes may satisfy the request early. */
		if (reserved >= hugetlb_cma_size)
			break;
	}
}
5608+
5609+
/*
 * Warn once, from hugetlb_init(), if the user passed "hugetlb_cma=" but
 * no architecture code ever called hugetlb_cma_reserve() — otherwise the
 * option would be silently ignored.
 */
void __init hugetlb_cma_check(void)
{
	if (cma_reserve_called || !hugetlb_cma_size)
		return;

	pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
}

#endif /* CONFIG_CMA */

0 commit comments

Comments
 (0)