Skip to content

Commit 0f651d0

Browse files
committed
8377771: Race on allocating overreserved large pages on a NUMA system
1 parent bfb6de5 commit 0f651d0

File tree

11 files changed

+109
-47
lines changed

11 files changed

+109
-47
lines changed

src/hotspot/os/bsd/gc/z/zNUMA_bsd.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "runtime/globals_extension.hpp"
2828

2929
void ZNUMA::pd_initialize() {
30+
_is_numa_system = false;
3031
_enabled = false;
3132
_count = !FLAG_IS_DEFAULT(ZFakeNUMA)
3233
? ZFakeNUMA

src/hotspot/os/linux/gc/z/zNUMA_linux.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ static uint* z_numa_id_to_node = nullptr;
3636
static uint32_t* z_node_to_numa_id = nullptr;
3737

3838
void ZNUMA::pd_initialize() {
39+
_is_numa_system = os::Linux::numa_available() != -1;
3940
_enabled = UseNUMA;
4041

4142
size_t configured_nodes = 0;

src/hotspot/os/linux/gc/z/zPhysicalMemoryBacking_linux.cpp

Lines changed: 29 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -385,9 +385,9 @@ bool ZPhysicalMemoryBacking::tmpfs_supports_transparent_huge_pages() const {
385385
return access(ZFILENAME_SHMEM_ENABLED, R_OK) == 0;
386386
}
387387

388-
ZErrno ZPhysicalMemoryBacking::fallocate_compat_mmap_hugetlbfs(zbacking_offset offset, size_t length, bool touch) const {
388+
ZErrno ZPhysicalMemoryBacking::fallocate_compat_mmap_hugetlbfs(zbacking_offset offset, size_t length, bool force_touch) const {
389389
// On hugetlbfs, mapping a file segment will fail immediately, without
390-
// the need to touch the mapped pages first, if there aren't enough huge
390+
// the need to touch the mapped pages first, if there aren't enough large
391391
// pages available to back the mapping.
392392
void* const addr = mmap(nullptr, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, untype(offset));
393393
if (addr == MAP_FAILED) {
@@ -396,44 +396,34 @@ ZErrno ZPhysicalMemoryBacking::fallocate_compat_mmap_hugetlbfs(zbacking_offset o
396396
}
397397

398398
// Once mapped, the huge pages are only reserved. We need to touch them
399-
// to associate them with the file segment. Note that we can not punch
400-
// hole in file segments which only have reserved pages.
401-
if (touch) {
402-
char* const start = (char*)addr;
403-
char* const end = start + length;
404-
os::pretouch_memory(start, end, _block_size);
399+
// to associate them with the file segment. This needs to be done immediately
400+
// if running on a NUMA system, where shared memory may overreserve the number
401+
// of large pages available.
402+
const bool should_touch = force_touch || ZNUMA::is_numa_system();
403+
bool touched = false;
404+
405+
if (should_touch) {
406+
// Touch the large pages safely
407+
touched = os::Linux::safe_touch_memory(addr, length, ZGranuleSize);
405408
}
406409

407-
// Unmap again. From now on, the huge pages that were mapped are allocated
408-
// to this file. There's no risk of getting a SIGBUS when mapping and
409-
// touching these pages again.
410+
// Unmap again
410411
if (munmap(addr, length) == -1) {
411412
// Failed
412413
return errno;
413414
}
414415

415-
// Success
416-
return 0;
417-
}
418-
419-
static bool safe_touch_mapping(void* addr, size_t length, size_t page_size) {
420-
char* const start = (char*)addr;
421-
char* const end = start + length;
422-
423-
// Touching a mapping that can't be backed by memory will generate a
424-
// SIGBUS. By using SafeFetch32 any SIGBUS will be safely caught and
425-
// handled. On tmpfs, doing a fetch (rather than a store) is enough
426-
// to cause backing pages to be allocated (there's no zero-page to
427-
// worry about).
428-
for (char *p = start; p < end; p += page_size) {
429-
if (SafeFetch32((int*)p, -1) == -1) {
430-
// Failed
431-
return false;
432-
}
416+
if (should_touch && !touched) {
417+
// Failed to touch all large pages
418+
return errno;
433419
}
434420

421+
// From now on, the large pages that were mapped are allocated to this file.
422+
// There's no risk of getting a SIGBUS when mapping and touching these pages
423+
// again.
424+
435425
// Success
436-
return true;
426+
return 0;
437427
}
438428

439429
ZErrno ZPhysicalMemoryBacking::fallocate_compat_mmap_tmpfs(zbacking_offset offset, size_t length) const {
@@ -451,7 +441,7 @@ ZErrno ZPhysicalMemoryBacking::fallocate_compat_mmap_tmpfs(zbacking_offset offse
451441
}
452442

453443
// Touch the mapping (safely) to make sure it's backed by memory
454-
const bool backed = safe_touch_mapping(addr, length, _block_size);
444+
const bool backed = os::Linux::safe_touch_memory(addr, length, _block_size);
455445

456446
// Unmap again. If successfully touched, the backing memory will
457447
// be allocated to this file. There's no risk of getting a SIGBUS
@@ -461,7 +451,7 @@ ZErrno ZPhysicalMemoryBacking::fallocate_compat_mmap_tmpfs(zbacking_offset offse
461451
return errno;
462452
}
463453

464-
// Success
454+
// Success?
465455
return backed ? 0 : ENOMEM;
466456
}
467457

@@ -486,7 +476,7 @@ ZErrno ZPhysicalMemoryBacking::fallocate_fill_hole_compat(zbacking_offset offset
486476
// mmap/munmap (for hugetlbfs and tmpfs with transparent huge pages) or pwrite
487477
// (for tmpfs without transparent huge pages and other filesystem types).
488478
if (ZLargePages::is_explicit()) {
489-
return fallocate_compat_mmap_hugetlbfs(offset, length, false /* touch */);
479+
return fallocate_compat_mmap_hugetlbfs(offset, length, false /* force_touch */);
490480
} else if (ZLargePages::is_transparent()) {
491481
return fallocate_compat_mmap_tmpfs(offset, length);
492482
} else {
@@ -534,14 +524,16 @@ ZErrno ZPhysicalMemoryBacking::fallocate_fill_hole(zbacking_offset offset, size_
534524
}
535525

536526
ZErrno ZPhysicalMemoryBacking::fallocate_punch_hole(zbacking_offset offset, size_t length) const {
537-
if (ZLargePages::is_explicit()) {
527+
// On a NUMA system we have to touch all the large pages when committing, so
528+
// there is no need to touch them again here.
529+
if (ZLargePages::is_explicit() && !ZNUMA::is_numa_system()) {
538530
// We can only punch hole in pages that have been touched. Non-touched
539531
// pages are only reserved, and not associated with any specific file
540532
// segment. We don't know which pages have been previously touched, so
541533
// we always touch them here to guarantee that we can punch hole.
542-
const ZErrno err = fallocate_compat_mmap_hugetlbfs(offset, length, true /* touch */);
534+
//
535+
const ZErrno err = fallocate_compat_mmap_hugetlbfs(offset, length, true /* force_touch */);
543536
if (err) {
544-
// Failed
545537
return err;
546538
}
547539
}
@@ -665,9 +657,7 @@ size_t ZPhysicalMemoryBacking::commit_default(zbacking_offset offset, size_t len
665657
}
666658

667659
size_t ZPhysicalMemoryBacking::commit(zbacking_offset offset, size_t length, uint32_t numa_id) const {
668-
if (ZNUMA::is_enabled() && !ZLargePages::is_explicit()) {
669-
// The memory is required to be preferred at the time it is paged in. As a
670-
// consequence we must prefer the memory when committing non-large pages.
660+
if (ZNUMA::is_enabled()) {
671661
return commit_numa_preferred(offset, length, numa_id);
672662
}
673663

src/hotspot/os/linux/gc/z/zPhysicalMemoryBacking_linux.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class ZPhysicalMemoryBacking {
4848
bool is_hugetlbfs() const;
4949
bool tmpfs_supports_transparent_huge_pages() const;
5050

51-
ZErrno fallocate_compat_mmap_hugetlbfs(zbacking_offset offset, size_t length, bool touch) const;
51+
ZErrno fallocate_compat_mmap_hugetlbfs(zbacking_offset offset, size_t length, bool force_touch) const;
5252
ZErrno fallocate_compat_mmap_tmpfs(zbacking_offset offset, size_t length) const;
5353
ZErrno fallocate_compat_pwrite(zbacking_offset offset, size_t length) const;
5454
ZErrno fallocate_fill_hole_compat(zbacking_offset offset, size_t length) const;

src/hotspot/os/linux/os_linux.cpp

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
#include "runtime/osInfo.hpp"
5757
#include "runtime/osThread.hpp"
5858
#include "runtime/perfMemory.hpp"
59+
#include "runtime/safefetch.hpp"
5960
#include "runtime/sharedRuntime.hpp"
6061
#include "runtime/stubRoutines.hpp"
6162
#include "runtime/threads.hpp"
@@ -3005,6 +3006,36 @@ void os::Linux::madvise_transparent_huge_pages(void* addr, size_t bytes) {
30053006
::madvise(addr, bytes, MADV_HUGEPAGE);
30063007
}
30073008

3009+
bool os::Linux::safe_touch_memory(void* addr, size_t bytes, size_t page_size) {
3010+
const int result = ::madvise(addr, bytes, MADV_POPULATE_WRITE);
3011+
if (result == 0) {
3012+
// Success
3013+
return true;
3014+
} else if (errno != EINVAL) {
3015+
// Failed call to madvise for some other reason than EINVAL
3016+
return false;
3017+
}
3018+
3019+
// If we failed because of EINVAL it might be because MADV_POPULATE_WRITE is
3020+
// not supported. We then try touching the memory using SafeFetch.
3021+
3022+
char* const start = (char*)addr;
3023+
char* const end = start + bytes;
3024+
3025+
// Touching a mapping that can't be backed by memory will generate a
3026+
// SIGBUS. By using SafeFetch32 any SIGBUS will be safely caught and
3027+
// handled. A fetch is enough to cause backing pages to be allocated.
3028+
for (char *p = start; p < end; p += page_size) {
3029+
if (SafeFetch32((int*)p, -1) == -1) {
3030+
// Failed
3031+
return false;
3032+
}
3033+
}
3034+
3035+
// Success
3036+
return true;
3037+
}
3038+
30083039
void os::pd_realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
30093040
if (Linux::should_madvise_anonymous_thps() && alignment_hint > vm_page_size()) {
30103041
Linux::madvise_transparent_huge_pages(addr, bytes);
@@ -3203,6 +3234,19 @@ static bool numa_syscall_check() {
32033234
return true;
32043235
}
32053236

3237+
void os::Linux::libnuma_early_init() {
3238+
// This function only sets up functionality that we need regardless of
3239+
// whether NUMA is enabled/disabled in the JVM via UseNUMA. This is a
3240+
// best effort, as libnuma is likely not available on all systems.
3241+
void *handle = dlopen("libnuma.so.1", RTLD_LAZY);
3242+
if (handle != nullptr) {
3243+
log_info(gc, numa)("Setting up numa available");
3244+
set_numa_available(CAST_TO_FN_PTR(numa_available_func_t,
3245+
libnuma_dlsym(handle, "numa_available")));
3246+
3247+
}
3248+
}
3249+
32063250
bool os::Linux::libnuma_init() {
32073251
// Requires sched_getcpu() and numa dependent syscalls support
32083252
if ((sched_getcpu() != -1) && numa_syscall_check()) {
@@ -3216,8 +3260,6 @@ bool os::Linux::libnuma_init() {
32163260
libnuma_dlsym(handle, "numa_max_node")));
32173261
set_numa_num_configured_nodes(CAST_TO_FN_PTR(numa_num_configured_nodes_func_t,
32183262
libnuma_dlsym(handle, "numa_num_configured_nodes")));
3219-
set_numa_available(CAST_TO_FN_PTR(numa_available_func_t,
3220-
libnuma_dlsym(handle, "numa_available")));
32213263
set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t,
32223264
libnuma_dlsym(handle, "numa_tonode_memory")));
32233265
set_numa_interleave_memory(CAST_TO_FN_PTR(numa_interleave_memory_func_t,
@@ -4194,9 +4236,27 @@ char* os::pd_reserve_memory_special(size_t bytes, size_t alignment, size_t page_
41944236

41954237
char* const addr = reserve_memory_special_huge_tlbfs(bytes, alignment, page_size, req_addr, exec);
41964238

4239+
log_info(gc,numa)("Using ZGC: %s, numa_available: %d", UseZGC ? "true" : "false", os::Linux::numa_available());
4240+
41974241
if (addr != nullptr) {
41984242
if (UseNUMAInterleaving) {
41994243
numa_make_global(addr, bytes);
4244+
} else if (UseZGC && os::Linux::numa_available() != -1) {
4245+
// Large pages are committed during reservation so that they are reserved for us.
4246+
// However, large pages may be overreserved on a NUMA system if using shared memory,
4247+
// which ZGC does. To make sure we can back the pages we need to fault them in
4248+
// immediately.
4249+
if (!os::Linux::safe_touch_memory(addr, bytes, page_size)) {
4250+
log_info(gc,numa)("Failed to touch all special memory");
4251+
if (::munmap(addr, bytes) != 0) {
4252+
ErrnoPreserver ep;
4253+
log_trace(os, map)("munmap failed: " RANGEFMT " errno=(%s)",
4254+
RANGEFMTARGS(addr, bytes),
4255+
os::strerror(ep.saved_errno()));
4256+
}
4257+
return nullptr;
4258+
}
4259+
log_info(gc,numa)("Touched %zu bytes special", bytes);
42004260
}
42014261
}
42024262

@@ -4610,6 +4670,10 @@ jint os::init_2(void) {
46104670
init_adjust_stacksize_for_guard_pages();
46114671
#endif
46124672

4673+
// Sets up functionality that may be needed regardless of whether the
4674+
// JVM should enable NUMA optimizations or not.
4675+
Linux::libnuma_early_init();
4676+
46134677
if (UseNUMA || UseNUMAInterleaving) {
46144678
Linux::numa_init();
46154679
}

src/hotspot/os/linux/os_linux.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ class os::Linux {
126126

127127
static void libpthread_init();
128128
static void sched_getcpu_init();
129+
static void libnuma_early_init();
129130
static bool libnuma_init();
130131
static void* libnuma_dlsym(void* handle, const char* name);
131132
// libnuma v2 (libnuma_1.2) symbols
@@ -195,6 +196,8 @@ class os::Linux {
195196

196197
static void madvise_transparent_huge_pages(void* addr, size_t bytes);
197198

199+
static bool safe_touch_memory(void* addr, size_t bytes, size_t page_size);
200+
198201
// Stack repair handling
199202

200203
// none present

src/hotspot/os/windows/gc/z/zNUMA_windows.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "runtime/globals_extension.hpp"
2828

2929
void ZNUMA::pd_initialize() {
30+
_is_numa_system = false;
3031
_enabled = false;
3132
_count = !FLAG_IS_DEFAULT(ZFakeNUMA)
3233
? ZFakeNUMA

src/hotspot/share/gc/z/zNUMA.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "gc/z/zNUMA.inline.hpp"
2727
#include "utilities/macros.hpp"
2828

29+
bool ZNUMA::_is_numa_system;
2930
bool ZNUMA::_enabled;
3031
uint32_t ZNUMA::_count;
3132

src/hotspot/share/gc/z/zNUMA.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class ZNUMA : public AllStatic {
3434
friend class ZTest;
3535

3636
private:
37+
static bool _is_numa_system;
3738
static bool _enabled;
3839
static uint32_t _count;
3940

@@ -42,6 +43,7 @@ class ZNUMA : public AllStatic {
4243
public:
4344
static void initialize();
4445

46+
static bool is_numa_system();
4547
static bool is_enabled();
4648
static bool is_faked();
4749

src/hotspot/share/gc/z/zNUMA.inline.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030
#include "gc/z/zGlobals.hpp"
3131
#include "utilities/align.hpp"
3232

33+
inline bool ZNUMA::is_numa_system() {
34+
return _is_numa_system;
35+
}
36+
3337
inline bool ZNUMA::is_enabled() {
3438
return _enabled;
3539
}

0 commit comments

Comments
 (0)