diff --git a/.github/workflows/pr_push.yml b/.github/workflows/pr_push.yml index 52bd73756a..b80cb8177f 100644 --- a/.github/workflows/pr_push.yml +++ b/.github/workflows/pr_push.yml @@ -19,19 +19,15 @@ permissions: jobs: CodeChecks: uses: ./.github/workflows/reusable_checks.yml - FastBuild: - name: Fast builds - needs: [CodeChecks] - uses: ./.github/workflows/reusable_fast.yml Build: name: Basic builds - needs: [FastBuild] + uses: ./.github/workflows/reusable_basic.yml DevDax: - needs: [FastBuild] + uses: ./.github/workflows/reusable_dax.yml MultiNuma: - needs: [FastBuild] + uses: ./.github/workflows/reusable_multi_numa.yml L0: needs: [Build] @@ -56,10 +52,10 @@ jobs: runner: "CUDA" shared_lib: "['ON']" Sanitizers: - needs: [FastBuild] + uses: ./.github/workflows/reusable_sanitizers.yml QEMU: - needs: [FastBuild] + uses: ./.github/workflows/reusable_qemu.yml with: short_run: true diff --git a/include/umf/providers/provider_os_memory.h b/include/umf/providers/provider_os_memory.h index 978965621c..5b2c5ff6c0 100644 --- a/include/umf/providers/provider_os_memory.h +++ b/include/umf/providers/provider_os_memory.h @@ -46,6 +46,7 @@ typedef enum umf_numa_mode_t { /// umf_numa_split_partition_t can be passed in umf_os_memory_provider_params_t structure /// to specify other distribution. UMF_NUMA_MODE_SPLIT, + /// The memory is allocated on the node of the CPU that triggered the /// allocation. If this mode is specified, nodemask must be NULL and /// maxnode must be 0. @@ -58,6 +59,7 @@ typedef struct umf_numa_split_partition_t { /// The weight of the partition, representing the proportion of /// the allocation that should be assigned to this NUMA node. unsigned weight; + /// The NUMA node where the pages assigned to this partition will be bound. 
unsigned target; } umf_numa_split_partition_t; diff --git a/src/memtargets/memtarget_numa.c b/src/memtargets/memtarget_numa.c index a0a1e592a0..56aea9ca49 100644 --- a/src/memtargets/memtarget_numa.c +++ b/src/memtargets/memtarget_numa.c @@ -218,6 +218,8 @@ static umf_result_t numa_get_capacity(void *memTarget, size_t *capacity) { return UMF_RESULT_ERROR_INVALID_ARGUMENT; } +#if defined(_WIN32) || defined(__APPLE__) + hwloc_topology_t topology = umfGetTopology(); if (!topology) { return UMF_RESULT_ERROR_NOT_SUPPORTED; @@ -234,6 +236,44 @@ static umf_result_t numa_get_capacity(void *memTarget, size_t *capacity) { } *capacity = numaNode->attr->numanode.local_memory; + +#else // Linux + + struct numa_memtarget_t *numaTarget = (struct numa_memtarget_t *)memTarget; + unsigned node = numaTarget->physical_id; + + char path[256]; + snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/meminfo", + node); + FILE *file = fopen(path, "r"); + if (!file) { + LOG_PDEBUG("Opening sysfs file %s failed", path); + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + + char line[256]; + size_t node_size = 0; + while (fgets(line, sizeof(line), file)) { + // search for the MemTotal line + if (strncmp(line, "Node ", 5) == 0 && + sscanf(line, "Node %u MemTotal: %zu kB", &node, &node_size) == 2 && + node == numaTarget->physical_id) { + // convert kB to bytes + node_size *= 1024; + break; + } + } + fclose(file); + + if (node_size == 0) { + LOG_ERR("Failed to find MemTotal for node %u", numaTarget->physical_id); + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + + *capacity = (size_t)node_size; + +#endif + return UMF_RESULT_SUCCESS; } @@ -254,9 +294,28 @@ static size_t memattr_get_worst_value(memattr_type_t type) { } } +#if !defined(_WIN32) && !defined(__APPLE__) + +static size_t memattr_get_best_value(memattr_type_t type) { + switch (type) { + case MEMATTR_TYPE_BANDWIDTH: + return SIZE_MAX; + case MEMATTR_TYPE_LATENCY: + return 0; + default: + assert(0); // Should not be reachable + 
return 0; + } +} + +#endif // !defined(_WIN32) && !defined(__APPLE__) + static umf_result_t query_attribute_value(void *srcMemoryTarget, void *dstMemoryTarget, size_t *value, memattr_type_t type) { + +#if defined(_WIN32) || defined(__APPLE__) + hwloc_topology_t topology = umfGetTopology(); if (!topology) { LOG_PERR("Retrieving cached topology failed"); @@ -315,6 +374,60 @@ static umf_result_t query_attribute_value(void *srcMemoryTarget, *value = memAttrValue; +#else + + struct numa_memtarget_t *srcNumaTarget = + (struct numa_memtarget_t *)srcMemoryTarget; + struct numa_memtarget_t *dstNumaTarget = + (struct numa_memtarget_t *)dstMemoryTarget; + + if (srcNumaTarget->physical_id == dstNumaTarget->physical_id) { + // If both targets are the same, we return the best possible value. + *value = memattr_get_best_value(type); + return UMF_RESULT_SUCCESS; + } + + // For Linux, we use sysfs to query the bandwidth and latency. + char path[256]; + if (type == MEMATTR_TYPE_BANDWIDTH) { + snprintf(path, sizeof(path), + "/sys/devices/system/node/node%u/node%u/memory_bandwidth", + srcNumaTarget->physical_id, dstNumaTarget->physical_id); + } else if (type == MEMATTR_TYPE_LATENCY) { + snprintf(path, sizeof(path), + "/sys/devices/system/node/node%u/node%u/memory_latency", + srcNumaTarget->physical_id, dstNumaTarget->physical_id); + } else { + assert(0); // Shouldn't be reachable. 
+ return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + FILE *file = fopen(path, "r"); + if (!file) { + LOG_PDEBUG("Opening sysfs file %s failed", path); + *value = memattr_get_worst_value(type); + return UMF_RESULT_SUCCESS; + } + + char line[64]; + if (!fgets(line, sizeof(line), file)) { + LOG_PDEBUG("Reading sysfs file %s failed", path); + fclose(file); + *value = memattr_get_worst_value(type); + return UMF_RESULT_SUCCESS; + } + fclose(file); + char *endptr; + long long val = strtoll(line, &endptr, 10); + if (endptr == line || *endptr != '\n' || val < 0) { + LOG_PDEBUG("Parsing sysfs file %s failed", path); + *value = memattr_get_worst_value(type); + return UMF_RESULT_SUCCESS; + } + + *value = (size_t)val; + +#endif // _WIN32 || _APPLE_ + return UMF_RESULT_SUCCESS; } diff --git a/src/provider/provider_os_memory.c b/src/provider/provider_os_memory.c index abea227a35..35863fada4 100644 --- a/src/provider/provider_os_memory.c +++ b/src/provider/provider_os_memory.c @@ -8,11 +8,16 @@ #include #include #include - #include #include #include #include + +#if !defined(_WIN32) && !defined(__APPLE__) +#include +#include +#endif + #include #include #include @@ -24,6 +29,7 @@ #include "ctl/ctl_internal.h" #include "libumf.h" #include "provider_os_memory_internal.h" +#include "topology.h" #include "utils_assert.h" #include "utils_common.h" #include "utils_concurrency.h" @@ -32,8 +38,8 @@ #define CTL_PROVIDER_TYPE os_memory_provider_t #include "provider_ctl_stats_impl.h" +#define MAX_NUMNODES 1024 #define NODESET_STR_BUF_LEN 1024 - #define TLS_MSG_BUF_LEN 1024 static const char *DEFAULT_NAME = "OS"; @@ -152,8 +158,14 @@ static umf_result_t initialize_nodeset(os_memory_provider_t *os_provider, // Hwloc_set_area_membind fails if empty nodeset is passed so // if no node is specified, just pass all available nodes. // For modes where no node is needed, they will be ignored anyway. 
+ +#if defined(_WIN32) || defined(__APPLE__) out_nodeset[0] = hwloc_bitmap_dup( hwloc_topology_get_complete_nodeset(os_provider->topo)); +#else + out_nodeset[0] = hwloc_bitmap_dup(umfGetTopology2()); +#endif + if (!out_nodeset[0]) { goto err_free_list; } @@ -518,6 +530,11 @@ translate_params(const umf_os_memory_provider_params_t *in_params, provider->numa_flags = getHwlocMembindFlags(in_params->numa_mode, is_dedicated_node_bind); + +#if !defined(_WIN32) && !defined(__APPLE__) + provider->dedicated = is_dedicated_node_bind; +#endif + provider->mode = in_params->numa_mode; provider->part_size = in_params->part_size; @@ -561,6 +578,11 @@ static umf_result_t os_initialize(const void *params, void **provider) { snprintf(os_provider->name, sizeof(os_provider->name), "%s", in_params->name); +#if defined(_WIN32) || defined(__APPLE__) + + //struct timespec ts_init_start, ts_init_end; + //clock_gettime(CLOCK_MONOTONIC, &ts_init_start); + int r = hwloc_topology_init(&os_provider->topo); if (r) { LOG_ERR("HWLOC topology init failed"); @@ -577,6 +599,13 @@ static umf_result_t os_initialize(const void *params, void **provider) { goto err_destroy_hwloc_topology; } + //clock_gettime(CLOCK_MONOTONIC, &ts_init_end); + //LOG_FATAL("HWLOC topology initialized in %ld.%09ld seconds", + // ts_init_end.tv_sec - ts_init_start.tv_sec, + // ts_init_end.tv_nsec - ts_init_start.tv_nsec); + +#endif // _WIN32 + os_provider->fd_offset_map = critnib_new(NULL, NULL); if (!os_provider->fd_offset_map) { LOG_ERR("creating file descriptor offset map failed"); @@ -625,8 +654,11 @@ static umf_result_t os_initialize(const void *params, void **provider) { err_destroy_critnib: critnib_delete(os_provider->fd_offset_map); err_destroy_hwloc_topology: + +#if defined(_WIN32) || defined(__APPLE__) hwloc_topology_destroy(os_provider->topo); err_free_os_provider: +#endif umf_ba_global_free(os_provider); return ret; } @@ -649,7 +681,10 @@ static umf_result_t os_finalize(void *provider) { if 
(os_provider->nodeset_str_buf) { umf_ba_global_free(os_provider->nodeset_str_buf); } + +#if defined(_WIN32) || defined(__APPLE__) hwloc_topology_destroy(os_provider->topo); +#endif umf_ba_global_free(os_provider); return UMF_RESULT_SUCCESS; } @@ -1012,10 +1047,52 @@ static umf_result_t os_alloc(void *provider, size_t size, size_t alignment, do { errno = 0; + ret = 0; + +#if defined(_WIN32) || defined(__APPLE__) ret = hwloc_set_area_membind(os_provider->topo, membind.addr, membind.bind_size, membind.bitmap, os_provider->numa_policy, os_provider->numa_flags); +#else // !_WIN32 && !_APPLE__ + + // NOTE: could we done this + + // on Linux, use mbind syscall directly instead of hwloc + unsigned long nodemask = 0; + int maxnode = 8 * sizeof(nodemask); // up to 64 nodes + if (membind.bitmap) { + for (int i = 0; i < maxnode; ++i) { + if (hwloc_bitmap_isset(membind.bitmap, i)) { + nodemask |= (1UL << i); + } + } + } + + int mbind_mode = MPOL_DEFAULT; + if (os_provider->mode == UMF_NUMA_MODE_INTERLEAVE && + os_provider->dedicated == 0) { + mbind_mode = MPOL_INTERLEAVE; + } else if (os_provider->mode == UMF_NUMA_MODE_SPLIT) { + mbind_mode = MPOL_BIND; + } else if (os_provider->mode == UMF_NUMA_MODE_LOCAL) { + mbind_mode = MPOL_LOCAL; + nodemask = 0; + } else if (os_provider->mode == UMF_NUMA_MODE_PREFERRED) { + mbind_mode = MPOL_BIND; + } else if (os_provider->mode == UMF_NUMA_MODE_BIND || + os_provider->dedicated) { + mbind_mode = MPOL_BIND; + } + + unsigned long mbind_flags = 0; + if (os_provider->dedicated) { + mbind_flags |= MPOL_MF_STRICT; + } + + ret = syscall(__NR_mbind, membind.addr, membind.bind_size, + mbind_mode, &nodemask, maxnode, mbind_flags); +#endif // !_WIN32 && !_APPLE__ if (ret) { os_store_last_native_error(UMF_OS_RESULT_ERROR_BIND_FAILED, diff --git a/src/provider/provider_os_memory_internal.h b/src/provider/provider_os_memory_internal.h index 3648d4a88f..87a3f28433 100644 --- a/src/provider/provider_os_memory_internal.h +++ 
b/src/provider/provider_os_memory_internal.h
@@ -68,7 +68,13 @@ typedef struct os_memory_provider_t {
     unsigned partitions_len;
     size_t partitions_weight_sum;
 
+#if defined(_WIN32) || defined(__APPLE__)
     hwloc_topology_t topo;
+#else
+    // NOTE: on linux we don't want to use hwloc_topology_t directly because
+    // of its long initialization time
+    int dedicated;
+#endif
 
     char name[64];
 
diff --git a/src/topology.c b/src/topology.c
index eab7992ced..1ada1078a6 100644
--- a/src/topology.c
+++ b/src/topology.c
@@ -1,19 +1,34 @@
 /*
  *
- * Copyright (C) 2024 Intel Corporation
+ * Copyright (C) 2024-2025 Intel Corporation
  *
  * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  *
  */
 
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
 #include "base_alloc_global.h"
 #include "umf_hwloc.h"
 #include "utils_concurrency.h"
 #include "utils_log.h"
 
 static hwloc_topology_t topology = NULL;
+static hwloc_bitmap_t topology2 = NULL;
+
 static UTIL_ONCE_FLAG topology_initialized = UTIL_ONCE_FLAG_INIT;
+static UTIL_ONCE_FLAG topology_initialized2 = UTIL_ONCE_FLAG_INIT;
 
 void umfDestroyTopology(void) {
     if (topology) {
@@ -23,6 +38,17 @@ void umfDestroyTopology(void) {
         static UTIL_ONCE_FLAG is_initialized = UTIL_ONCE_FLAG_INIT;
         memcpy(&topology_initialized, &is_initialized,
                sizeof(topology_initialized));
     }
+
+    // topology2 is built from sysfs independently of the hwloc topology, so
+    // it is released (and its once-flag re-armed) in its own branch.
+    if (topology2) {
+        hwloc_bitmap_free(topology2);
+        topology2 = NULL;
+
+        static UTIL_ONCE_FLAG is_initialized2 = UTIL_ONCE_FLAG_INIT;
+        memcpy(&topology_initialized2, &is_initialized2,
+               sizeof(topology_initialized2));
+    }
 }
 
@@ -40,7 +66,84 @@ static void umfCreateTopology(void) {
     }
 }
 
+// Appends the IDs of the NUMA nodes listed in /sys/devices/system/node/ to
+// `nodes` (capacity `nodes_size`), starting at index `*num`, and advances
+// `*num` past the last ID stored. The caller must initialize `*num`
+// (normally to 0). IDs that do not fit in the array are dropped.
+// Returns 0 on success or -1 if the sysfs directory cannot be opened.
+int utils_get_complete_nodeset(size_t *nodes, size_t nodes_size, size_t *num) {
+    DIR *dir = opendir("/sys/devices/system/node/");
+    if (!dir) {
+        return -1;
+    }
+
+    struct dirent *entry;
+    while ((entry = readdir(dir)) != NULL) {
+        if (strncmp(entry->d_name, "node", 4) != 0) {
+            continue;
+        }
+
+        const char *id_str = entry->d_name + 4;
+        char *endptr;
+        long node_id = strtol(id_str, &endptr, 10);
+
+        // strtol() consumes nothing for a bare "node" name and leaves endptr
+        // at the terminating '\0', so require at least one parsed character.
+        if (endptr == id_str || *endptr != '\0' || node_id < 0) {
+            continue;
+        }
+
+        if (*num < nodes_size) {
+            nodes[*num] = (size_t)node_id;
+            (*num)++;
+        }
+    }
+
+    closedir(dir);
+    return 0;
+}
+
+// One-time initializer for topology2: allocates the bitmap and sets one bit
+// per NUMA node ID found in sysfs. The bitmap is left empty if the node list
+// cannot be allocated or read, and NULL if the bitmap itself cannot be
+// allocated.
+static void umfCreateTopology2(void) {
+    // capacity of the temporary node ID list
+    enum { MAX_SYSFS_NODES = 1024 };
+
+    topology2 = hwloc_bitmap_alloc();
+    if (!topology2) {
+        LOG_ERR("Allocating the NUMA nodeset bitmap failed");
+        return;
+    }
+
+    size_t *nodes = umf_ba_global_alloc(sizeof(*nodes) * MAX_SYSFS_NODES);
+    if (!nodes) {
+        LOG_ERR("Allocating the NUMA node list failed");
+        return;
+    }
+
+    size_t num = 0;
+    if (utils_get_complete_nodeset(nodes, MAX_SYSFS_NODES, &num) == 0) {
+        for (size_t i = 0; i < num; i++) {
+            hwloc_bitmap_set(topology2, (unsigned)nodes[i]);
+        }
+    } else {
+        LOG_ERR("Reading the NUMA node list from sysfs failed");
+    }
+
+    // single release point, reached on success and on sysfs failure alike
+    umf_ba_global_free(nodes);
+}
+
 hwloc_topology_t umfGetTopology(void) {
     utils_init_once(&topology_initialized, umfCreateTopology);
     return topology;
 }
+
+// Returns the lazily created bitmap of NUMA node IDs (see umfCreateTopology2),
+// or NULL if it could not be allocated.
+hwloc_bitmap_t umfGetTopology2(void) {
+    utils_init_once(&topology_initialized2, umfCreateTopology2);
+    return topology2;
+}
diff --git a/src/topology.h b/src/topology.h
index c20defda7a..975ef1f73f 100644
--- a/src/topology.h
+++ b/src/topology.h
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (C) 2024 Intel Corporation
+ * Copyright (C) 2024-2025 Intel Corporation
  *
  * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -19,6 +19,8 @@ extern "C" {
 hwloc_topology_t umfGetTopology(void);
 void umfDestroyTopology(void);
 
+hwloc_bitmap_t umfGetTopology2(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/utils/utils_linux_common.c b/src/utils/utils_linux_common.c
index cd0fefd2ae..c0283b8516 100644
--- a/src/utils/utils_linux_common.c
+++ b/src/utils/utils_linux_common.c
@@ -1,15 +1,18 @@
 /*
  *
- * Copyright (C) 2023-2024 Intel Corporation
+ * Copyright (C) 2023-2025 Intel Corporation
  *
  * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  *
  */
 
+#include
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include