Skip to content
6 changes: 6 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2276,12 +2276,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"- distribute: spread execution evenly over all nodes\n"
"- isolate: only spawn threads on CPUs on the node that execution started on\n"
"- numactl: use the CPU map provided by numactl\n"
#ifdef GGML_USE_NUMA_MIGRATE
"- migrate: for affinity threads with page migration across NUMA nodes\n"
#endif
"if run without this previously, it is recommended to drop the system page cache before using this\n"
"see https://github.com/ggml-org/llama.cpp/issues/1437",
[](common_params & params, const std::string & value) {
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
#ifdef GGML_USE_NUMA_MIGRATE
else if (value == "migrate") { params.numa = GGML_NUMA_STRATEGY_MIGRATE; }
#endif
else { throw std::invalid_argument("invalid value"); }
}
).set_env("LLAMA_ARG_NUMA"));
Expand Down
5 changes: 5 additions & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,11 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})

# NUMA page-migration support (requires libnuma); enables the
# GGML_USE_NUMA_MIGRATE code paths and the --numa migrate strategy.
option(GGML_NUMA_MIGRATE "ggml: use NUMA_MIGRATE" OFF)
# Number of NUMA nodes pages are migrated across; exported to the code as the
# GGML_NUMA_MIGRATE_NODES compile definition.
set(GGML_NUMA_MIGRATE_NODES "2" CACHE STRING
"ggml: the number of NUMA nodes during page migration")
# Extra diagnostics for the migration path (adds GGML_USE_NUMA_MIGRATE_DEBUG).
option(GGML_NUMA_MIGRATE_DEBUG "ggml: enable debugging of NUMA_MIGRATE" OFF)

option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
Expand Down
3 changes: 3 additions & 0 deletions ggml/include/ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,9 @@ extern "C" {
// CPU buffer types are always available
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef GGML_USE_NUMA_MIGRATE
// Page size used to over-allocate/align buffers for NUMA page migration
// (see alloc_tensor_range in ggml-alloc.c); presumably the OS page size —
// confirm in the implementation.
GGML_API size_t ggml_backend_get_page_size(void);
#endif

#ifdef __cplusplus
}
Expand Down
6 changes: 6 additions & 0 deletions ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ extern "C" {
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
#ifdef GGML_USE_NUMA_MIGRATE
uint8_t * work_data_numa[GGML_NUMA_MIGRATE_NODES];
#endif

int n_threads;
struct ggml_threadpool * threadpool;
Expand All @@ -28,6 +31,9 @@ extern "C" {
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
#ifdef GGML_USE_NUMA_MIGRATE
GGML_NUMA_STRATEGY_MIGRATE = 5,
#endif
GGML_NUMA_STRATEGY_COUNT
};

Expand Down
24 changes: 24 additions & 0 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -343,3 +343,27 @@ if (BUILD_SHARED_LIBS)
target_compile_definitions(${target} PUBLIC GGML_SHARED)
endforeach()
endif()

if (GGML_NUMA_MIGRATE)
    # Locate libnuma. The NUMA_ROOT environment variable may point at a
    # non-standard install prefix.
    find_path(NUMA_ROOT_DIR
        NAMES include/numa.h
        PATHS ENV NUMA_ROOT
        DOC "NUMA root directory")

    find_library(NUMA_LIBRARY
        NAMES numa
        HINTS ${NUMA_ROOT_DIR}
        PATH_SUFFIXES lib lib64
        DOC "NUMA library")

    if (NOT NUMA_LIBRARY)
        message(FATAL_ERROR "Could NOT find NUMA library.")
    endif()

    # Build the definition list once instead of duplicating the whole
    # target_compile_definitions() call in both branches.
    set(GGML_NUMA_MIGRATE_DEFS
        GGML_USE_NUMA_MIGRATE
        GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES})
    if (GGML_NUMA_MIGRATE_DEBUG)
        list(APPEND GGML_NUMA_MIGRATE_DEFS GGML_USE_NUMA_MIGRATE_DEBUG)
    endif()
    target_compile_definitions(ggml-base PUBLIC ${GGML_NUMA_MIGRATE_DEFS})

    # Make numa.h visible when it lives under a non-standard prefix;
    # previously NUMA_ROOT_DIR was located but never used.
    if (NUMA_ROOT_DIR)
        target_include_directories(ggml-base PRIVATE ${NUMA_ROOT_DIR}/include)
    endif()

    target_link_libraries(ggml-base PRIVATE ${NUMA_LIBRARY})
endif()
16 changes: 16 additions & 0 deletions ggml/src/ggml-alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -948,6 +948,22 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
ggml_backend_buffer_type_t buft, size_t size,
ggml_backend_buffer_t ** buffers, size_t * n_buffers) {

#ifdef GGML_USE_NUMA_MIGRATE
    // Over-allocate by one page per tensor that will actually be placed in
    // this buffer (tensors that already have data or are views are skipped),
    // so each tensor can later be aligned to a page boundary for migration.
    size_t num_of_tensors = 0;
    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
        if (t->data == NULL && t->view_src == NULL) {
            num_of_tensors++;
        }
    }
    size_t ps = ggml_backend_get_page_size();
    size_t original_size = size;
    size += ps * num_of_tensors;
    // %zu is the portable conversion for size_t; %ld is wrong on LLP64
    // platforms (e.g. 64-bit Windows) where long is 32-bit.
    GGML_LOG_DEBUG("alloc buffer for NUMA page migration, num of tensors: %zu, size increased from %zu to %zu, increased %zu MiB\n",
                   num_of_tensors, original_size, size, (size - original_size) / 1024 / 1024);
#endif

ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
Expand Down
Loading