Skip to content

Commit b8bf5fa

Browse files
committed
don't try to mirror weights when we're not in --numa mirror mode. Also make logging prettier.
1 parent 34a5017 commit b8bf5fa

File tree

9 files changed

+129
-61
lines changed

9 files changed

+129
-61
lines changed

.github/copilot-instructions.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ source ../../../.venv/bin/activate
102102
./build/bin/llama-cli --version
103103

104104
# Test model loading (requires model file)
105-
./build/bin/llama-cli -m path/to/model.gguf -p "Hello" -n 10
105+
./build/bin/llama-cli -m path/to/model.gguf -p "Hello" -n 10 -no-cnv
106106
```
107107

108108
## Code Quality and Linting

.github/instructions/numa-mirroring-implementation.md

Lines changed: 62 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ This document describes the NUMA (Non-Uniform Memory Access) mirroring implement
88

99
On a 2-NUMA-node system testing with Qwen2.5-0.5B-Instruct-Q8_0:
1010

11-
Without numa mirroring:
11+
Without numa_mirroring:
1212
```
1313
developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror$ cd /workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror && ./build-release/bin/llama-bench -m ../.devcontainer/Qwen3-32B-Q6_K.gguf
1414
| model | size | params | backend | threads | test | t/s |
@@ -17,7 +17,7 @@ developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror$
1717
| qwen3 32B Q6_K | 25.03 GiB | 32.76 B | CPU | 56 | tg128 | 1.91 ± 0.00 |
1818
```
1919

20-
With numa mirroring:
20+
With numa_mirroring:
2121
```
2222
developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev$ ./build/bin/llama-bench -m .
2323
/.devcontainer/Qwen3-32B-Q6_K.gguf --numa mirror
@@ -118,7 +118,7 @@ cmake --build build --parallel
118118
### Command Line Usage
119119
```bash
120120
# Enable NUMA mirroring for inference
121-
./llama-cli -m model.gguf --numa mirror -p "Hello world"
121+
./llama-cli -m model.gguf --numa mirror -p "Hello world" -no-cnv
122122

123123
# Benchmark with NUMA mirroring
124124
./llama-bench -m model.gguf --numa mirror
@@ -308,15 +308,15 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
308308
// Bind current thread to the target NUMA node for first-touch
309309
struct bitmask* old_mask = numa_get_run_node_mask();
310310
if (numa_run_on_node(node) != 0) {
311-
LLAMA_LOG_DEBUG("NUMA MIRRORING: Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
311+
LLAMA_LOG_DEBUG("numa_mirroring Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
312312
// Continue anyway - might still work
313313
}
314314

315315
// Use posix_memalign for SIMD alignment
316316
void* ptr = nullptr;
317317
int ret = posix_memalign(&ptr, alignment, size);
318318
if (ret != 0) {
319-
LLAMA_LOG_DEBUG("NUMA MIRRORING: posix_memalign failed for %zu bytes with alignment %zu: %s\n",
319+
LLAMA_LOG_DEBUG("numa_mirroring posix_memalign failed for %zu bytes with alignment %zu: %s\n",
320320
size, alignment, strerror(ret));
321321
// Restore original thread binding
322322
if (old_mask) {
@@ -339,23 +339,23 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
339339
numa_free_nodemask(old_mask);
340340
}
341341

342-
LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
342+
LLAMA_LOG_DEBUG("numa_mirroring First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
343343
size, node, ptr, alignment);
344344
return ptr;
345345
}
346346

347347
void mmap_numa_mirror(struct llama_file * file) {
348348
int num_nodes = numa_num_configured_nodes();
349349
if (num_nodes <= 1) {
350-
throw std::runtime_error("NUMA MIRRORING: NUMA mirror mode requires multiple NUMA nodes");
350+
throw std::runtime_error("numa_mirroring NUMA mirror mode requires multiple NUMA nodes");
351351
}
352352
353-
LLAMA_LOG_DEBUG("NUMA MIRRORING: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
353+
LLAMA_LOG_INFO("numa_mirroring NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
354354
file->size() / (1024.0 * 1024.0), num_nodes);
355355

356356
size_t total_size = file->size();
357357
for (int node = 0; node < num_nodes; ++node) {
358-
LLAMA_LOG_DEBUG("NUMA MIRRORING: Allocating on node %d using first-touch approach\n", node);
358+
LLAMA_LOG_INFO("numa_mirroring Allocating on node %d \n", node);
359359

360360
void* node_mem = numa_alloc_first_touch(total_size, node);
361361
if (!node_mem) {
@@ -368,24 +368,71 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
368368
// VERIFICATION: Check that memory was actually allocated on the expected NUMA node
369369
int actual_node = -1;
370370
if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
371-
LLAMA_LOG_DEBUG("NUMA MIRRORING: Memory at %p allocated on node %d (expected %d)\n",
371+
LLAMA_LOG_DEBUG("numa_mirroring Memory at %p allocated on node %d (expected %d)\n",
372372
node_mem, actual_node, node);
373373
if (actual_node != node) {
374-
LLAMA_LOG_WARN("NUMA MIRRORING: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
374+
LLAMA_LOG_WARN("numa_mirroring WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
375375
node, actual_node);
376376
} else {
377-
LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch succeeded - memory correctly placed on node %d\n", node);
377+
LLAMA_LOG_DEBUG("numa_mirroring First-touch succeeded - memory correctly placed on node %d\n", node);
378378
}
379379
} else {
380-
LLAMA_LOG_WARN("NUMA MIRRORING: Could not verify allocation node for %p: %s\n",
380+
LLAMA_LOG_WARN("numa_mirroring Could not verify allocation node for %p: %s\n",
381381
node_mem, strerror(errno));
382382
}
383383

384384
file->seek(0, SEEK_SET);
385385
file->read_raw(node_mem, total_size);
386386
numa_mappings.push_back({node_mem, total_size});
387387

388-
LLAMA_LOG_DEBUG("NUMA MIRRORING: Successfully allocated and populated %.2f MB on node %d at %p\n",
388+
LLAMA_LOG_DEBUG("numa_mirroring Successfully allocated and populated %.2f MB on node %d at %p\n",
389+
total_size / (1024.0 * 1024.0), node, node_mem);
390+
}
391+
addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
392+
}
393+
394+
void mmap_numa_mirror(struct llama_file * file) {
395+
int num_nodes = numa_num_configured_nodes();
396+
if (num_nodes <= 1) {
397+
throw std::runtime_error("numa_mirroring NUMA mirror mode requires multiple NUMA nodes");
398+
}
399+
400+
LLAMA_LOG_INFO("numa_mirroring NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
401+
file->size() / (1024.0 * 1024.0), num_nodes);
402+
403+
size_t total_size = file->size();
404+
for (int node = 0; node < num_nodes; ++node) {
405+
LLAMA_LOG_INFO("numa_mirroring Allocating on node %d \n", node);
406+
407+
void* node_mem = numa_alloc_first_touch(total_size, node);
408+
if (!node_mem) {
409+
for (const auto& mapping : numa_mappings) {
410+
free(mapping.addr); // Use free() for posix_memalign allocated memory
411+
}
412+
throw std::runtime_error("NUMA mirror allocation failed");
413+
}
414+
415+
// VERIFICATION: Check that memory was actually allocated on the expected NUMA node
416+
int actual_node = -1;
417+
if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
418+
LLAMA_LOG_DEBUG("numa_mirroring Memory at %p allocated on node %d (expected %d)\n",
419+
node_mem, actual_node, node);
420+
if (actual_node != node) {
421+
LLAMA_LOG_WARN("numa_mirroring WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
422+
node, actual_node);
423+
} else {
424+
LLAMA_LOG_DEBUG("numa_mirroring First-touch succeeded - memory correctly placed on node %d\n", node);
425+
}
426+
} else {
427+
LLAMA_LOG_WARN("numa_mirroring Could not verify allocation node for %p: %s\n",
428+
node_mem, strerror(errno));
429+
}
430+
431+
file->seek(0, SEEK_SET);
432+
file->read_raw(node_mem, total_size);
433+
numa_mappings.push_back({node_mem, total_size});
434+
435+
LLAMA_LOG_DEBUG("numa_mirroring Successfully allocated and populated %.2f MB on node %d at %p\n",
389436
total_size / (1024.0 * 1024.0), node, node_mem);
390437
}
391438
addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
@@ -424,7 +471,7 @@ There are models you can use for testing in our .devcontainer folder:
424471

425472
Use qwen2.5-0.5b-instruct-q8_0.gguf for a quick verification run, while a bigger, dense model like Qwen3-32B-Q6_K.gguf will be good to test relative speed gains.
426473

427-
If testing with `llama-cli`, always be sure to use the `--no-cnv` switch to prevent it from starting an interactive conversation.
474+
If testing with `llama-cli`, always be sure to use the `-no-cnv` switch to prevent it from starting an interactive conversation.
428475

429476

430477
### System Requirements Check

common/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@ if (LLAMA_CURL)
8989
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
9090
endif ()
9191

92+
# Check if OpenMP is enabled in ggml-cpu and add the definition
93+
if (GGML_OPENMP_ENABLED)
94+
target_compile_definitions(${TARGET} PRIVATE GGML_USE_OPENMP)
95+
endif ()
96+
9297
if (LLAMA_LLGUIDANCE)
9398
include(ExternalProject)
9499
set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)

ggml/CMakeLists.txt

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -378,16 +378,11 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
378378
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
379379
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
380380

381-
# Always enable NUMA support (controlled at runtime via --numa mirror)
381+
# Always enable NUMA support
382382
find_library(NUMA_LIBRARY NAMES numa)
383383
if (NUMA_LIBRARY)
384384
message(STATUS "libnuma: ${NUMA_LIBRARY}")
385-
message(STATUS
386-
"-----------------\n"
387-
"NUMA support enabled (controlled at runtime via --numa mirror)\n"
388-
"Uses numa_alloc_onnode() for reliable NUMA-aware memory allocation")
389-
message(STATUS
390-
"-----------------")
385+
message(STATUS "NUMA support enabled")
391386

392387
foreach(lib "ggml" "ggml-base")
393388
target_link_libraries(${lib} PUBLIC ${NUMA_LIBRARY})

ggml/include/ggml-cpu.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ extern "C" {
3333

3434
GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
3535
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
36+
GGML_BACKEND_API enum ggml_numa_strategy ggml_numa_get_strategy(void); // get current NUMA strategy
3637

3738
GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
3839
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,10 @@ bool ggml_is_numa(void) {
775775
g_state.numa.numa_strategy != GGML_NUMA_STRATEGY_DISABLED;
776776
}
777777

778+
enum ggml_numa_strategy ggml_numa_get_strategy(void) {
779+
return g_state.numa.numa_strategy;
780+
}
781+
778782
//
779783
// NUMA-aware work buffer allocation:
780784
// Based on empirical testing, allocating work buffers on node 0 provides

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,9 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
632632
if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
633633
return (void *)ggml_is_numa;
634634
}
635+
if (strcmp(name, "ggml_backend_cpu_numa_get_strategy") == 0) {
636+
return (void *)ggml_numa_get_strategy;
637+
}
635638

636639
// threadpool - TODO: move to ggml-base
637640
if (strcmp(name, "ggml_threadpool_new") == 0) {

src/llama-mmap.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -313,15 +313,15 @@ struct llama_mmap::impl {
313313
// Bind current thread to the target NUMA node for first-touch
314314
struct bitmask* old_mask = numa_get_run_node_mask();
315315
if (numa_run_on_node(node) != 0) {
316-
LLAMA_LOG_DEBUG("NUMA MIRRORING: Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
316+
LLAMA_LOG_DEBUG("numa_mirroring Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
317317
// Continue anyway - might still work
318318
}
319319

320320
// Use posix_memalign for SIMD alignment
321321
void* ptr = nullptr;
322322
int ret = posix_memalign(&ptr, alignment, size);
323323
if (ret != 0) {
324-
LLAMA_LOG_DEBUG("NUMA MIRRORING: posix_memalign failed for %zu bytes with alignment %zu: %s\n",
324+
LLAMA_LOG_DEBUG("numa_mirroring posix_memalign failed for %zu bytes with alignment %zu: %s\n",
325325
size, alignment, strerror(ret));
326326
// Restore original thread binding
327327
if (old_mask) {
@@ -344,23 +344,23 @@ struct llama_mmap::impl {
344344
numa_free_nodemask(old_mask);
345345
}
346346

347-
LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
347+
LLAMA_LOG_DEBUG("numa_mirroring First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
348348
size, node, ptr, alignment);
349349
return ptr;
350350
}
351351

352352
void mmap_numa_mirror(struct llama_file * file) {
353353
int num_nodes = numa_num_configured_nodes();
354354
if (num_nodes <= 1) {
355-
throw std::runtime_error("NUMA MIRRORING: NUMA mirror mode requires multiple NUMA nodes");
355+
throw std::runtime_error("numa_mirroring NUMA mirror mode requires multiple NUMA nodes");
356356
}
357357

358-
LLAMA_LOG_DEBUG("NUMA MIRRORING: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
358+
LLAMA_LOG_INFO("numa_mirroring NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
359359
file->size() / (1024.0 * 1024.0), num_nodes);
360360

361361
size_t total_size = file->size();
362362
for (int node = 0; node < num_nodes; ++node) {
363-
LLAMA_LOG_DEBUG("NUMA MIRRORING: Allocating on node %d using first-touch approach\n", node);
363+
LLAMA_LOG_INFO("numa_mirroring Allocating on node %d \n", node);
364364

365365
void* node_mem = numa_alloc_first_touch(total_size, node);
366366
if (!node_mem) {
@@ -373,24 +373,24 @@ struct llama_mmap::impl {
373373
// VERIFICATION: Check that memory was actually allocated on the expected NUMA node
374374
int actual_node = -1;
375375
if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
376-
LLAMA_LOG_DEBUG("NUMA MIRRORING: Memory at %p allocated on node %d (expected %d)\n",
376+
LLAMA_LOG_DEBUG("numa_mirroring Memory at %p allocated on node %d (expected %d)\n",
377377
node_mem, actual_node, node);
378378
if (actual_node != node) {
379-
LLAMA_LOG_WARN("NUMA MIRRORING: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
379+
LLAMA_LOG_WARN("numa_mirroring WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
380380
node, actual_node);
381381
} else {
382-
LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch succeeded - memory correctly placed on node %d\n", node);
382+
LLAMA_LOG_DEBUG("numa_mirroring First-touch succeeded - memory correctly placed on node %d\n", node);
383383
}
384384
} else {
385-
LLAMA_LOG_WARN("NUMA MIRRORING: Could not verify allocation node for %p: %s\n",
385+
LLAMA_LOG_WARN("numa_mirroring Could not verify allocation node for %p: %s\n",
386386
node_mem, strerror(errno));
387387
}
388388

389389
file->seek(0, SEEK_SET);
390390
file->read_raw(node_mem, total_size);
391391
numa_mappings.push_back({node_mem, total_size});
392392

393-
LLAMA_LOG_DEBUG("NUMA MIRRORING: Successfully allocated and populated %.2f MB on node %d at %p\n",
393+
LLAMA_LOG_DEBUG("numa_mirroring Successfully allocated and populated %.2f MB on node %d at %p\n",
394394
total_size / (1024.0 * 1024.0), node, node_mem);
395395
}
396396
addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;

0 commit comments

Comments (0)