@@ -8,7 +8,7 @@ This document describes the NUMA (Non-Uniform Memory Access) mirroring implement
 
 On a 2-NUMA-node system testing with Qwen3-32B-Q6_K:
 
-Without numa mirroring:
+Without numa_mirroring:
 ```
 developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror$ cd /workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror && ./build-release/bin/llama-bench -m ../.devcontainer/Qwen3-32B-Q6_K.gguf
 | model | size | params | backend | threads | test | t/s |
@@ -17,7 +17,7 @@ developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror$
 | qwen3 32B Q6_K | 25.03 GiB | 32.76 B | CPU | 56 | tg128 | 1.91 ± 0.00 |
 ```
 
-With numa mirroring:
+With numa_mirroring:
 ```
 developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev$ ./build/bin/llama-bench -m .
 /.devcontainer/Qwen3-32B-Q6_K.gguf --numa mirror
@@ -118,7 +118,7 @@ cmake --build build --parallel
 ### Command Line Usage
 ```bash
 # Enable NUMA mirroring for inference
-./llama-cli -m model.gguf --numa mirror -p "Hello world"
+./llama-cli -m model.gguf --numa mirror -p "Hello world" -no-cnv
 
 # Benchmark with NUMA mirroring
 ./llama-bench -m model.gguf --numa mirror
@@ -308,15 +308,15 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
     // Bind current thread to the target NUMA node for first-touch
     struct bitmask* old_mask = numa_get_run_node_mask();
     if (numa_run_on_node(node) != 0) {
-        LLAMA_LOG_DEBUG("NUMA MIRRORING: Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
+        LLAMA_LOG_DEBUG("numa_mirroring: Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
         // Continue anyway - might still work
     }
 
     // Use posix_memalign for SIMD alignment
     void* ptr = nullptr;
     int ret = posix_memalign(&ptr, alignment, size);
     if (ret != 0) {
-        LLAMA_LOG_DEBUG("NUMA MIRRORING: posix_memalign failed for %zu bytes with alignment %zu: %s\n",
+        LLAMA_LOG_DEBUG("numa_mirroring: posix_memalign failed for %zu bytes with alignment %zu: %s\n",
                         size, alignment, strerror(ret));
         // Restore original thread binding
         if (old_mask) {
@@ -339,23 +339,23 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
         numa_free_nodemask(old_mask);
     }
 
-    LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
+    LLAMA_LOG_DEBUG("numa_mirroring: First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
                     size, node, ptr, alignment);
     return ptr;
 }
 
 void mmap_numa_mirror(struct llama_file * file) {
     int num_nodes = numa_num_configured_nodes();
     if (num_nodes <= 1) {
-        throw std::runtime_error("NUMA MIRRORING: NUMA mirror mode requires multiple NUMA nodes");
+        throw std::runtime_error("numa_mirroring: NUMA mirror mode requires multiple NUMA nodes");
     }
 
-    LLAMA_LOG_DEBUG("NUMA MIRRORING: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
+    LLAMA_LOG_INFO("numa_mirroring: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
                    file->size() / (1024.0 * 1024.0), num_nodes);
 
     size_t total_size = file->size();
     for (int node = 0; node < num_nodes; ++node) {
-        LLAMA_LOG_DEBUG("NUMA MIRRORING: Allocating on node %d using first-touch approach\n", node);
+        LLAMA_LOG_INFO("numa_mirroring: Allocating on node %d\n", node);
 
         void* node_mem = numa_alloc_first_touch(total_size, node);
         if (!node_mem) {
@@ -368,24 +368,24 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
         // VERIFICATION: Check that memory was actually allocated on the expected NUMA node
         int actual_node = -1;
         if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
-            LLAMA_LOG_DEBUG("NUMA MIRRORING: Memory at %p allocated on node %d (expected %d)\n",
+            LLAMA_LOG_DEBUG("numa_mirroring: Memory at %p allocated on node %d (expected %d)\n",
                             node_mem, actual_node, node);
             if (actual_node != node) {
-                LLAMA_LOG_WARN("NUMA MIRRORING: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
+                LLAMA_LOG_WARN("numa_mirroring: Memory allocated on wrong node! Expected %d, got %d\n",
                                node, actual_node);
             } else {
-                LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch succeeded - memory correctly placed on node %d\n", node);
+                LLAMA_LOG_DEBUG("numa_mirroring: First-touch succeeded - memory correctly placed on node %d\n", node);
             }
         } else {
-            LLAMA_LOG_WARN("NUMA MIRRORING: Could not verify allocation node for %p: %s\n",
+            LLAMA_LOG_WARN("numa_mirroring: Could not verify allocation node for %p: %s\n",
                            node_mem, strerror(errno));
         }
 
         file->seek(0, SEEK_SET);
         file->read_raw(node_mem, total_size);
         numa_mappings.push_back({node_mem, total_size});
 
-        LLAMA_LOG_DEBUG("NUMA MIRRORING: Successfully allocated and populated %.2f MB on node %d at %p\n",
+        LLAMA_LOG_DEBUG("numa_mirroring: Successfully allocated and populated %.2f MB on node %d at %p\n",
                         total_size / (1024.0 * 1024.0), node, node_mem);
     }
     addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
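
The hunks above cover only the allocation side: one full copy of the weights per node, placed by first-touch. The payoff comes at inference time, when each worker thread reads from its own socket's copy instead of pulling tensor data across the interconnect. Below is a minimal sketch (not part of the patch) of how a compute thread could pick its node-local mirror; the `numa_mapping` struct layout and the `weights_for_current_thread()` helper are hypothetical names for illustration, and only the `numa_mappings` container actually appears in the code above.

```cpp
// Illustrative sketch only: the helper name and struct layout are assumptions,
// not the patch's actual API. Build with -lnuma.
#include <numa.h>    // numa_node_of_cpu()
#include <sched.h>   // sched_getcpu()
#include <cstddef>
#include <vector>

struct numa_mapping {
    void * addr;     // base address of this node's copy of the model weights
    size_t size;
};

// One entry per NUMA node, filled by mmap_numa_mirror() at model load time.
static std::vector<numa_mapping> numa_mappings;

// Return the weight base pointer local to the calling thread's NUMA node,
// falling back to the node-0 copy if the node cannot be determined.
static void * weights_for_current_thread() {
    int node = numa_node_of_cpu(sched_getcpu());
    if (node < 0 || (size_t) node >= numa_mappings.size()) {
        return numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
    }
    return numa_mappings[node].addr;  // node-local copy: no cross-socket reads
}
```
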
@@ -424,7 +424,7 @@ There are models you can use for testing in our .devcontainer folder:
 
 Use qwen2.5-0.5b-instruct-q8_0.gguf for a quick verification run; a bigger, dense model like Qwen3-32B-Q6_K.gguf is better for measuring relative speed gains.
 
-If testing with `llama-cli`, always be sure to use the `--no-cnv` switch to prevent it from starting an interactive conversation.
+If testing with `llama-cli`, always be sure to use the `-no-cnv` switch to prevent it from starting an interactive conversation.
 
 
 ### System Requirements Check
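
Before running with `--numa mirror`, it is worth confirming that the machine actually exposes more than one NUMA node. A rough standalone probe (a sketch, not part of the repository; it assumes libnuma and its development headers are installed) that mirrors the `num_nodes <= 1` guard in `mmap_numa_mirror()`:

```cpp
// check_numa.cpp - minimal probe for the prerequisites mmap_numa_mirror()
// relies on: libnuma must be usable and the machine must expose >1 NUMA node.
// Illustrative only; build with: g++ check_numa.cpp -o check_numa -lnuma
#include <numa.h>
#include <cstdio>

int main() {
    if (numa_available() < 0) {
        std::fprintf(stderr, "NUMA is not available on this system\n");
        return 1;
    }
    int nodes = numa_num_configured_nodes();
    std::printf("configured NUMA nodes: %d\n", nodes);
    if (nodes <= 1) {
        std::fprintf(stderr, "numa_mirroring needs at least 2 nodes\n");
        return 1;
    }
    return 0;
}
```
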