@@ -8,7 +8,7 @@ This document describes the NUMA (Non-Uniform Memory Access) mirroring implement
 
 On a 2-NUMA-node system testing with Qwen3-32B-Q6_K:
 
-Without numa mirroring:
+Without numa_mirroring:
 ```
 developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror$ cd /workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror && ./build-release/bin/llama-bench -m ../.devcontainer/Qwen3-32B-Q6_K.gguf
 | model | size | params | backend | threads | test | t/s |
@@ -17,7 +17,7 @@ developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror$
 | qwen3 32B Q6_K | 25.03 GiB | 32.76 B | CPU | 56 | tg128 | 1.91 ± 0.00 |
 ```
 
-With numa mirroring:
+With numa_mirroring:
 ```
 developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev$ ./build/bin/llama-bench -m .
 /.devcontainer/Qwen3-32B-Q6_K.gguf --numa mirror
@@ -118,7 +118,7 @@ cmake --build build --parallel
 ### Command Line Usage
 ```bash
 # Enable NUMA mirroring for inference
-./llama-cli -m model.gguf --numa mirror -p "Hello world"
+./llama-cli -m model.gguf --numa mirror -p "Hello world" -no-cnv
 
 # Benchmark with NUMA mirroring
 ./llama-bench -m model.gguf --numa mirror
@@ -308,15 +308,15 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
     // Bind current thread to the target NUMA node for first-touch
     struct bitmask* old_mask = numa_get_run_node_mask();
     if (numa_run_on_node(node) != 0) {
-        LLAMA_LOG_DEBUG("NUMA MIRRORING: Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
+        LLAMA_LOG_DEBUG("numa_mirroring: Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
         // Continue anyway - might still work
     }
 
     // Use posix_memalign for SIMD alignment
     void* ptr = nullptr;
     int ret = posix_memalign(&ptr, alignment, size);
     if (ret != 0) {
-        LLAMA_LOG_DEBUG("NUMA MIRRORING: posix_memalign failed for %zu bytes with alignment %zu: %s\n",
+        LLAMA_LOG_DEBUG("numa_mirroring: posix_memalign failed for %zu bytes with alignment %zu: %s\n",
                         size, alignment, strerror(ret));
         // Restore original thread binding
         if (old_mask) {
@@ -339,23 +339,23 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
         numa_free_nodemask(old_mask);
     }
 
-    LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
+    LLAMA_LOG_DEBUG("numa_mirroring: First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
                     size, node, ptr, alignment);
     return ptr;
 }
 
 void mmap_numa_mirror(struct llama_file * file) {
     int num_nodes = numa_num_configured_nodes();
     if (num_nodes <= 1) {
-        throw std::runtime_error("NUMA MIRRORING: NUMA mirror mode requires multiple NUMA nodes");
+        throw std::runtime_error("numa_mirroring: NUMA mirror mode requires multiple NUMA nodes");
     }
 
-    LLAMA_LOG_DEBUG("NUMA MIRRORING: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
+    LLAMA_LOG_INFO("numa_mirroring: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
                    file->size() / (1024.0 * 1024.0), num_nodes);
 
     size_t total_size = file->size();
     for (int node = 0; node < num_nodes; ++node) {
-        LLAMA_LOG_DEBUG("NUMA MIRRORING: Allocating on node %d using first-touch approach\n", node);
+        LLAMA_LOG_INFO("numa_mirroring: Allocating on node %d\n", node);
 
         void* node_mem = numa_alloc_first_touch(total_size, node);
         if (!node_mem) {
@@ -368,24 +368,24 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
         // VERIFICATION: Check that memory was actually allocated on the expected NUMA node
         int actual_node = -1;
         if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
-            LLAMA_LOG_DEBUG("NUMA MIRRORING: Memory at %p allocated on node %d (expected %d)\n",
+            LLAMA_LOG_DEBUG("numa_mirroring: Memory at %p allocated on node %d (expected %d)\n",
                             node_mem, actual_node, node);
             if (actual_node != node) {
-                LLAMA_LOG_WARN("NUMA MIRRORING: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
+                LLAMA_LOG_WARN("numa_mirroring: Memory allocated on wrong node! Expected %d, got %d\n",
                                node, actual_node);
             } else {
-                LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch succeeded - memory correctly placed on node %d\n", node);
+                LLAMA_LOG_DEBUG("numa_mirroring: First-touch succeeded - memory correctly placed on node %d\n", node);
             }
         } else {
-            LLAMA_LOG_WARN("NUMA MIRRORING: Could not verify allocation node for %p: %s\n",
+            LLAMA_LOG_WARN("numa_mirroring: Could not verify allocation node for %p: %s\n",
                            node_mem, strerror(errno));
         }
 
         file->seek(0, SEEK_SET);
         file->read_raw(node_mem, total_size);
         numa_mappings.push_back({node_mem, total_size});
 
-        LLAMA_LOG_DEBUG("NUMA MIRRORING: Successfully allocated and populated %.2f MB on node %d at %p\n",
+        LLAMA_LOG_DEBUG("numa_mirroring: Successfully allocated and populated %.2f MB on node %d at %p\n",
                         total_size / (1024.0 * 1024.0), node, node_mem);
     }
     addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
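
The hunks above cover only the allocation side: one full copy of the weights per node, placed by first-touch. The payoff comes at inference time, when each worker thread reads from its own socket's copy instead of pulling tensor data across the interconnect. Below is a minimal sketch (not part of the patch) of how a compute thread could pick its node-local mirror; the `numa_mapping` struct layout and the `weights_for_current_thread()` helper are hypothetical names for illustration, and only the `numa_mappings` container actually appears in the code above.

```cpp
// Illustrative sketch only: the helper name and struct layout are assumptions,
// not the patch's actual API. Build with -lnuma.
#include <numa.h>    // numa_node_of_cpu()
#include <sched.h>   // sched_getcpu()
#include <cstddef>
#include <vector>

struct numa_mapping {
    void * addr;     // base address of this node's copy of the model weights
    size_t size;
};

// One entry per NUMA node, filled by mmap_numa_mirror() at model load time.
static std::vector<numa_mapping> numa_mappings;

// Return the weight base pointer local to the calling thread's NUMA node,
// falling back to the node-0 copy if the node cannot be determined.
static void * weights_for_current_thread() {
    int node = numa_node_of_cpu(sched_getcpu());
    if (node < 0 || (size_t) node >= numa_mappings.size()) {
        return numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
    }
    return numa_mappings[node].addr;  // node-local copy: no cross-socket reads
}
```
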
@@ -424,7 +424,7 @@ There are models you can use for testing in our .devcontainer folder:
 
 Use qwen2.5-0.5b-instruct-q8_0.gguf for a quick verification run; a bigger, dense model like Qwen3-32B-Q6_K.gguf is better for measuring relative speed gains.
 
-If testing with `llama-cli`, always be sure to use the `--no-cnv` switch to prevent it from starting an interactive conversation.
+If testing with `llama-cli`, always be sure to use the `-no-cnv` switch to prevent it from starting an interactive conversation.
 
 
 ### System Requirements Check
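
Before running with `--numa mirror`, it is worth confirming that the machine actually exposes more than one NUMA node. A rough standalone probe (a sketch, not part of the repository; it assumes libnuma and its development headers are installed) that mirrors the `num_nodes <= 1` guard in `mmap_numa_mirror()`:

```cpp
// check_numa.cpp - minimal probe for the prerequisites mmap_numa_mirror()
// relies on: libnuma must be usable and the machine must expose >1 NUMA node.
// Illustrative only; build with: g++ check_numa.cpp -o check_numa -lnuma
#include <numa.h>
#include <cstdio>

int main() {
    if (numa_available() < 0) {
        std::fprintf(stderr, "NUMA is not available on this system\n");
        return 1;
    }
    int nodes = numa_num_configured_nodes();
    std::printf("configured NUMA nodes: %d\n", nodes);
    if (nodes <= 1) {
        std::fprintf(stderr, "numa_mirroring needs at least 2 nodes\n");
        return 1;
    }
    return 0;
}
```
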