all tensors we load in llama-model-loader.cpp are model weights.

dbsanfte · dbsanfte · commit 4da24f78f9b7 · 2025-09-15T20:26:34.000Z
Make logging prettier
diff --git a/.github/instructions/numa-mirroring-implementation.md b/.github/instructions/numa-mirroring-implementation.md
@@ -308,15 +308,15 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
         // Bind current thread to the target NUMA node for first-touch
         struct bitmask* old_mask = numa_get_run_node_mask();
         if (numa_run_on_node(node) != 0) {
-            LLAMA_LOG_DEBUG("numa_mirroring Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
+            LLAMA_LOG_DEBUG("numa_mirroring: Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
             // Continue anyway - might still work
         }
         
         // Use posix_memalign for SIMD alignment
         void* ptr = nullptr;
         int ret = posix_memalign(&ptr, alignment, size);
         if (ret != 0) {
-            LLAMA_LOG_DEBUG("numa_mirroring posix_memalign failed for %zu bytes with alignment %zu: %s\n", 
+            LLAMA_LOG_DEBUG("numa_mirroring: posix_memalign failed for %zu bytes with alignment %zu: %s\n", 
                            size, alignment, strerror(ret));
             // Restore original thread binding
             if (old_mask) {
@@ -339,23 +339,23 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
             numa_free_nodemask(old_mask);
         }
         
-        LLAMA_LOG_DEBUG("numa_mirroring First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n", 
+        LLAMA_LOG_DEBUG("numa_mirroring: First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n", 
                        size, node, ptr, alignment);
         return ptr;
     }
 
     void mmap_numa_mirror(struct llama_file * file) {
         int num_nodes = numa_num_configured_nodes();
         if (num_nodes <= 1) {
-            throw std::runtime_error("numa_mirroring NUMA mirror mode requires multiple NUMA nodes");
+            throw std::runtime_error("numa_mirroring: NUMA mirror mode requires multiple NUMA nodes");
         }
         
-        LLAMA_LOG_INFO("numa_mirroring NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n", 
+        LLAMA_LOG_INFO("numa_mirroring: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n", 
                 file->size() / (1024.0 * 1024.0), num_nodes);
         
         size_t total_size = file->size();
         for (int node = 0; node < num_nodes; ++node) {
-            LLAMA_LOG_INFO("numa_mirroring Allocating on node %d \n", node);
+            LLAMA_LOG_INFO("numa_mirroring: Allocating on node %d \n", node);
             
             void* node_mem = numa_alloc_first_touch(total_size, node);
             if (!node_mem) {
@@ -368,24 +368,24 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
             // VERIFICATION: Check that memory was actually allocated on the expected NUMA node
             int actual_node = -1;
             if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
-                LLAMA_LOG_DEBUG("numa_mirroring Memory at %p allocated on node %d (expected %d)\n", 
+                LLAMA_LOG_DEBUG("numa_mirroring: Memory at %p allocated on node %d (expected %d)\n", 
                                node_mem, actual_node, node);
                 if (actual_node != node) {
-                    LLAMA_LOG_WARN("numa_mirroring WARNING: Memory allocated on wrong node! Expected %d, got %d\n", 
+                    LLAMA_LOG_WARN("numa_mirroring: WARNING: Memory allocated on wrong node! Expected %d, got %d\n", 
                                    node, actual_node);
                 } else {
-                    LLAMA_LOG_DEBUG("numa_mirroring First-touch succeeded - memory correctly placed on node %d\n", node);
+                    LLAMA_LOG_DEBUG("numa_mirroring: First-touch succeeded - memory correctly placed on node %d\n", node);
                 }
             } else {
-                LLAMA_LOG_WARN("numa_mirroring Could not verify allocation node for %p: %s\n", 
+                LLAMA_LOG_WARN("numa_mirroring: Could not verify allocation node for %p: %s\n", 
                                node_mem, strerror(errno));
             }
             
             file->seek(0, SEEK_SET);
             file->read_raw(node_mem, total_size);
             numa_mappings.push_back({node_mem, total_size});
 
-            LLAMA_LOG_DEBUG("numa_mirroring Successfully allocated and populated %.2f MB on node %d at %p\n",
+            LLAMA_LOG_DEBUG("numa_mirroring: Successfully allocated and populated %.2f MB on node %d at %p\n",
                            total_size / (1024.0 * 1024.0), node, node_mem);
         }
         addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
@@ -394,15 +394,15 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
     void mmap_numa_mirror(struct llama_file * file) {
         int num_nodes = numa_num_configured_nodes();
         if (num_nodes <= 1) {
-            throw std::runtime_error("numa_mirroring NUMA mirror mode requires multiple NUMA nodes");
+            throw std::runtime_error("numa_mirroring: NUMA mirror mode requires multiple NUMA nodes");
         }
         
-        LLAMA_LOG_INFO("numa_mirroring NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n", 
+        LLAMA_LOG_INFO("numa_mirroring: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n", 
                 file->size() / (1024.0 * 1024.0), num_nodes);
         
         size_t total_size = file->size();
         for (int node = 0; node < num_nodes; ++node) {
-            LLAMA_LOG_INFO("numa_mirroring Allocating on node %d \n", node);
+            LLAMA_LOG_INFO("numa_mirroring: Allocating on node %d \n", node);
             
             void* node_mem = numa_alloc_first_touch(total_size, node);
             if (!node_mem) {
@@ -415,24 +415,24 @@ In `llama-mmap.cpp`: First-touch allocation at model weight loading time
             // VERIFICATION: Check that memory was actually allocated on the expected NUMA node
             int actual_node = -1;
             if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
-                LLAMA_LOG_DEBUG("numa_mirroring Memory at %p allocated on node %d (expected %d)\n", 
+                LLAMA_LOG_DEBUG("numa_mirroring: Memory at %p allocated on node %d (expected %d)\n", 
                                node_mem, actual_node, node);
                 if (actual_node != node) {
-                    LLAMA_LOG_WARN("numa_mirroring WARNING: Memory allocated on wrong node! Expected %d, got %d\n", 
+                    LLAMA_LOG_WARN("numa_mirroring: WARNING: Memory allocated on wrong node! Expected %d, got %d\n", 
                                    node, actual_node);
                 } else {
-                    LLAMA_LOG_DEBUG("numa_mirroring First-touch succeeded - memory correctly placed on node %d\n", node);
+                    LLAMA_LOG_DEBUG("numa_mirroring: First-touch succeeded - memory correctly placed on node %d\n", node);
                 }
             } else {
-                LLAMA_LOG_WARN("numa_mirroring Could not verify allocation node for %p: %s\n", 
+                LLAMA_LOG_WARN("numa_mirroring: Could not verify allocation node for %p: %s\n", 
                                node_mem, strerror(errno));
             }
             
             file->seek(0, SEEK_SET);
             file->read_raw(node_mem, total_size);
             numa_mappings.push_back({node_mem, total_size});
 
-            LLAMA_LOG_DEBUG("numa_mirroring Successfully allocated and populated %.2f MB on node %d at %p\n",
+            LLAMA_LOG_DEBUG("numa_mirroring: Successfully allocated and populated %.2f MB on node %d at %p\n",
                            total_size / (1024.0 * 1024.0), node, node_mem);
         }
         addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
@@ -313,15 +313,15 @@ struct llama_mmap::impl {
         // Bind current thread to the target NUMA node for first-touch
         struct bitmask* old_mask = numa_get_run_node_mask();
         if (numa_run_on_node(node) != 0) {
-            LLAMA_LOG_DEBUG("numa_mirroring Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
+            LLAMA_LOG_DEBUG("numa_mirroring: Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
             // Continue anyway - might still work
         }
         
         // Use posix_memalign for SIMD alignment
         void* ptr = nullptr;
         int ret = posix_memalign(&ptr, alignment, size);
         if (ret != 0) {
-            LLAMA_LOG_DEBUG("numa_mirroring posix_memalign failed for %zu bytes with alignment %zu: %s\n", 
+            LLAMA_LOG_DEBUG("numa_mirroring: posix_memalign failed for %zu bytes with alignment %zu: %s\n", 
                            size, alignment, strerror(ret));
             // Restore original thread binding
             if (old_mask) {
@@ -344,23 +344,23 @@ struct llama_mmap::impl {
             numa_free_nodemask(old_mask);
         }
         
-        LLAMA_LOG_DEBUG("numa_mirroring First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n", 
+        LLAMA_LOG_DEBUG("numa_mirroring: First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n", 
                        size, node, ptr, alignment);
         return ptr;
     }
 
     void mmap_numa_mirror(struct llama_file * file) {
         int num_nodes = numa_num_configured_nodes();
         if (num_nodes <= 1) {
-            throw std::runtime_error("numa_mirroring NUMA mirror mode requires multiple NUMA nodes");
+            throw std::runtime_error("numa_mirroring: NUMA mirror mode requires multiple NUMA nodes");
         }
         
-        LLAMA_LOG_INFO("numa_mirroring NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n", 
+        LLAMA_LOG_INFO("numa_mirroring: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n", 
                 file->size() / (1024.0 * 1024.0), num_nodes);
         
         size_t total_size = file->size();
         for (int node = 0; node < num_nodes; ++node) {
-            LLAMA_LOG_INFO("numa_mirroring Allocating on node %d \n", node);
+            LLAMA_LOG_INFO("numa_mirroring: Allocating on node %d \n", node);
             
             void* node_mem = numa_alloc_first_touch(total_size, node);
             if (!node_mem) {
@@ -373,24 +373,24 @@ struct llama_mmap::impl {
             // VERIFICATION: Check that memory was actually allocated on the expected NUMA node
             int actual_node = -1;
             if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
-                LLAMA_LOG_DEBUG("numa_mirroring Memory at %p allocated on node %d (expected %d)\n", 
+                LLAMA_LOG_DEBUG("numa_mirroring: Memory at %p allocated on node %d (expected %d)\n", 
                                node_mem, actual_node, node);
                 if (actual_node != node) {
-                    LLAMA_LOG_WARN("numa_mirroring WARNING: Memory allocated on wrong node! Expected %d, got %d\n", 
+                    LLAMA_LOG_WARN("numa_mirroring: WARNING: Memory allocated on wrong node! Expected %d, got %d\n", 
                                    node, actual_node);
                 } else {
-                    LLAMA_LOG_DEBUG("numa_mirroring First-touch succeeded - memory correctly placed on node %d\n", node);
+                    LLAMA_LOG_DEBUG("numa_mirroring: First-touch succeeded - memory correctly placed on node %d\n", node);
                 }
             } else {
-                LLAMA_LOG_WARN("numa_mirroring Could not verify allocation node for %p: %s\n", 
+                LLAMA_LOG_WARN("numa_mirroring: Could not verify allocation node for %p: %s\n", 
                                node_mem, strerror(errno));
             }
             
             file->seek(0, SEEK_SET);
             file->read_raw(node_mem, total_size);
             numa_mappings.push_back({node_mem, total_size});
 
-            LLAMA_LOG_DEBUG("numa_mirroring Successfully allocated and populated %.2f MB on node %d at %p\n",
+            LLAMA_LOG_DEBUG("numa_mirroring: Successfully allocated and populated %.2f MB on node %d at %p\n",
                            total_size / (1024.0 * 1024.0), node, node_mem);
         }
         addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp