
Commit 9d66473

Merge branch 'numa-improvements-take2-iteration' into numa-improvements-take2
2 parents e60723d + 756fba6 commit 9d66473

1 file changed: 39 additions, 35 deletions

src/llama-model-loader.cpp

Lines changed: 39 additions & 35 deletions
@@ -885,9 +885,13 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
                 // Add empty entries for consistency
                 mlock_mmaps->emplace_back(nullptr);
             }
-            // Store the same unified mapping for each file index
-            mappings.emplace_back(i == 0 ? std::move(unified_mapping) :
-                                           std::unique_ptr<llama_mmap>(nullptr));
+            // Store the unified mapping only in the first slot
+            // Other slots remain nullptr - access code will check for unified mapping
+            if (i == 0) {
+                mappings.emplace_back(std::move(unified_mapping));
+            } else {
+                mappings.emplace_back(nullptr);
+            }
         }
     } else {
 #endif
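
To make the new storage layout concrete, here is a minimal, self-contained sketch of the pattern this hunk introduces. It uses a hypothetical fake_mmap stand-in rather than the real llama_mmap type: only slot 0 of the mappings vector owns the unified mapping, while the remaining per-file slots are kept as nullptr placeholders so indices still line up with the list of model files.

// Illustrative sketch only, with a hypothetical fake_mmap stand-in for llama_mmap:
// slot 0 owns the single unified mapping; the other per-file slots stay nullptr.
#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>

struct fake_mmap {
    size_t size;
};

int main() {
    const size_t n_files = 3;
    auto unified_mapping = std::make_unique<fake_mmap>(fake_mmap{4096});

    std::vector<std::unique_ptr<fake_mmap>> mappings;
    for (size_t i = 0; i < n_files; ++i) {
        if (i == 0) {
            mappings.emplace_back(std::move(unified_mapping));
        } else {
            mappings.emplace_back(nullptr);
        }
    }

    for (size_t i = 0; i < mappings.size(); ++i) {
        std::printf("slot %zu: %s\n", i, mappings[i] ? "unified mapping" : "nullptr");
    }
    return 0;
}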
@@ -917,17 +921,8 @@ void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void *
     GGML_ASSERT(!mappings.empty());

 #ifdef GGML_NUMA_MIRROR
-    // Check if this is a unified mapping (mapping[0] exists but others are null)
-    bool is_unified_mapping = mappings.size() > 1 && mappings[0];
-    // Verify it's truly unified by checking that all other mappings are null
-    if (is_unified_mapping) {
-        for (size_t i = 1; i < mappings.size(); ++i) {
-            if (mappings[i]) {
-                is_unified_mapping = false;
-                break;
-            }
-        }
-    }
+    // Check if this is a unified mapping by seeing if mappings[1] is null but mappings[0] exists
+    bool is_unified_mapping = mappings.size() > 1 && mappings[0] && !mappings[1];

     if (is_unified_mapping) {
         // For unified mapping, use the first (and only real) mapping
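
The simplified check relies on the storage layout above: with the unified scheme, slot 0 is non-null and slot 1 is a deliberate nullptr placeholder, so inspecting the first two entries is enough. A small illustrative sketch of the same predicate, factored into an is_unified helper that does not exist in llama.cpp, again with a hypothetical fake_mmap stand-in:

// Illustrative sketch only: the simplified detection used by this hunk.
#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>

struct fake_mmap {
    size_t size;
};

static bool is_unified(const std::vector<std::unique_ptr<fake_mmap>> & mappings) {
    // Unified layout: slot 0 holds the mapping, slot 1 is a nullptr placeholder.
    return mappings.size() > 1 && mappings[0] && !mappings[1];
}

int main() {
    std::vector<std::unique_ptr<fake_mmap>> unified;
    unified.emplace_back(new fake_mmap{4096});
    unified.emplace_back(nullptr);
    unified.emplace_back(nullptr);

    std::vector<std::unique_ptr<fake_mmap>> per_file;
    per_file.emplace_back(new fake_mmap{1024});
    per_file.emplace_back(new fake_mmap{2048});

    std::printf("unified layout:  %d\n", is_unified(unified));  // prints 1
    std::printf("per-file layout: %d\n", is_unified(per_file)); // prints 0
    return 0;
}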
@@ -986,12 +981,12 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {

     if (use_mmap) {
 #ifdef GGML_NUMA_MIRROR
-        // Check if this is a unified mapping (mapping[0] exists but others are null)
-        bool is_unified_mapping = mappings.size() > 1 && mappings[0];
-        // Verify it's truly unified by checking that all other mappings are null
+        // Check if this is a unified mapping by comparing if all mappings point to the same object
+        bool is_unified_mapping = mappings.size() > 1;
         if (is_unified_mapping) {
+            llama_mmap * first_ptr = mappings[0].get();
             for (size_t i = 1; i < mappings.size(); ++i) {
-                if (mappings[i]) {
+                if (mappings[i].get() != first_ptr) {
                     is_unified_mapping = false;
                     break;
                 }
@@ -1152,12 +1147,34 @@ bool llama_model_loader::load_all_data(
             size_t n_size = ggml_nbytes(cur);

             if (use_mmap) {
-                const auto & mapping = mappings.at(weight->idx);
+                // Check if this is a unified mapping and get the appropriate mapping
+                std::unique_ptr<llama_mmap> * mapping_ptr;
+                size_t file_offset = 0;
+
+#ifdef GGML_NUMA_MIRROR
+                // Check if this is a unified mapping by seeing if mappings[1] is null but mappings[0] exists
+                bool is_unified_mapping = mappings.size() > 1 && mappings[0] && !mappings[1];
+                if (is_unified_mapping) {
+                    // For unified mapping, always use mappings[0] and calculate the file offset
+                    mapping_ptr = &mappings[0];
+                    // Calculate offset for this file within the unified mapping
+                    for (int i = 0; i < weight->idx; ++i) {
+                        file_offset += files[i]->size();
+                    }
+                } else {
+                    // Standard per-file mapping
+                    mapping_ptr = &mappings.at(weight->idx);
+                }
+#else
+                mapping_ptr = &mappings.at(weight->idx);
+#endif
+
+                const auto & mapping = *mapping_ptr;
                 ggml_backend_buffer_t buf_mmap = nullptr;
                 if (bufs.count(weight->idx)) {
                     buf_mmap = bufs.at(weight->idx);
                 }
-                uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;
+                uint8_t * data = (uint8_t *) mapping->addr() + file_offset + weight->offs;

                 if (check_tensors) {
                     validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
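
The offset arithmetic in this hunk follows from the files being laid out back to back inside the single unified mapping: the start of file weight->idx is the sum of the sizes of all earlier files, and a tensor then lives at addr() + file_offset + weight->offs. A minimal sketch with assumed file sizes (the values are illustrative, not taken from any real model):

// Illustrative sketch only, with hypothetical file sizes: how this hunk derives
// a tensor's address inside the single unified mapping.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<size_t> file_sizes = {1000, 2000, 3000}; // hypothetical sizes
    const int    weight_idx  = 2;    // tensor stored in the third file
    const size_t weight_offs = 128;  // tensor offset within that file

    // Files are placed back to back, so sum the sizes of the preceding files.
    size_t file_offset = 0;
    for (int i = 0; i < weight_idx; ++i) {
        file_offset += file_sizes[i];
    }

    // With a real mapping this corresponds to:
    //   uint8_t * data = (uint8_t *) mapping->addr() + file_offset + weight->offs;
    std::printf("offset into unified mapping: %zu\n", file_offset + weight_offs);
    return 0;
}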
@@ -1250,16 +1267,8 @@ bool llama_model_loader::load_all_data(
     if (size_done >= size_data) {
         // unmap offloaded tensors and metadata
         if (use_mmap) {
-            // Check if this is a unified mapping (mapping[0] exists but others are null)
-            bool is_unified_mapping = mappings.size() > 1 && mappings[0];
-            if (is_unified_mapping) {
-                for (size_t i = 1; i < mappings.size(); ++i) {
-                    if (mappings[i]) {
-                        is_unified_mapping = false;
-                        break;
-                    }
-                }
-            }
+            // Check if this is a unified mapping by seeing if mappings[1] is null but mappings[0] exists
+            bool is_unified_mapping = mappings.size() > 1 && mappings[0] && !mappings[1];

             if (is_unified_mapping) {
                 // For unified mappings, skip unmap_fragment calls entirely
@@ -1271,11 +1280,6 @@ bool llama_model_loader::load_all_data(
                 const auto & mmap_used = mmaps_used.at(idx);
                 auto & mapping = mappings.at(idx);

-                // Skip null mappings
-                if (!mapping) {
-                    continue;
-                }
-
                 mapping->unmap_fragment(0, mmap_used.first);
                 if (mmap_used.second != 0) {
                     mapping->unmap_fragment(mmap_used.second, mapping->size());
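
The dropped null check appears to be unnecessary now that unified mappings skip this loop entirely; in the standard per-file path each slot holds a real mapping. The surrounding trimming logic is unchanged, and the sketch below (hypothetical fake_mmap stand-in for llama_mmap) shows that pattern: the loader tracks the byte range of each mapping that was actually used and unmaps everything before and after it.

// Illustrative sketch only: trim the unused head and tail of a mapping.
#include <cstddef>
#include <cstdio>
#include <utility>

struct fake_mmap {
    size_t len;
    size_t size() const { return len; }
    void unmap_fragment(size_t first, size_t last) {
        std::printf("unmap bytes [%zu, %zu)\n", first, last);
    }
};

int main() {
    fake_mmap mapping{4096};
    std::pair<size_t, size_t> mmap_used{512, 3072}; // range of bytes actually touched

    mapping.unmap_fragment(0, mmap_used.first);                    // release the unused head
    if (mmap_used.second != 0) {
        mapping.unmap_fragment(mmap_used.second, mapping.size());  // release the unused tail
    }
    return 0;
}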
