
Commit 9324c76

Refactored llama-model to adopt upstream's unique_ptr approach in place of raw pointers
1 parent bcdd5ca


3 files changed (+29, -17 lines)


src/llama-model-load.h

Lines changed: 5 additions & 4 deletions
@@ -96,16 +96,17 @@ struct IncrementalSplitsTensorLoad {
                 throw std::runtime_error("failed to create ggml context for split-file");
             }
 
-            ctx_split_map[key] = ctx;
-            model_impl->ctxs.emplace_back(ctx);
+            ctx_split_map[key] = ggml_context_ptr(ctx);
+            // Contexts are cleaned up when create_split_backend_buffers is called
+            // Review: this will be an issue if this ctx_split_map is used after create_split_backend_buffers is called
 
             return ctx;
         }
-        return it->second;
+        return it->second.get();
     }
 
     // public so that it can be processed by the backend storage allocator
-    std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> ctx_split_map;
+    std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context_ptr> ctx_split_map;
 
 private:
     struct TensorInfo {
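The change above makes ctx_split_map the owner of its contexts (ggml_context_ptr is ggml's unique_ptr alias whose deleter frees the context), while lookups keep returning raw, non-owning pointers through .get(). A minimal, self-contained sketch of that ownership pattern, using hypothetical stand-in types (dummy_ctx, ctx_for) instead of the real ggml handles:

#include <cstdint>
#include <cstdio>
#include <map>
#include <memory>
#include <utility>

// Stand-ins for the real ggml handles (ggml_context, ggml_backend_buffer_type_t).
struct dummy_ctx { int id; };
using dummy_ctx_ptr = std::unique_ptr<dummy_ctx>;   // plays the role of ggml_context_ptr

// key = (buffer type, split index); the map is the single owner of every context
static std::map<std::pair<int, uint16_t>, dummy_ctx_ptr> ctx_split_map;

// Mirrors the header's lookup: create on first use, otherwise borrow the existing context.
static dummy_ctx * ctx_for(int buft, uint16_t split_idx) {
    const auto key = std::make_pair(buft, split_idx);
    auto it = ctx_split_map.find(key);
    if (it == ctx_split_map.end()) {
        auto owned = std::make_unique<dummy_ctx>();
        owned->id = buft*100 + split_idx;
        dummy_ctx * raw = owned.get();
        ctx_split_map[key] = std::move(owned);   // map takes ownership
        return raw;                              // caller only borrows
    }
    return it->second.get();                     // existing entry: hand out a borrowed pointer
}

int main() {
    dummy_ctx * a = ctx_for(1, 0);
    dummy_ctx * b = ctx_for(1, 0);               // same key -> same context
    std::printf("same context: %s\n", a == b ? "yes" : "no");
    // everything is freed automatically when ctx_split_map is destroyed
}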

src/llama-model.cpp

Lines changed: 14 additions & 11 deletions
@@ -2284,12 +2284,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     max_n_tensors += n_layer*2; // duplicated rope freq tensors
     const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
 
-    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
-    struct ggml_backend_buft_comparator {
-        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
-            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
-        }
-    };
     std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
 
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
@@ -6354,14 +6348,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 }
 
 bool llama_model::create_split_backend_buffers(
-        const uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> & ctx_split_map,
+        const uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context_ptr> & ctx_split_map,
         llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock, const int32_t n_gpu_layers) {
     // Extract contexts for the given split index from ctx_split_map into a new map
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    for (const auto & [buft_split_idx, ctx] : ctx_split_map) {
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+    for (auto it = ctx_split_map.begin(); it != ctx_split_map.end();) {
+        const auto & [buft_split_idx, ctx_ptr] = *it;
         const auto & [buft, split_idx] = buft_split_idx;
         if (split_idx == idx) {
-            ctx_map[buft] = ctx;
+            // Move the context from ctx_split_map to ctx_map
+            ctx_map[buft] = std::move(it->second);
+            // Remove from ctx_split_map since ownership has been transferred
+            it = ctx_split_map.erase(it);
+        } else {
+            ++it;
         }
     }
 
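The rewritten loop above turns what used to be a pointer copy into an ownership transfer: entries matching the requested split index are moved into the per-split ctx_map and erased from ctx_split_map in the same pass, so the source map never holds a moved-from pointer. A small stand-alone sketch of the move-and-erase idiom, with placeholder types (ctx_ptr over std::string) rather than the real ggml ones:

#include <cstdint>
#include <cstdio>
#include <map>
#include <memory>
#include <string>
#include <utility>

using ctx_ptr = std::unique_ptr<std::string>;   // stand-in for ggml_context_ptr

int main() {
    // key = (buffer type id, split file index) -> owning context pointer
    std::map<std::pair<int, uint16_t>, ctx_ptr> ctx_split_map;
    ctx_split_map[{1, 0}] = std::make_unique<std::string>("cpu/split0");
    ctx_split_map[{1, 1}] = std::make_unique<std::string>("cpu/split1");
    ctx_split_map[{2, 1}] = std::make_unique<std::string>("gpu/split1");

    const uint16_t idx = 1;                      // split we are about to allocate buffers for
    std::map<int, ctx_ptr> ctx_map;              // per-split map handed to buffer creation

    // Move ownership of every context belonging to split `idx` and erase the source entry;
    // map::erase returns the next valid iterator, so the loop stays well-defined.
    for (auto it = ctx_split_map.begin(); it != ctx_split_map.end(); ) {
        const auto & [buft, split_idx] = it->first;
        if (split_idx == idx) {
            ctx_map[buft] = std::move(it->second);
            it = ctx_split_map.erase(it);
        } else {
            ++it;
        }
    }

    std::printf("moved %zu contexts, %zu left for other splits\n",
                ctx_map.size(), ctx_split_map.size());
}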
@@ -6370,12 +6370,15 @@ bool llama_model::create_split_backend_buffers(
     constexpr bool do_print_backend_buffers_info = false;
     const bool creation_success = create_backend_buffers(split_data_size, ctx_map, ml, use_mmap_buffer, use_mlock,
                                                           n_gpu_layers, do_print_backend_buffers_info);
+
+    // Note: create_backend_buffers moves the contexts into ctxs_bufs, taking ownership
+    // The contexts in ctx_map are now empty after the move, which is expected
 
     return creation_success;
 }
 
 bool llama_model::create_backend_buffers(std::size_t size_data,
-        const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
+        std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> & ctx_map,
         llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock,
         const int32_t n_gpu_layers, bool do_print_backend_buffers_info) {
     // create the backend buffers
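Per the note added above, ctx_map is now consumed rather than just read: create_backend_buffers takes it by non-const reference and moves each context into the model's own storage (ctxs_bufs), leaving the caller's entries empty after the call. A hedged sketch of a sink function with that shape, using hypothetical placeholder types and names (create_buffers, owned_ctxs), not the real signature:

#include <cstdio>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

using ctx_ptr = std::unique_ptr<std::string>;    // stand-in for ggml_context_ptr

// The callee takes the map by non-const reference and moves the contexts into
// storage it owns; afterwards the caller's map still has its keys, but the
// mapped pointers are null (moved-from), which is the expected state.
static bool create_buffers(std::map<int, ctx_ptr> & ctx_map,
                           std::vector<ctx_ptr> & owned_ctxs) {
    for (auto & [buft, ctx] : ctx_map) {
        (void) buft;                             // a real version would allocate a buffer per buft
        owned_ctxs.push_back(std::move(ctx));    // ownership moves to the long-lived container
    }
    return true;
}

int main() {
    std::map<int, ctx_ptr> ctx_map;
    ctx_map[0] = std::make_unique<std::string>("ctx for CPU");
    ctx_map[1] = std::make_unique<std::string>("ctx for GPU");

    std::vector<ctx_ptr> owned_ctxs;             // plays the role of ctxs_bufs
    create_buffers(ctx_map, owned_ctxs);

    std::printf("owned: %zu, caller entries now empty: %s\n",
                owned_ctxs.size(), ctx_map[0] == nullptr ? "yes" : "no");
}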

src/llama-model.h

Lines changed: 10 additions & 2 deletions
@@ -8,6 +8,7 @@
 #include "llama-vocab.h"
 
 #include <cstdint>
+#include <cstring>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -408,6 +409,13 @@ struct llama_layer {
     struct llama_layer_nextn nextn;
 };
 
+// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+struct ggml_backend_buft_comparator {
+    bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+        return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+    }
+};
+
 struct llama_model {
     llm_type type = LLM_TYPE_UNKNOWN;
     llm_arch arch = LLM_ARCH_UNKNOWN;
@@ -475,13 +483,13 @@ struct llama_model {
 
     /// @brief Create backend buffers for all tensors
     bool create_backend_buffers(std::size_t size_data,
-            const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
+            std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> & ctx_map,
             llama_model_loader & ml, bool use_mmap_buffer, bool use_mlock, int32_t n_gpu_layers,
             bool do_print_backend_buffers_info = true);
 
     /// @brief Create backend buffers for tensors on a split file idenfified by `idx`. Removes the split from the map.
     bool create_split_backend_buffers(
-            uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> & ctx_split_map,
+            uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context_ptr> & ctx_split_map,
             llama_model_loader & ml, bool use_mmap_buffer, bool use_mlock, int32_t n_gpu_layers);
 
     void print_backend_buffers_info(int32_t n_gpu_layers);
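The comparator relocated to this header orders map keys by the buffer type's name (ggml_backend_buft_name) instead of by pointer value, so iteration over ctx_map is deterministic across runs. A minimal illustration of a name-ordered std::map, using a hypothetical stand-in handle (dummy_buft) in place of ggml_backend_buffer_type_t:

#include <cstdio>
#include <cstring>
#include <map>

// Stand-in for ggml_backend_buffer_type_t: an opaque handle whose address is not
// stable across runs, so the map orders entries by the backend's name instead.
struct dummy_buft { const char * name; };
static const char * dummy_buft_name(const dummy_buft * b) { return b->name; }

struct buft_name_comparator {
    bool operator()(const dummy_buft * lhs, const dummy_buft * rhs) const {
        return std::strcmp(dummy_buft_name(lhs), dummy_buft_name(rhs)) < 0;
    }
};

int main() {
    static dummy_buft cuda {"CUDA0"};
    static dummy_buft cpu  {"CPU"};
    static dummy_buft metal{"Metal"};

    std::map<const dummy_buft *, int, buft_name_comparator> ctx_map;
    ctx_map[&cuda]  = 1;
    ctx_map[&cpu]   = 2;
    ctx_map[&metal] = 3;

    // Iterates as CPU, CUDA0, Metal regardless of the handles' numeric addresses.
    for (const auto & [buft, value] : ctx_map) {
        std::printf("%s -> %d\n", dummy_buft_name(buft), value);
    }
}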
