 #include <cassert>
 #include <cmath>
 #include <cfloat>
+#include <cstdint>
 #include <cstring>
 #include <cmath>
 #include <functional>
-#include <map>
 #include <regex>
 #include <sstream>
 #include <stdexcept>
@@ -1589,6 +1589,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;

     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    std::set<uint16_t> created_backend_buffer_splits;
+
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
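The two declarations above drive the incremental path: ctx_map lazily creates one ggml_context per backend buffer type (via the ctx_for_buft helper), while created_backend_buffer_splits records which GGUF splits already had their backend buffers allocated. Below is a minimal, self-contained sketch of that get-or-create pattern using placeholder types (a std::string for the buffer type, a plain struct for the context); it illustrates the idea and is not the llama.cpp/ggml API.

// Minimal sketch of the get-or-create pattern used by ctx_for_buft,
// with placeholder types (NOT the llama.cpp/ggml API).
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Context {                 // stand-in for ggml_context
    std::string buft_name;
};

int main() {
    // one context per backend buffer type, created on first use
    std::map<std::string, std::unique_ptr<Context>> ctx_map;

    auto ctx_for_buft = [&](const std::string & buft) -> Context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            it = ctx_map.emplace(buft, std::make_unique<Context>(Context{buft})).first;
        }
        return it->second.get();
    };

    Context * a = ctx_for_buft("CPU");
    Context * b = ctx_for_buft("CPU");   // same object, nothing re-created
    std::cout << (a == b) << " " << ctx_map.size() << "\n";   // prints: 1 1
}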
@@ -1643,9 +1645,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     ggml_backend_buffer_type_t first_moved_to_buft = nullptr;

     auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
-        ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-
+        const std::string & tensor_name = tn.str();
+        ggml_tensor * t_meta = ml.get_tensor_meta(tensor_name.c_str());
+        std::optional<uint16_t> split_idx;
+        if (!t_meta && (flags & TENSOR_NOT_REQUIRED) &&
+            IncrementalSplitsTensorLoad::tensor_ignored(ml.incremental_splits_tensor_load, tensor_name.c_str())) {
+            return nullptr;
+        }
+        if (ml.incremental_splits_tensor_load.has_value()) {
+            split_idx = ml.incremental_splits_tensor_load->load_tensor_metadata(ml, tensor_name.c_str(), &t_meta);
+            LLAMA_LOG_CMAKE_DEBUG("split idx for tensor %s: %d\n", tensor_name.c_str(), *split_idx);
+        }
         if (!t_meta) {
+            LLAMA_LOG_ERROR("%s: missing tensor %s\n", __func__, tensor_name.c_str());
             if (flags & TENSOR_NOT_REQUIRED) {
                 return nullptr;
             }
@@ -1758,7 +1770,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
         }

-        ggml_context * ctx = ctx_for_buft(buft);
+        ggml_context * ctx =
+            split_idx.has_value() ?
+            ml.incremental_splits_tensor_load->get_model_ctx_for_split_buft(buft, *split_idx, pimpl.get()) :
+            ctx_for_buft(buft);

         // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
         if (flags & TENSOR_DUPLICATED) {
@@ -1767,7 +1782,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 return t;
             }
         }
-        return ml.create_tensor(ctx, tn, ne, flags);
+        struct ggml_tensor * tensor = ml.create_tensor(ctx, tn, ne, flags);
+
+        if (split_idx.has_value() && ml.incremental_splits_tensor_load->all_tensors_are_loaded(*split_idx) &&
+            created_backend_buffer_splits.find(*split_idx) == created_backend_buffer_splits.end()) {
+            // All tensors of this split have been created: build its backend buffers and upload right now.
+            if (!create_split_backend_buffers(*split_idx, ml.incremental_splits_tensor_load->ctx_split_map, ml,
+                                              use_mmap_buffer, use_mlock, n_gpu_layers)) {
+                throw std::runtime_error("Failed to create incremental backend buffers");
+            }
+            IncrementalSplitsTensorLoad::release_split(ml, *split_idx);
+            created_backend_buffer_splits.insert(*split_idx);
+        }
+
+        return tensor;
     };

     layers.resize(n_layer);
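With this change, create_tensor knows which split each tensor comes from; as soon as the last tensor of a split has been created, that split's backend buffers are allocated and the split is released, instead of waiting until the whole model has been walked. A minimal sketch of that bookkeeping, with hypothetical names and counts rather than the llama.cpp API, is:

// Minimal sketch: flush a split as soon as its last tensor arrives, and use a
// std::set to make sure each split is flushed only once (hypothetical example,
// NOT the llama.cpp API).
#include <cstdint>
#include <iostream>
#include <map>
#include <set>

int main() {
    // tensors still expected per split (the real code gets this from the GGUF split metadata)
    std::map<uint16_t, int> remaining = { {0, 2}, {1, 1} };
    std::set<uint16_t>      flushed;   // splits whose backend buffers were already created

    auto on_tensor_created = [&](uint16_t split_idx) {
        if (--remaining[split_idx] == 0 && flushed.find(split_idx) == flushed.end()) {
            // real code: create_split_backend_buffers(...) then release_split(...)
            std::cout << "flushing split " << split_idx << "\n";
            flushed.insert(split_idx);
        }
    };

    on_tensor_created(0);   // split 0 still has one tensor left
    on_tensor_created(1);   // prints: flushing split 1
    on_tensor_created(0);   // prints: flushing split 0
}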
@@ -4285,12 +4313,49 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

     ml.done_getting_tensors();

+    if (ml.incremental_splits_tensor_load.has_value()) {
+        // Backend buffers were already created incrementally, one split at a time.
+        print_backend_buffers_info(n_gpu_layers);
+        return true;
+    }
+
     ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
     pimpl->mappings.reserve(ml.mappings.size());

     return create_backend_buffers(ml.size_data, ctx_map, ml, use_mmap_buffer, use_mlock, n_gpu_layers);
 }

+bool llama_model::create_split_backend_buffers(
+    const uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> & ctx_split_map,
+    llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock, const int32_t n_gpu_layers) {
+    // Extract contexts for the given split index from ctx_split_map into a new map.
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    for (const auto & [buft_split_idx, ctx] : ctx_split_map) {
+        const auto & [buft, split_idx] = buft_split_idx;
+        if (split_idx == idx) {
+            ctx_map[buft] = ctx;
+        }
+    }
+
+    const std::size_t split_data_size = ml.incremental_splits_tensor_load->get_split_data_size(idx);
+    LLAMA_LOG_CMAKE_DEBUG("%s: creating backend buffers for split %d with size %zu\n", __func__, idx, split_data_size);
+    constexpr bool do_print_backend_buffers_info = false;
+    const bool creation_success = create_backend_buffers(split_data_size, ctx_map, ml, use_mmap_buffer, use_mlock,
+                                                         n_gpu_layers, do_print_backend_buffers_info);
+
+    if (creation_success) {
+        // Remove this split's entries so its buffers are not created again.
+        for (auto it = ctx_split_map.begin(); it != ctx_split_map.end();) {
+            if (it->first.second == idx) {
+                it = ctx_split_map.erase(it);
+            } else {
+                ++it;
+            }
+        }
+    }
+
+    return creation_success;
+}
+
 bool llama_model::create_backend_buffers(std::size_t size_data,
                                          const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
                                          llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock,
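create_split_backend_buffers reuses the whole-model create_backend_buffers path for a single split: it picks out of ctx_split_map (keyed by buffer type and split index) only the contexts that belong to the requested split, allocates buffers for that subset, and then erases the consumed entries so the split is never allocated twice. A self-contained sketch of that filter-then-erase pattern, with placeholder types instead of ggml handles, is:

// Minimal sketch of the filter-then-erase pattern over a map keyed by
// (buffer type, split index); placeholder types, NOT the llama.cpp/ggml API.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>

int main() {
    using Buft = std::string;   // stand-in for ggml_backend_buffer_type_t
    using Ctx  = int;           // stand-in for ggml_context *

    std::map<std::pair<Buft, uint16_t>, Ctx> ctx_split_map = {
        {{"CPU", 0}, 10}, {{"CUDA0", 0}, 11}, {{"CPU", 1}, 12},
    };
    const uint16_t idx = 0;

    // 1. collect the contexts that belong to split `idx`, keyed by buffer type only
    std::map<Buft, Ctx> ctx_map;
    for (const auto & [key, ctx] : ctx_split_map) {
        if (key.second == idx) {
            ctx_map[key.first] = ctx;
        }
    }

    // 2. ... here the real code would create the backend buffers for ctx_map ...

    // 3. drop the consumed entries; map::erase returns the next valid iterator
    for (auto it = ctx_split_map.begin(); it != ctx_split_map.end();) {
        if (it->first.second == idx) {
            it = ctx_split_map.erase(it);
        } else {
            ++it;
        }
    }

    std::cout << ctx_map.size() << " buffer types in split 0, "
              << ctx_split_map.size() << " entry left\n";   // prints: 2 buffer types in split 0, 1 entry left
}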