Commit 296ba81

[refactor][mbuffer] File load from variant
- Add code to load a GGUF file from a variant backing store (memory or disk).
- Add structs that simplify loading a file and keep track of the pointers, which now live in the same struct.
1 parent 427f1b8

6 files changed (+116, -23 lines)
src/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -26,6 +26,8 @@ add_library(llama
     llama-memory-hybrid.cpp
     llama-memory-recurrent.cpp
     llama-mmap.cpp
+    llama-model-load-input.cpp
+    llama-model-load.cpp
     llama-model-loader.cpp
     llama-model-saver.cpp
     llama-model.cpp
src/llama-model-load-input.cpp

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+#include "llama-model-load-input.h"
+
+namespace load_input_variant {
+
+const char * identifier(load_input_t & load_input) {
+    if (std::holds_alternative<fname_load_input>(load_input)) {
+        const auto & file_input = std::get<fname_load_input>(load_input);
+        return file_input.fname.c_str();
+    }
+    static const char * buffer_id_str = "buffer";
+    return buffer_id_str;
+}
+
+fname_load_input split_name_from_variant(load_input_t & load_input) {
+    auto file_input = std::get<fname_load_input>(load_input);
+    return file_input;
+}
+
+bool variant_supports_split_load(load_input_t & load_input) {
+    return std::holds_alternative<fname_load_input>(load_input);
+}
+
+} // namespace load_input_variant
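
The helpers above are thin wrappers over std::variant queries: identifier() returns the file name for disk-backed inputs and a fixed "buffer" tag otherwise. A minimal self-contained sketch of the same dispatch pattern, using hypothetical value-holding structs in place of the commit's reference-holding ones:

#include <cstdio>
#include <string>
#include <variant>

// Hypothetical stand-ins for fname_load_input / buffer_load_input.
struct from_file   { std::string fname; };
struct from_buffer { };

using input_t = std::variant<from_file, from_buffer>;

// Mirrors identifier(): file name for disk inputs, a fixed tag otherwise.
static const char * identifier(const input_t & in) {
    if (std::holds_alternative<from_file>(in)) {
        return std::get<from_file>(in).fname.c_str();
    }
    return "buffer";
}

int main() {
    input_t disk = from_file{"model.gguf"};
    input_t mem  = from_buffer{};
    std::printf("%s\n%s\n", identifier(disk), identifier(mem)); // model.gguf, then buffer
}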

src/llama-model-load-input.h

Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+#include <variant>
+#include <vector>
+
+namespace load_input_variant {
+
+struct fname_load_input {
+    const std::string & fname;
+    std::vector<std::string> & splits; // optional, only needed if the splits do not follow the naming scheme
+};
+
+struct buffer_load_input {
+    std::unique_ptr<std::basic_streambuf<uint8_t>> & streambuf;
+};
+
+} // namespace load_input_variant
+
+using load_input_t = std::variant<load_input_variant::fname_load_input, load_input_variant::buffer_load_input>;
+
+namespace load_input_variant {
+const char * identifier(load_input_t & load_input);
+
+fname_load_input split_name_from_variant(load_input_t & load_input);
+
+bool variant_supports_split_load(load_input_t & load_input);
+} // namespace load_input_variant
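
Both structs hold references rather than owned values, so the caller's string, split list, and stream buffer must outlive the load_input_t built from them. A hedged sketch of constructing each alternative against the header above (the locals are assumptions for illustration, not part of the commit):

#include <memory>
#include <streambuf>
#include <string>
#include <vector>

#include "llama-model-load-input.h"

void make_inputs() {
    std::string fname = "model.gguf";  // hypothetical path, owned by the caller
    std::vector<std::string> splits;   // empty: splits follow the default naming scheme
    load_input_t disk = load_input_variant::fname_load_input{fname, splits};

    // Filled elsewhere with GGUF bytes; the struct keeps a reference to the
    // unique_ptr so that a later consumer can take ownership of it.
    std::unique_ptr<std::basic_streambuf<uint8_t>> buf;
    load_input_t mem = load_input_variant::buffer_load_input{buf};

    (void)disk;
    (void)mem;
}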

src/llama-model-load.cpp

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+#include "llama-model-load.h"
+
+#include <memory>
+#include <stdexcept>
+#include <variant>
+
+#include "llama-model-loader.h"
+
+gguf_file_load::gguf_file_load(struct ggml_context ** ctx, load_input_t load_input) :
+    params({
+        /*.no_alloc = */ true,
+        /*.ctx = */ ctx,
+    }) {
+    using namespace load_input_variant;
+    if (std::holds_alternative<fname_load_input>(load_input)) {
+        const auto & file_input = std::get<fname_load_input>(load_input);
+        meta.reset(gguf_init_from_file(file_input.fname.c_str(), params));
+        if (!meta) {
+            throw std::runtime_error(format("%s: failed to load model from %s", __func__, file_input.fname.c_str()));
+        }
+        file = std::make_unique<llama_file_disk>(file_input.fname.c_str(), "ro");
+    } else {
+        const auto & buffer_input = std::get<buffer_load_input>(load_input);
+        meta.reset(gguf_init_from_buffer(*buffer_input.streambuf, params));
+        if (!meta) {
+            throw std::runtime_error(format("%s: failed to load model from buffer", __func__));
+        }
+        file = std::make_unique<llama_file_buffer_ro>(std::move(buffer_input.streambuf));
+    }
+}
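
The constructor does all the work up front: for either alternative it parses the GGUF metadata into meta and wraps the underlying bytes in a llama_file (disk-backed or buffer-backed). A hedged sketch of the in-memory path, assuming a hypothetical make_gguf_streambuf() helper that yields a streambuf over GGUF bytes:

#include <memory>
#include <streambuf>

#include "llama-model-load.h"

// Hypothetical helper (not in this commit): yields a streambuf over GGUF bytes.
std::unique_ptr<std::basic_streambuf<uint8_t>> make_gguf_streambuf();

void load_from_memory() {
    std::unique_ptr<std::basic_streambuf<uint8_t>> buf = make_gguf_streambuf();

    struct ggml_context * ctx = NULL;
    gguf_file_load mem_gguf(&ctx, load_input_variant::buffer_load_input{buf});
    // On success: mem_gguf.meta holds the parsed metadata and mem_gguf.file is
    // a llama_file_buffer_ro that took ownership of buf (buf is now null).
    // On failure the constructor throws, so no partially-initialized state escapes.
}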

src/llama-model-load.h

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+#pragma once
+
+#include <cstdint>
+#include <set>
+
+#include "ggml-cpp.h"
+#include "llama-mmap.h"
+#include "llama-model-load-input.h"
+
+struct llama_model_loader;
+
+/// @brief Immediately loads and stores relevant data in the struct fields.
+struct gguf_file_load {
+    struct gguf_init_params params;
+    gguf_context_ptr meta;
+    std::unique_ptr<llama_file> file = nullptr;
+
+    gguf_file_load(struct ggml_context ** ctx, load_input_t load_input);
+};

src/llama-model-loader.cpp

Lines changed: 11 additions & 23 deletions

@@ -1,6 +1,8 @@
 #include "llama-model-loader.h"

 #include "ggml.h"
+#include "llama-model-load-input.h"
+#include "llama-model-load.h"

 #include <array>
 #include <cinttypes>
@@ -485,22 +487,14 @@ llama_model_loader::llama_model_loader(

     tensor_buft_overrides = param_tensor_buft_overrides_p;

-    // Load the main GGUF
     struct ggml_context * ctx = NULL;
-    struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx = */ &ctx,
-    };
-
-    meta.reset(gguf_init_from_file(fname.c_str(), params));
-    if (!meta) {
-        throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
-    }
+    gguf_file_load main_gguf(&ctx, load_input_variant::fname_load_input{fname, splits});
+    meta = std::move(main_gguf.meta);

     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
     llm_kv = LLM_KV(llm_arch_from_string(arch_name));

-    files.emplace_back(new llama_file_disk(fname.c_str(), "rb"));
+    files.emplace_back(std::move(main_gguf.file));
     contexts.emplace_back(ctx);

     // Save tensors data offset of the main file.
@@ -547,28 +541,22 @@ llama_model_loader::llama_model_loader(
     for (idx = 1; idx < n_split; idx++) {
         const char * fname_split = splits[idx].c_str();

-        struct gguf_init_params split_params = {
-            /*.no_alloc = */ true,
-            /*.ctx = */ &ctx,
-        };
-        gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
-        if (!ctx_gguf) {
-            throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
-        }
+        gguf_file_load split_gguf(&ctx, load_input_variant::fname_load_input{fname_split, splits});
+        gguf_context_ptr & split_meta = split_gguf.meta;

         // check idx
         {
-            const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
+            const int kid = gguf_find_key(split_meta.get(), kv_split_no.c_str());
             if (kid < 0) {
                 throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
             }
-            int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
+            int idx_gguf = gguf_get_val_u16(split_meta.get(), kid);
             if (idx_gguf != idx) {
                 throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
             }
         }

-        files.emplace_back(new llama_file_disk(fname_split, "rb"));
+        files.emplace_back(std::move(split_gguf.file));
         contexts.emplace_back(ctx);

         // Save tensors data offset info of the shard.
@@ -580,7 +568,7 @@ llama_model_loader::llama_model_loader(
            }
            n_elements += ggml_nelements(cur);
            n_bytes += ggml_nbytes(cur);
-            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, split_meta.get(), cur));
        }
    }