
QVAC-3697: Load GGUF File From Buffer #1

Open
wants to merge 24 commits into base: temp-load-from-buffer
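
For context, a minimal sketch of calling the single-buffer entry point this PR adds, llama_model_load_from_buffer, mirroring the helper code in common_test/load_into_memory.h further down; the model path is a placeholder, and llama_model_default_params / llama_model_free are assumed from the existing llama.h API.

#include <cstdint>
#include <fstream>
#include <utility>
#include <vector>

#include "llama.h"

int main() {
    // Read the whole GGUF file into memory (placeholder path, no error handling).
    std::ifstream in("model.gguf", std::ios::binary | std::ios::ate);
    std::vector<uint8_t> bytes(static_cast<size_t>(in.tellg()));
    in.seekg(0, std::ios::beg);
    in.read(reinterpret_cast<char *>(bytes.data()), bytes.size());

    // Hand ownership of the bytes to the buffer-loading entry point added by this PR.
    llama_model_params params = llama_model_default_params();
    llama_model * model = llama_model_load_from_buffer(std::move(bytes), params);
    if (model == nullptr) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}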
Changes from all commits (24 commits)
f8942e7
[common] Pure interface for files
jesusmb1995 Jul 16, 2025
ca481ed
[common] Compile time debug logs
jesusmb1995 Jul 29, 2025
cd6f698
[aux] Test full load from disk
jesusmb1995 Jul 16, 2025
67e1868
[aux] GGUF split summary
jesusmb1995 Jul 30, 2025
eb2e355
[aux] gguf tensor must be followed
jesusmb1995 Aug 19, 2025
3ca5a71
[aux] verbose gguf split
jesusmb1995 Aug 19, 2025
1440377
[common] Stream buffer for uint8 data
jesusmb1995 Jul 29, 2025
b6f825d
[mbuffer] Llama file buffer implementation
jesusmb1995 Jul 16, 2025
86da48c
[refactor] C splits into C++
jesusmb1995 Jul 30, 2025
cba0254
[common] GGUF reader from memory
jesusmb1995 Jul 17, 2025
610d73e
[refactor][mbuffer] File load from variant
jesusmb1995 Jul 18, 2025
762c968
[refactor] Process file method
jesusmb1995 Jul 30, 2025
be62aaa
[mbuffer] Expose single-buffer loading to Llama interface
jesusmb1995 Jul 16, 2025
3a0855d
[fbuffers] Future file buffer implementation
jesusmb1995 Jul 30, 2025
85c4d3b
[fbuffers] Incremental loading of future files
jesusmb1995 Jul 30, 2025
bd60c89
[refactor] Create backend buffers
jesusmb1995 Jul 30, 2025
0561525
[refactor] Load all data
jesusmb1995 Jul 30, 2025
77cef5b
[fbuffers] Incremental model load
jesusmb1995 Jul 30, 2025
ff882fe
[fbuffers] Expose async interface
jesusmb1995 Jul 30, 2025
dab6554
[refactor] Increase common loading granularity
jesusmb1995 Jul 18, 2025
f5902e8
[aux] Common test
jesusmb1995 Jul 30, 2025
425e192
[aux] Memory example (embedding)
jesusmb1995 Jul 30, 2025
f0e7125
[aux] Memory example (simple)
jesusmb1995 Jul 30, 2025
cd1b485
[aux] Auto. memory loading tests
jesusmb1995 Jul 30, 2025
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -193,6 +193,10 @@ if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()

if(LLAMA_BUILD_EXAMPLES OR LLAMA_BUILD_TESTS)
add_subdirectory(common_test)
endif()

if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
23 changes: 14 additions & 9 deletions common/common.cpp
@@ -899,15 +899,7 @@ std::string fs_get_cache_file(const std::string & filename) {
// Model utils
//

struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);

llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return iparams;
}
struct common_init_result common_init_from_model_and_params(llama_model* model, common_init_result iparams, common_params & params) {

const llama_vocab * vocab = llama_model_get_vocab(model);

@@ -1068,6 +1060,19 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}

struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);

llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return iparams;
}

return common_init_from_model_and_params(model, std::move(iparams), params);
}

std::string get_model_endpoint() {
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
2 changes: 2 additions & 0 deletions common/common.h
@@ -551,6 +551,8 @@ struct common_init_result {
};

struct common_init_result common_init_from_params(common_params & params);
struct common_init_result common_init_from_model_and_params(llama_model * model, common_init_result iparams,
common_params & params);

struct llama_model_params common_model_params_to_llama ( common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);
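
The split above lets a caller that already holds a llama_model (for instance, one created from a memory buffer) reuse the shared context setup; a minimal sketch of that call shape, where the helper name is hypothetical (the embedding example further down does the same thing inline):

#include <utility>

#include "common.h"
#include "llama.h"

// Hypothetical helper: finish initialization for a model obtained by other means.
static common_init_result init_from_preloaded_model(llama_model * model, common_params & params) {
    common_init_result iparams; // starts empty, as in common_init_from_params
    if (model == nullptr) {
        return iparams;         // mirror the failure path of common_init_from_params
    }
    return common_init_from_model_and_params(model, std::move(iparams), params);
}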
15 changes: 15 additions & 0 deletions common_test/CMakeLists.txt
@@ -0,0 +1,15 @@
# common_test library for load_into_memory.h and uint8-buff-stream.h

set(TARGET llama-common-test)

add_library(${TARGET} INTERFACE)

target_include_directories(${TARGET} INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}
)

target_compile_definitions(${TARGET} INTERFACE LLAMA_COMMON_TEST_HEADERS)

target_compile_features(${TARGET} INTERFACE cxx_std_17)

target_link_libraries(${TARGET} INTERFACE common)
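
Consumers link the llama-common-test interface target and guard their include on the LLAMA_COMMON_TEST_HEADERS definition it propagates; a short sketch of that pattern, which the embedding and simple examples below follow:

// Sketch of a consuming translation unit; LLAMA_COMMON_TEST_HEADERS is defined
// only when the llama-common-test target is linked.
#ifdef LLAMA_COMMON_TEST_HEADERS
#    include "load_into_memory.h"
#endif

int main() {
#ifdef LLAMA_COMMON_TEST_HEADERS
    if (memory_configuration_env_is_set()) {
        // The memory-based loading helpers are available here.
    }
#endif
    return 0;
}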
220 changes: 220 additions & 0 deletions common_test/load_into_memory.h
@@ -0,0 +1,220 @@
#pragma once

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <memory>
#include <sstream>
#include <streambuf>
#include <string>
#include <thread>
#include <vector>

// header-only utilities to showcase how to directly load a model from memory
#include "uint8-buff-stream-wrapper.h"

namespace {
bool is_split_file(const char * const model_path) {
if (!model_path) {
fprintf(stderr, "No model file provided\n");
exit(EXIT_FAILURE);
}

std::string path(model_path);
return path.find("-of-") != std::string::npos;
}

std::vector<uint8_t> load_file_into_buffer(const char * const model_path) {
std::ifstream file_stream(model_path, std::ios::binary | std::ios::ate);
if (!file_stream) {
fprintf(stderr, "Failed to open file %s for reading into streambuf\n", model_path);
exit(EXIT_FAILURE);
}

const size_t file_size = file_stream.tellg();
file_stream.seekg(0, std::ios::beg);

static_assert(sizeof(std::uint8_t) == sizeof(char), "uint8_t must be same size as char");
std::vector<std::uint8_t> buffer(file_size);
if (!file_stream.read((char *) buffer.data(), file_size)) {
fprintf(stderr, "Failed to read entire file into buffer\n");
exit(EXIT_FAILURE);
}

return buffer;
}

std::unique_ptr<std::basic_streambuf<uint8_t>> load_file_into_streambuf(const char * const model_path) {
return std::make_unique<Uint8BufferStreamBuf>(load_file_into_buffer(model_path));
}

struct file_entry {
std::string path;
std::unique_ptr<std::basic_streambuf<uint8_t>> streambuf;
};

std::vector<file_entry> load_files_into_streambuf(const char * const model_path) {
std::vector<file_entry> files;

// Extract pattern from first file path
std::string path(model_path);

// Split by '-'
std::vector<std::string> parts;
std::stringstream ss(path);
std::string item;
while (std::getline(ss, item, '-')) {
parts.push_back(item);
}

// Split the last part by '.'
std::string last_part = parts.back();
parts.pop_back();
size_t dot_pos = last_part.find('.');
if (dot_pos != std::string::npos) {
parts.push_back(last_part.substr(0, dot_pos));
parts.push_back(last_part.substr(dot_pos + 1)); // extension
} else {
parts.push_back(last_part);
}

// Check if we have enough parts
if (parts.size() < 4) {
fprintf(stderr, "Model path does not contain expected pattern\n");
exit(EXIT_FAILURE);
}

// Get total files from [-2] position (before the extension)
int total_files = std::stoi(parts[parts.size() - 2]);

// Get base path by joining all parts except -start-of-end.gguf
std::string base_path;
for (size_t i = 0; i < parts.size() - 4; i++) {
if (i > 0) {
base_path += "-";
}
base_path += parts[i];
}

for (int i = 1; i <= total_files; i++) {
char numbered_path[1024];
snprintf(numbered_path, sizeof(numbered_path), "%s-%05d-of-%05d.gguf", base_path.c_str(), i, total_files);

files.push_back({ numbered_path, load_file_into_streambuf(numbered_path) });
}

return files;
}

file_entry load_tensor_list_file(const char * const model_path) {
std::string path(model_path);

// Split by '-'
std::vector<std::string> parts;
std::stringstream ss(path);
std::string item;
while (std::getline(ss, item, '-')) {
parts.push_back(item);
}

// Split the last part by '.'
std::string last_part = parts.back();
parts.pop_back();
size_t dot_pos = last_part.find('.');
if (dot_pos != std::string::npos) {
parts.push_back(last_part.substr(0, dot_pos));
parts.push_back(last_part.substr(dot_pos + 1)); // extension
} else {
parts.push_back(last_part);
}

// Check if we have enough parts
if (parts.size() < 4) {
fprintf(stderr, "Model path does not contain expected pattern\n");
exit(EXIT_FAILURE);
}

// Get base path by joining all parts except -start-of-end.gguf
std::string base_path;
for (size_t i = 0; i < parts.size() - 4; i++) {
if (i > 0) {
base_path += "-";
}
base_path += parts[i];
}

// Construct tensor list file path
std::string tensor_list_path = base_path + ".tensors.txt";

printf("Loading tensor list file: %s\n", tensor_list_path.c_str());
return { tensor_list_path, load_file_into_streambuf(tensor_list_path.c_str()) };
}

llama_model * load_model_from_memory_configuration(const char * model_path, llama_model_params & model_params) {
llama_model * model;
std::chrono::steady_clock::time_point load_start_time;
if (getenv("LLAMA_EXAMPLE_MEMORY_BUFFER")) {
std::vector<uint8_t> buffer = load_file_into_buffer(model_path);
fprintf(stdout, "%s: loading model from memory buffer\n", __func__);
load_start_time = std::chrono::steady_clock::now();
model = llama_model_load_from_buffer(std::move(buffer), model_params);
} else if (getenv("LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT")) {
file_entry tensor_list_file = load_tensor_list_file(model_path);
std::vector<file_entry> files = load_files_into_streambuf(model_path);
fprintf(stdout, "%s: loading model from %zu file streambufs\n", __func__, files.size());

std::vector<const char *> file_paths;
for (const auto & file : files) {
printf("Found file %s with streambuf\n", file.path.c_str());
file_paths.push_back(file.path.c_str());
}

load_start_time = std::chrono::steady_clock::now();
const char * async_load_context = "test-model-load";
std::thread fulfill_thread([&files, &tensor_list_file, &async_load_context]() {
const bool success = llama_model_load_fulfill_split_future(
tensor_list_file.path.c_str(), async_load_context, std::move(tensor_list_file.streambuf));
printf("Fulfilling tensor list file %s: %s\n", tensor_list_file.path.c_str(),
success ? "success" : "failure");
if (!success) {
exit(EXIT_FAILURE);
}

for (auto & file : files) {
const bool success = llama_model_load_fulfill_split_future(file.path.c_str(), async_load_context,
std::move(file.streambuf));
printf("Fulfilling file %s with streambuf: %s\n", file.path.c_str(), success ? "success" : "failure");
if (!success) {
exit(EXIT_FAILURE);
}
}
});
fprintf(stderr, "Loading model from splits\n");
model = llama_model_load_from_split_futures(file_paths.data(), file_paths.size(), async_load_context,
tensor_list_file.path.c_str(), model_params);
fulfill_thread.join();
} else if (getenv("LLAMA_EXAMPLE_FROM_FILE")) {
load_start_time = std::chrono::steady_clock::now();
model = llama_model_load_from_file(model_path, model_params);
} else {
return nullptr;
}

if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
exit(1);
}
std::chrono::steady_clock::time_point load_end_time = std::chrono::steady_clock::now();
std::chrono::duration<double> load_duration = load_end_time - load_start_time;
fprintf(stdout, "%s: loading model took %f seconds\n", __func__, load_duration.count());
return model;
}

bool memory_configuration_env_is_set() {
return getenv("LLAMA_EXAMPLE_MEMORY_BUFFER") || getenv("LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT") ||
getenv("LLAMA_EXAMPLE_FROM_FILE");
}
} // namespace
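
A short usage sketch of the helpers above: the loading path is chosen by the LLAMA_EXAMPLE_MEMORY_BUFFER, LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT and LLAMA_EXAMPLE_FROM_FILE environment variables, falling back to the regular file loader when none is set; the function name and error handling here are placeholders.

#include "llama.h"
#include "load_into_memory.h"

// Hypothetical driver: use the memory-based path when one of the
// LLAMA_EXAMPLE_* variables is set, otherwise load from disk as usual.
static llama_model * load_example_model(const char * model_path) {
    llama_model_params mparams = llama_model_default_params();
    if (memory_configuration_env_is_set()) {
        return load_model_from_memory_configuration(model_path, mparams);
    }
    return llama_model_load_from_file(model_path, mparams);
}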
5 changes: 5 additions & 0 deletions common_test/uint8-buff-stream-wrapper.h
@@ -0,0 +1,5 @@
#pragma once

// Wrapper to include the specific header from src
#include "uint8-buff-stream.h"

2 changes: 1 addition & 1 deletion examples/embedding/CMakeLists.txt
@@ -1,5 +1,5 @@
set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama llama-common-test ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
33 changes: 28 additions & 5 deletions examples/embedding/embedding.cpp
@@ -1,15 +1,25 @@
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <thread>
#include <vector>

#include "arg.h"
#include "common.h"
#include "llama-cpp.h"
#include "log.h"
#include "llama.h"

#include <ctime>
#include <algorithm>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#ifdef LLAMA_COMMON_TEST_HEADERS
#include "load_into_memory.h"
#endif

static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
std::vector<std::string> lines;
size_t start = 0;
@@ -94,7 +104,20 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa);

// load the model
common_init_result llama_init = common_init_from_params(params);
common_init_result llama_init;

#ifdef LLAMA_COMMON_TEST_HEADERS
if (memory_configuration_env_is_set()) {
llama_model_params mparams = common_model_params_to_llama(params);
common_init_result iparams;
llama_model * model = load_model_from_memory_configuration(params.model.path.c_str(), mparams);
llama_init = common_init_from_model_and_params(model, std::move(iparams), params);
} else {
llama_init = common_init_from_params(params);
}
#else
llama_init = common_init_from_params(params);
#endif

llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();
2 changes: 1 addition & 1 deletion examples/simple/CMakeLists.txt
@@ -1,5 +1,5 @@
set(TARGET llama-simple)
add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama llama-common-test ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)