
Commit 71c63a4

[aux] Auto. streams memory loading test
Add some automatic tests that load from memory (multiple async splits)

diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 3471d4b..377fc60de 100644
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@@ -200,7 +200,7 @@ extern "C" {
 }
 #endif

-#if defined(__cplusplus) && __cplusplus >= 201703L
+#if defined(__cplusplus)
 #include <ios>
 GGML_API struct gguf_context * gguf_init_from_buffer(std::basic_streambuf<char>& streambuf, struct gguf_init_params params);
 #endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f1e5c22..87127a1 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -202,6 +202,7 @@ llama_build_and_test(test-backend-ops.cpp)
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-model-load-disk.cpp LABEL "model")
 llama_build_and_test(test-model-load-memory.cpp LABEL "model")
+llama_build_and_test(test-model-load-memory-split.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp LABEL "model")

 if (NOT GGML_BACKEND_DL)
diff --git a/tests/test-model-load-memory-split.cpp b/tests/test-model-load-memory-split.cpp
new file mode 100644
index 000000000..2d3dd21
--- /dev/null
+++ b/tests/test-model-load-memory-split.cpp
@@ -0,0 +1,76 @@
+#include "get-model.h"
+#include "llama-cpp.h"
+#include "load-into-memory.h"
+
+#include <cstdlib>
+#include <thread>
+#include <vector>
+
+using namespace common_load_into_memory;
+
+int main(int argc, char * argv[]) {
+    auto * model_path = get_model_or_exit(argc, argv);
+
+    if (!is_split_file(model_path)) {
+        printf("Skipping not-split model %s\n", model_path);
+        return EXIT_SUCCESS;
+    }
+
+    // Manually load into a memory buffer first
+    llama_file_entry tensor_list_file = load_tensor_list_file(model_path);
+    std::vector<llama_file_entry> files = load_files_into_streambuf(model_path);
+
+    llama_backend_init();
+    auto params = llama_model_params{};
+    params.use_mmap = false;
+    params.progress_callback = [](float progress, void * ctx) {
+        (void) ctx;
+        fprintf(stderr, "%.2f%% ", progress * 100.0f);
+        // true means: Don't cancel the load
+        return true;
+    };
+
+    printf("Loading model from %zu files\n", files.size());
+
+    std::vector<const char *> file_paths;
+    for (size_t i = 0; i < files.size(); i++) {
+        printf("Found file %s \n", files[i].path.c_str());
+        file_paths.push_back(files[i].path.c_str());
+    }
+
+    const char * async_load_context = "test-model-load";
+    std::thread fulfill_thread([&files, &tensor_list_file, &async_load_context]() {
+        const bool success = llama_model_load_fulfill_split_future(tensor_list_file.path.c_str(), async_load_context,
+                                                                   std::move(tensor_list_file.streambuf));
+        printf("Fulfilling tensor list file %s: %s\n", tensor_list_file.path.c_str(), success ? "success" : "failure");
+        if (!success) {
+            exit(EXIT_FAILURE);
+        }
+        for (size_t i = 0; i < files.size(); i++) {
+            const bool success = llama_model_load_fulfill_split_future(files[i].path.c_str(), async_load_context,
+                                                                       std::move(files[i].streambuf));
+            printf("Fulfilling file %s: %s\n", files[i].path.c_str(), success ? "success" : "failure");
+            if (!success) {
+                exit(EXIT_FAILURE);
+            }
+        }
+    });
+    fprintf(stderr, "Loading model from splits\n");
+    auto * model = llama_model_load_from_split_futures(file_paths.data(), file_paths.size(), async_load_context,
+                                                       tensor_list_file.path.c_str(), params);
+    fulfill_thread.join();
+
+    fprintf(stderr, "\n");
+
+    if (model == nullptr) {
+        fprintf(stderr, "Failed to load model\n");
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    fprintf(stderr, "Model loaded successfully\n");
+    llama_model_free(model);
+    llama_backend_free();
+
+    return EXIT_SUCCESS;
+}
1 parent a726b69 commit 71c63a4
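
The gguf.h hunk in the commit message above relaxes the guard around gguf_init_from_buffer from "C++17 or later" to "any C++ translation unit"; the function itself, which parses GGUF data out of a std::basic_streambuf<char> instead of a file path, is unchanged. As a rough sketch of how a caller might use it once the guard is relaxed (the init_from_memory helper, the blob argument, and the choice of std::stringbuf are illustrative assumptions, not part of this commit):

// Sketch only: hand an in-memory GGUF blob to gguf_init_from_buffer.
// Assumes `blob` already holds a complete GGUF file image.
#include <sstream>
#include <string>

#include "gguf.h"

struct gguf_context * init_from_memory(const std::string & blob) {
    // Wrap the in-memory bytes in a standard, read-only streambuf.
    std::stringbuf buf(blob, std::ios::in);

    struct gguf_init_params params = {
        /*no_alloc =*/ true,   // only read metadata, do not allocate tensor data
        /*ctx      =*/ nullptr,
    };

    // Declared in gguf.h; this commit only relaxes its C++17 guard.
    return gguf_init_from_buffer(buf, params);
}
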

File tree

2 files changed (+84, -0 lines)

tests/CMakeLists.txt
Lines changed: 1 addition & 0 deletions

@@ -206,6 +206,7 @@ llama_build_and_test(test-backend-ops.cpp)
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-model-load-disk.cpp LABEL "model")
 llama_build_and_test(test-model-load-memory.cpp LABEL "model")
+llama_build_and_test(test-model-load-memory-split.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp LABEL "model")

 if (NOT GGML_BACKEND_DL)
tests/test-model-load-memory-split.cpp (new file)
Lines changed: 83 additions & 0 deletions

#include "common.h"
#include "get-model.h"
#include "llama-cpp.h"
#include "load-into-memory.h"

#include <cstdlib>
#include <thread>
#include <vector>

using namespace common_load_into_memory;

int main(int argc, char * argv[]) {
    auto * model_path = get_model_or_exit(argc, argv);

    if (!is_split_file(model_path)) {
        printf("Skipping not-split model %s\n", model_path);
        return EXIT_SUCCESS;
    }

    // Manually load into a memory buffer first
    llama_file_entry tensor_list_file = load_tensor_list_file(model_path);
    std::vector<llama_file_entry> files = load_files_into_streambuf(model_path);

    llama_backend_init();
    auto params = llama_model_params{};
    params.use_mmap = false;

    // Use CPU-only mode if no GPU devices are available
    if (!common_has_gpu_devices()) {
        params.main_gpu = -1;
    }

    params.progress_callback = [](float progress, void * ctx) {
        (void) ctx;
        fprintf(stderr, "%.2f%% ", progress * 100.0f);
        // true means: Don't cancel the load
        return true;
    };

    printf("Loading model from %zu files\n", files.size());

    std::vector<const char *> file_paths;
    for (size_t i = 0; i < files.size(); i++) {
        printf("Found file %s \n", files[i].path.c_str());
        file_paths.push_back(files[i].path.c_str());
    }

    const char * async_load_context = "test-model-load";
    std::thread fulfill_thread([&files, &tensor_list_file, &async_load_context]() {
        const bool success = llama_model_load_fulfill_split_future(tensor_list_file.path.c_str(), async_load_context,
                                                                   std::move(tensor_list_file.streambuf));
        printf("Fulfilling tensor list file %s: %s\n", tensor_list_file.path.c_str(), success ? "success" : "failure");
        if (!success) {
            exit(EXIT_FAILURE);
        }
        for (size_t i = 0; i < files.size(); i++) {
            const bool success = llama_model_load_fulfill_split_future(files[i].path.c_str(), async_load_context,
                                                                       std::move(files[i].streambuf));
            printf("Fulfilling file %s: %s\n", files[i].path.c_str(), success ? "success" : "failure");
            if (!success) {
                exit(EXIT_FAILURE);
            }
        }
    });
    fprintf(stderr, "Loading model from splits\n");
    auto * model = llama_model_load_from_split_futures(file_paths.data(), file_paths.size(), async_load_context,
                                                       tensor_list_file.path.c_str(), params);
    fulfill_thread.join();

    fprintf(stderr, "\n");

    if (model == nullptr) {
        fprintf(stderr, "Failed to load model\n");
        llama_backend_free();
        return EXIT_FAILURE;
    }

    fprintf(stderr, "Model loaded successfully\n");
    llama_model_free(model);
    llama_backend_free();

    return EXIT_SUCCESS;
}
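
The test relies on helpers from load-into-memory.h (llama_file_entry, load_files_into_streambuf, load_tensor_list_file) to produce the streambufs that the fulfill calls hand over. For readers without that header, here is a minimal sketch of one way such an object could be built over bytes that already sit in memory, using only standard C++; the memory_streambuf class name and the zero-copy get-area approach are illustrative assumptions, not code from this commit:

#include <streambuf>
#include <vector>

// Illustrative only: a read-only streambuf exposing an owned byte buffer.
// A helper like load_files_into_streambuf() could return something along
// these lines for each split file.
class memory_streambuf : public std::streambuf {
  public:
    explicit memory_streambuf(std::vector<char> data) : data_(std::move(data)) {
        // Point the get area at the owned buffer: begin, current, end.
        setg(data_.data(), data_.data(), data_.data() + data_.size());
    }

  protected:
    // Support seeking, which a GGUF/model parser typically needs.
    pos_type seekoff(off_type off, std::ios_base::seekdir dir,
                     std::ios_base::openmode which) override {
        if (!(which & std::ios_base::in)) {
            return pos_type(off_type(-1));
        }
        char * base   = data_.data();
        char * target = gptr();
        if (dir == std::ios_base::beg) { target = base + off; }
        if (dir == std::ios_base::cur) { target = gptr() + off; }
        if (dir == std::ios_base::end) { target = base + data_.size() + off; }
        if (target < base || target > base + data_.size()) {
            return pos_type(off_type(-1));
        }
        setg(base, target, base + data_.size());
        return pos_type(target - base);
    }

    pos_type seekpos(pos_type pos, std::ios_base::openmode which) override {
        return seekoff(off_type(pos), std::ios_base::beg, which);
    }

  private:
    std::vector<char> data_;
};

In the test, each streambuf is moved into llama_model_load_fulfill_split_future from the fulfill thread while the main thread sits in llama_model_load_from_split_futures, so ownership of the in-memory bytes passes to the loader without the model data ever being written to disk.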
