
Commit 71c63a4

[aux] Auto. streams memory loading test
Add some automatic tests that load from memory (multiple async splits)

diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 3471d4b..377fc60de 100644
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@@ -200,7 +200,7 @@ extern "C" {
 }
 #endif

-#if defined(__cplusplus) && __cplusplus >= 201703L
+#if defined(__cplusplus)
 #include <ios>
 GGML_API struct gguf_context * gguf_init_from_buffer(std::basic_streambuf<char>& streambuf, struct gguf_init_params params);
 #endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f1e5c22..87127a1 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -202,6 +202,7 @@ llama_build_and_test(test-backend-ops.cpp)
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-model-load-disk.cpp LABEL "model")
 llama_build_and_test(test-model-load-memory.cpp LABEL "model")
+llama_build_and_test(test-model-load-memory-split.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp LABEL "model")

 if (NOT GGML_BACKEND_DL)
diff --git a/tests/test-model-load-memory-split.cpp b/tests/test-model-load-memory-split.cpp
new file mode 100644
index 000000000..2d3dd21
--- /dev/null
+++ b/tests/test-model-load-memory-split.cpp
@@ -0,0 +1,76 @@
+#include "get-model.h"
+#include "llama-cpp.h"
+#include "load-into-memory.h"
+
+#include <cstdlib>
+#include <thread>
+#include <vector>
+
+using namespace common_load_into_memory;
+
+int main(int argc, char * argv[]) {
+    auto * model_path = get_model_or_exit(argc, argv);
+
+    if (!is_split_file(model_path)) {
+        printf("Skipping not-split model %s\n", model_path);
+        return EXIT_SUCCESS;
+    }
+
+    // Manually load into a memory buffer first
+    llama_file_entry tensor_list_file = load_tensor_list_file(model_path);
+    std::vector<llama_file_entry> files = load_files_into_streambuf(model_path);
+
+    llama_backend_init();
+    auto params = llama_model_params{};
+    params.use_mmap = false;
+    params.progress_callback = [](float progress, void * ctx) {
+        (void) ctx;
+        fprintf(stderr, "%.2f%% ", progress * 100.0f);
+        // true means: Don't cancel the load
+        return true;
+    };
+
+    printf("Loading model from %zu files\n", files.size());
+
+    std::vector<const char *> file_paths;
+    for (size_t i = 0; i < files.size(); i++) {
+        printf("Found file %s \n", files[i].path.c_str());
+        file_paths.push_back(files[i].path.c_str());
+    }
+
+    const char * async_load_context = "test-model-load";
+    std::thread fulfill_thread([&files, &tensor_list_file, &async_load_context]() {
+        const bool success = llama_model_load_fulfill_split_future(tensor_list_file.path.c_str(), async_load_context,
+                                                                   std::move(tensor_list_file.streambuf));
+        printf("Fulfilling tensor list file %s: %s\n", tensor_list_file.path.c_str(), success ? "success" : "failure");
+        if (!success) {
+            exit(EXIT_FAILURE);
+        }
+        for (size_t i = 0; i < files.size(); i++) {
+            const bool success = llama_model_load_fulfill_split_future(files[i].path.c_str(), async_load_context,
+                                                                       std::move(files[i].streambuf));
+            printf("Fulfilling file %s: %s\n", files[i].path.c_str(), success ? "success" : "failure");
+            if (!success) {
+                exit(EXIT_FAILURE);
+            }
+        }
+    });
+    fprintf(stderr, "Loading model from splits\n");
+    auto * model = llama_model_load_from_split_futures(file_paths.data(), file_paths.size(), async_load_context,
+                                                       tensor_list_file.path.c_str(), params);
+    fulfill_thread.join();
+
+    fprintf(stderr, "\n");
+
+    if (model == nullptr) {
+        fprintf(stderr, "Failed to load model\n");
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    fprintf(stderr, "Model loaded successfully\n");
+    llama_model_free(model);
+    llama_backend_free();
+
+    return EXIT_SUCCESS;
+}
1 parent a726b69 commit 71c63a4
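
The gguf.h hunk in the commit message above relaxes the guard around gguf_init_from_buffer from "C++17 or later" to "any C++ translation unit"; the function itself, which parses GGUF data out of a std::basic_streambuf<char> instead of a file path, is unchanged. As a rough sketch of how a caller might use it once the guard is relaxed (the init_from_memory helper, the blob argument, and the choice of std::stringbuf are illustrative assumptions, not part of this commit):

// Sketch only: hand an in-memory GGUF blob to gguf_init_from_buffer.
// Assumes `blob` already holds a complete GGUF file image.
#include <sstream>
#include <string>

#include "gguf.h"

struct gguf_context * init_from_memory(const std::string & blob) {
    // Wrap the in-memory bytes in a standard, read-only streambuf.
    std::stringbuf buf(blob, std::ios::in);

    struct gguf_init_params params = {
        /*no_alloc =*/ true,   // only read metadata, do not allocate tensor data
        /*ctx      =*/ nullptr,
    };

    // Declared in gguf.h; this commit only relaxes its C++17 guard.
    return gguf_init_from_buffer(buf, params);
}
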

File tree

2 files changed (+84, -0 lines)

tests/CMakeLists.txt
Lines changed: 1 addition & 0 deletions

@@ -206,6 +206,7 @@ llama_build_and_test(test-backend-ops.cpp)
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-model-load-disk.cpp LABEL "model")
 llama_build_and_test(test-model-load-memory.cpp LABEL "model")
+llama_build_and_test(test-model-load-memory-split.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp LABEL "model")

 if (NOT GGML_BACKEND_DL)
tests/test-model-load-memory-split.cpp (new file)
Lines changed: 83 additions & 0 deletions

#include "common.h"
#include "get-model.h"
#include "llama-cpp.h"
#include "load-into-memory.h"

#include <cstdlib>
#include <thread>
#include <vector>

using namespace common_load_into_memory;

int main(int argc, char * argv[]) {
    auto * model_path = get_model_or_exit(argc, argv);

    if (!is_split_file(model_path)) {
        printf("Skipping not-split model %s\n", model_path);
        return EXIT_SUCCESS;
    }

    // Manually load into a memory buffer first
    llama_file_entry tensor_list_file = load_tensor_list_file(model_path);
    std::vector<llama_file_entry> files = load_files_into_streambuf(model_path);

    llama_backend_init();
    auto params = llama_model_params{};
    params.use_mmap = false;

    // Use CPU-only mode if no GPU devices are available
    if (!common_has_gpu_devices()) {
        params.main_gpu = -1;
    }

    params.progress_callback = [](float progress, void * ctx) {
        (void) ctx;
        fprintf(stderr, "%.2f%% ", progress * 100.0f);
        // true means: Don't cancel the load
        return true;
    };

    printf("Loading model from %zu files\n", files.size());

    std::vector<const char *> file_paths;
    for (size_t i = 0; i < files.size(); i++) {
        printf("Found file %s \n", files[i].path.c_str());
        file_paths.push_back(files[i].path.c_str());
    }

    const char * async_load_context = "test-model-load";
    std::thread fulfill_thread([&files, &tensor_list_file, &async_load_context]() {
        const bool success = llama_model_load_fulfill_split_future(tensor_list_file.path.c_str(), async_load_context,
                                                                   std::move(tensor_list_file.streambuf));
        printf("Fulfilling tensor list file %s: %s\n", tensor_list_file.path.c_str(), success ? "success" : "failure");
        if (!success) {
            exit(EXIT_FAILURE);
        }
        for (size_t i = 0; i < files.size(); i++) {
            const bool success = llama_model_load_fulfill_split_future(files[i].path.c_str(), async_load_context,
                                                                       std::move(files[i].streambuf));
            printf("Fulfilling file %s: %s\n", files[i].path.c_str(), success ? "success" : "failure");
            if (!success) {
                exit(EXIT_FAILURE);
            }
        }
    });
    fprintf(stderr, "Loading model from splits\n");
    auto * model = llama_model_load_from_split_futures(file_paths.data(), file_paths.size(), async_load_context,
                                                       tensor_list_file.path.c_str(), params);
    fulfill_thread.join();

    fprintf(stderr, "\n");

    if (model == nullptr) {
        fprintf(stderr, "Failed to load model\n");
        llama_backend_free();
        return EXIT_FAILURE;
    }

    fprintf(stderr, "Model loaded successfully\n");
    llama_model_free(model);
    llama_backend_free();

    return EXIT_SUCCESS;
}
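
The test relies on helpers from load-into-memory.h (llama_file_entry, load_files_into_streambuf, load_tensor_list_file) to produce the streambufs that the fulfill calls hand over. For readers without that header, here is a minimal sketch of one way such an object could be built over bytes that already sit in memory, using only standard C++; the memory_streambuf class name and the zero-copy get-area approach are illustrative assumptions, not code from this commit:

#include <streambuf>
#include <vector>

// Illustrative only: a read-only streambuf exposing an owned byte buffer.
// A helper like load_files_into_streambuf() could return something along
// these lines for each split file.
class memory_streambuf : public std::streambuf {
  public:
    explicit memory_streambuf(std::vector<char> data) : data_(std::move(data)) {
        // Point the get area at the owned buffer: begin, current, end.
        setg(data_.data(), data_.data(), data_.data() + data_.size());
    }

  protected:
    // Support seeking, which a GGUF/model parser typically needs.
    pos_type seekoff(off_type off, std::ios_base::seekdir dir,
                     std::ios_base::openmode which) override {
        if (!(which & std::ios_base::in)) {
            return pos_type(off_type(-1));
        }
        char * base   = data_.data();
        char * target = gptr();
        if (dir == std::ios_base::beg) { target = base + off; }
        if (dir == std::ios_base::cur) { target = gptr() + off; }
        if (dir == std::ios_base::end) { target = base + data_.size() + off; }
        if (target < base || target > base + data_.size()) {
            return pos_type(off_type(-1));
        }
        setg(base, target, base + data_.size());
        return pos_type(target - base);
    }

    pos_type seekpos(pos_type pos, std::ios_base::openmode which) override {
        return seekoff(off_type(pos), std::ios_base::beg, which);
    }

  private:
    std::vector<char> data_;
};

In the test, each streambuf is moved into llama_model_load_fulfill_split_future from the fulfill thread while the main thread sits in llama_model_load_from_split_futures, so ownership of the in-memory bytes passes to the loader without the model data ever being written to disk.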
