
Commit bfa84c3

Merge pull request #1 from jesusmb1995/jmb/memory_load_pr

QVAC-3697: Load GGUF File From Buffer

2 parents: 4fb2556 + cd1b485
38 files changed: +1957 -195 lines
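In short: common_init_from_params() is split in two so the shared context setup can be reused with a model loaded by other means; a new header-only llama-common-test library adds helpers for loading GGUF models from memory buffers or split-file futures; and the embedding and simple examples are wired to exercise those paths via LLAMA_EXAMPLE_* environment variables.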

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -192,6 +192,10 @@ if (LLAMA_BUILD_COMMON)
     add_subdirectory(common)
 endif()
 
+if(LLAMA_BUILD_EXAMPLES OR LLAMA_BUILD_TESTS)
+    add_subdirectory(common_test)
+endif()
+
 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     include(CTest)
     add_subdirectory(tests)
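The new common_test subdirectory is only added when examples or tests are being built, so plain library builds pick up no new targets.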

common/common.cpp

Lines changed: 14 additions & 9 deletions
@@ -899,15 +899,7 @@ std::string fs_get_cache_file(const std::string & filename) {
 // Model utils
 //
 
-struct common_init_result common_init_from_params(common_params & params) {
-    common_init_result iparams;
-    auto mparams = common_model_params_to_llama(params);
-
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
-    if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
-        return iparams;
-    }
+struct common_init_result common_init_from_model_and_params(llama_model* model, common_init_result iparams, common_params & params) {
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
@@ -1074,6 +1066,19 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
+struct common_init_result common_init_from_params(common_params & params) {
+    common_init_result iparams;
+    auto mparams = common_model_params_to_llama(params);
+
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        return iparams;
+    }
+
+    return common_init_from_model_and_params(model, std::move(iparams), params);
+}
+
 std::string get_model_endpoint() {
     const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
     // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
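The refactor makes the model source pluggable: everything after model creation now lives in common_init_from_model_and_params(). Below is a minimal sketch (not part of the diff) of how a caller could feed it a model loaded from memory instead of a path; it uses only functions visible in this PR, including the new llama_model_load_from_buffer().

// Sketch: load a GGUF image that is already in memory, then reuse the
// shared context setup from common_init_from_model_and_params().
#include <cstdint>
#include <utility>
#include <vector>

#include "common.h"
#include "llama.h"

static common_init_result init_from_memory(std::vector<uint8_t> buffer, common_params & params) {
    common_init_result iparams;
    llama_model_params mparams = common_model_params_to_llama(params);

    // ownership of the buffer moves into the library
    llama_model * model = llama_model_load_from_buffer(std::move(buffer), mparams);
    if (model == NULL) {
        return iparams;  // empty result signals failure, mirroring common_init_from_params()
    }

    return common_init_from_model_and_params(model, std::move(iparams), params);
}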

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -567,6 +567,8 @@ struct common_init_result {
 };
 
 struct common_init_result common_init_from_params(common_params & params);
+struct common_init_result common_init_from_model_and_params(llama_model * model, common_init_result iparams,
+                                                            common_params & params);
 
 struct llama_model_params common_model_params_to_llama ( common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
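Note that iparams is taken by value: callers move a (possibly empty) common_init_result in and get it back populated, so the owning smart pointers inside the struct change hands without copies; this matches the std::move(iparams) at the call sites above.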

common_test/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+# common_test library for load_into_memory.h and uint8-buff-stream.h
+
+set(TARGET llama-common-test)
+
+add_library(${TARGET} INTERFACE)
+
+target_include_directories(${TARGET} INTERFACE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+target_compile_definitions(${TARGET} INTERFACE LLAMA_COMMON_TEST_HEADERS)
+
+target_compile_features(${TARGET} INTERFACE cxx_std_17)
+
+target_link_libraries(${TARGET} INTERFACE common)
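Since llama-common-test is an INTERFACE library, linking against it compiles nothing extra: it only propagates the include directory, the LLAMA_COMMON_TEST_HEADERS define, and the C++17 requirement. That define is what activates the #ifdef LLAMA_COMMON_TEST_HEADERS blocks in the examples below.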

common_test/load_into_memory.h

Lines changed: 220 additions & 0 deletions
@@ -0,0 +1,220 @@
+#pragma once
+
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include <streambuf>
+#include <string>
+#include <thread>
+#include <vector>
+
+// header-only utilities to showcase how to directly load a model from memory
+#include "uint8-buff-stream-wrapper.h"
+
+namespace {
+bool is_split_file(const char * const model_path) {
+    if (!model_path) {
+        fprintf(stderr, "No model file provided\n");
+        exit(EXIT_FAILURE);
+    }
+
+    std::string path(model_path);
+    return path.find("-of-") != std::string::npos;
+}
+
+std::vector<uint8_t> load_file_into_buffer(const char * const model_path) {
+    std::ifstream file_stream(model_path, std::ios::binary | std::ios::ate);
+    if (!file_stream) {
+        fprintf(stderr, "Failed to open file %s for reading into streambuf\n", model_path);
+        exit(EXIT_FAILURE);
+    }
+
+    const size_t file_size = file_stream.tellg();
+    file_stream.seekg(0, std::ios::beg);
+
+    static_assert(sizeof(std::uint8_t) == sizeof(char), "uint8_t must be same size as char");
+    std::vector<std::uint8_t> buffer(file_size);
+    if (!file_stream.read((char *) buffer.data(), file_size)) {
+        fprintf(stderr, "Failed to read entire file into buffer\n");
+        exit(EXIT_FAILURE);
+    }
+
+    return buffer;
+}
+
+std::unique_ptr<std::basic_streambuf<uint8_t>> load_file_into_streambuf(const char * const model_path) {
+    return std::make_unique<Uint8BufferStreamBuf>(load_file_into_buffer(model_path));
+}
+
+struct file_entry {
+    std::string path;
+    std::unique_ptr<std::basic_streambuf<uint8_t>> streambuf;
+};
+
+std::vector<file_entry> load_files_into_streambuf(const char * const model_path) {
+    std::vector<file_entry> files;
+
+    // Extract pattern from first file path
+    std::string path(model_path);
+
+    // Split by '-'
+    std::vector<std::string> parts;
+    std::stringstream ss(path);
+    std::string item;
+    while (std::getline(ss, item, '-')) {
+        parts.push_back(item);
+    }
+
+    // Split the last part by '.'
+    std::string last_part = parts.back();
+    parts.pop_back();
+    size_t dot_pos = last_part.find('.');
+    if (dot_pos != std::string::npos) {
+        parts.push_back(last_part.substr(0, dot_pos));
+        parts.push_back(last_part.substr(dot_pos + 1));  // extension
+    } else {
+        parts.push_back(last_part);
+    }
+
+    // Check if we have enough parts
+    if (parts.size() < 4) {
+        fprintf(stderr, "Model path does not contain expected pattern\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Get total files from [-2] position (before the extension)
+    int total_files = std::stoi(parts[parts.size() - 2]);
+
+    // Get base path by joining all parts except -start-of-end.gguf
+    std::string base_path;
+    for (size_t i = 0; i < parts.size() - 4; i++) {
+        if (i > 0) {
+            base_path += "-";
+        }
+        base_path += parts[i];
+    }
+
+    for (int i = 1; i <= total_files; i++) {
+        char numbered_path[1024];
+        snprintf(numbered_path, sizeof(numbered_path), "%s-%05d-of-%05d.gguf", base_path.c_str(), i, total_files);
+
+        files.push_back({ numbered_path, load_file_into_streambuf(numbered_path) });
+    }
+
+    return files;
+}
+
+file_entry load_tensor_list_file(const char * const model_path) {
+    std::string path(model_path);
+
+    // Split by '-'
+    std::vector<std::string> parts;
+    std::stringstream ss(path);
+    std::string item;
+    while (std::getline(ss, item, '-')) {
+        parts.push_back(item);
+    }
+
+    // Split the last part by '.'
+    std::string last_part = parts.back();
+    parts.pop_back();
+    size_t dot_pos = last_part.find('.');
+    if (dot_pos != std::string::npos) {
+        parts.push_back(last_part.substr(0, dot_pos));
+        parts.push_back(last_part.substr(dot_pos + 1));  // extension
+    } else {
+        parts.push_back(last_part);
+    }
+
+    // Check if we have enough parts
+    if (parts.size() < 4) {
+        fprintf(stderr, "Model path does not contain expected pattern\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Get base path by joining all parts except -start-of-end.gguf
+    std::string base_path;
+    for (size_t i = 0; i < parts.size() - 4; i++) {
+        if (i > 0) {
+            base_path += "-";
+        }
+        base_path += parts[i];
+    }
+
+    // Construct tensor list file path
+    std::string tensor_list_path = base_path + ".tensors.txt";
+
+    printf("Loading tensor list file: %s\n", tensor_list_path.c_str());
+    return { tensor_list_path, load_file_into_streambuf(tensor_list_path.c_str()) };
+}
+
+llama_model * load_model_from_memory_configuration(const char * model_path, llama_model_params & model_params) {
+    llama_model * model;
+    std::chrono::steady_clock::time_point load_start_time;
+    if (getenv("LLAMA_EXAMPLE_MEMORY_BUFFER")) {
+        std::vector<uint8_t> buffer = load_file_into_buffer(model_path);
+        fprintf(stdout, "%s: loading model from memory buffer\n", __func__);
+        load_start_time = std::chrono::steady_clock::now();
+        model = llama_model_load_from_buffer(std::move(buffer), model_params);
+    } else if (getenv("LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT")) {
+        file_entry tensor_list_file = load_tensor_list_file(model_path);
+        std::vector<file_entry> files = load_files_into_streambuf(model_path);
+        fprintf(stdout, "%s: loading model from %zu file streambufs\n", __func__, files.size());
+
+        std::vector<const char *> file_paths;
+        for (const auto & file : files) {
+            printf("Found file %s with streambuf\n", file.path.c_str());
+            file_paths.push_back(file.path.c_str());
+        }
+
+        load_start_time = std::chrono::steady_clock::now();
+        const char * async_load_context = "test-model-load";
+        std::thread fulfill_thread([&files, &tensor_list_file, &async_load_context]() {
+            const bool success = llama_model_load_fulfill_split_future(
+                tensor_list_file.path.c_str(), async_load_context, std::move(tensor_list_file.streambuf));
+            printf("Fulfilling tensor list file %s: %s\n", tensor_list_file.path.c_str(),
+                   success ? "success" : "failure");
+            if (!success) {
+                exit(EXIT_FAILURE);
+            }
+
+            for (auto & file : files) {
+                const bool success = llama_model_load_fulfill_split_future(file.path.c_str(), async_load_context,
+                                                                           std::move(file.streambuf));
+                printf("Fulfilling file %s with streambuf: %s\n", file.path.c_str(), success ? "success" : "failure");
+                if (!success) {
+                    exit(EXIT_FAILURE);
+                }
+            }
+        });
+        fprintf(stderr, "Loading model from splits\n");
+        model = llama_model_load_from_split_futures(file_paths.data(), file_paths.size(), async_load_context,
+                                                    tensor_list_file.path.c_str(), model_params);
+        fulfill_thread.join();
+    } else if (getenv("LLAMA_EXAMPLE_FROM_FILE")) {
+        load_start_time = std::chrono::steady_clock::now();
+        model = llama_model_load_from_file(model_path, model_params);
+    } else {
+        return nullptr;
+    }
+
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        exit(1);
+    }
+    std::chrono::steady_clock::time_point load_end_time = std::chrono::steady_clock::now();
+    std::chrono::duration<double> load_duration = load_end_time - load_start_time;
+    fprintf(stdout, "%s: loading model took %f seconds\n", __func__, load_duration.count());
+    return model;
+}
+
+bool memory_configuration_env_is_set() {
+    return getenv("LLAMA_EXAMPLE_MEMORY_BUFFER") || getenv("LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT") ||
+           getenv("LLAMA_EXAMPLE_FROM_FILE");
+}
+}  // namespace
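A hypothetical standalone driver (not part of this PR) shows how the helpers compose; llama_model_default_params() and llama_model_free() are the stock llama.h API, everything else comes from this header:

// Sketch: pick a load path via the LLAMA_EXAMPLE_* environment variables,
// falling back to plain file loading when none is set.
#include <cstdio>

#include "llama.h"
#include "load_into_memory.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_model_params mparams = llama_model_default_params();

    // load_model_from_memory_configuration() exits on failure and returns
    // nullptr only when no LLAMA_EXAMPLE_* variable is set
    llama_model * model = memory_configuration_env_is_set()
        ? load_model_from_memory_configuration(argv[1], mparams)
        : llama_model_load_from_file(argv[1], mparams);

    // ... run inference with the model ...

    llama_model_free(model);
    return 0;
}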

common_test/uint8-buff-stream-wrapper.h

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+#pragma once
+
+// Wrapper to include the specific header from src
+#include "uint8-buff-stream.h"
+

examples/embedding/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 set(TARGET llama-embedding)
 add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama llama-common-test ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

examples/embedding/embedding.cpp

Lines changed: 28 additions & 5 deletions
@@ -1,15 +1,25 @@
+#include <algorithm>
+#include <chrono>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <thread>
+#include <vector>
+
 #include "arg.h"
 #include "common.h"
+#include "llama-cpp.h"
 #include "log.h"
-#include "llama.h"
-
-#include <ctime>
-#include <algorithm>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#ifdef LLAMA_COMMON_TEST_HEADERS
+#include "load_into_memory.h"
+#endif
+
 static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
     std::vector<std::string> lines;
     size_t start = 0;
@@ -94,7 +104,20 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    common_init_result llama_init;
+
+#ifdef LLAMA_COMMON_TEST_HEADERS
+    if (memory_configuration_env_is_set()) {
+        llama_model_params mparams = common_model_params_to_llama(params);
+        common_init_result iparams;
+        llama_model * model = load_model_from_memory_configuration(params.model.path.c_str(), mparams);
+        llama_init = common_init_from_model_and_params(model, std::move(iparams), params);
+    } else {
+        llama_init = common_init_from_params(params);
+    }
+#else
+    llama_init = common_init_from_params(params);
+#endif
 
     llama_model * model = llama_init.model.get();
     llama_context * ctx = llama_init.context.get();
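With this guard the example behaves exactly as before unless one of the test environment variables is set; an illustrative invocation such as LLAMA_EXAMPLE_MEMORY_BUFFER=1 llama-embedding -m model.gguf -p "hello" first reads the whole GGUF file into a std::vector<uint8_t> and then hands ownership of the buffer to llama_model_load_from_buffer().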

examples/simple/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 set(TARGET llama-simple)
 add_executable(${TARGET} simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama llama-common-test ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
