@@ -9,18 +9,24 @@
 
 #include "ggml.h"
 #include "ggml-backend.h"
+#include "uint8-buff-stream.h"
 
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <stdexcept>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#ifdef __cplusplus
+#include "llama-cpp.h"
+#endif
+
 //
 // interface implementation
 //
@@ -84,7 +90,7 @@ int64_t llama_time_us(void) {
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+static int llama_model_load(llama_model_loader & ml, llama_model & model, llama_model_params & params) {
     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = 0;
@@ -93,8 +99,6 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
-
         ml.print_info();
 
         model.hparams.vocab_only = params.vocab_only;
@@ -135,8 +139,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
 }
 
 static struct llama_model * llama_model_load_from_file_impl(
-        const std::string & path_model,
-        std::vector<std::string> & splits,
+        llama_model_loader & ml,
         struct llama_model_params params) {
     ggml_time_init();
 
@@ -218,7 +221,7 @@ static struct llama_model * llama_model_load_from_file_impl(
         LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
     }
 
-    const int status = llama_model_load(path_model, splits, *model, params);
+    const int status = llama_model_load(ml, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
         if (status == -1) {
@@ -241,11 +244,34 @@ struct llama_model * llama_load_model_from_file(
     return llama_model_load_from_file(path_model, params);
 }
 
-struct llama_model * llama_model_load_from_file(
-        const char * path_model,
-        struct llama_model_params params) {
+static llama_model_loader create_disk_fileloader(const char * path_model, std::vector<std::string> & splits,
+                                                 struct llama_model_params params) {
+    load_input_variant::fname_load_input loader_input{ path_model, splits };
+    return llama_model_loader(loader_input, params.use_mmap, params.check_tensors, params.kv_overrides,
+                              params.tensor_buft_overrides);
+}
+
+struct llama_model * llama_model_load_from_file(const char * path_model, struct llama_model_params params) {
     std::vector<std::string> splits = {};
-    return llama_model_load_from_file_impl(path_model, splits, params);
+    llama_model_loader ml = create_disk_fileloader(path_model, splits, params);
+    return llama_model_load_from_file_impl(ml, params);
+}
+
+namespace {
+void override_and_disable_mmap(struct llama_model_params & params) {
+    if (params.use_mmap) {
+        LLAMA_LOG_WARN("Overriding and disabling memory mapping when loading from memory buffer\n");
+        params.use_mmap = false;
+    }
+}
+} // namespace
+
+struct llama_model * llama_model_load_from_buffer(std::vector<uint8_t> && data, struct llama_model_params params) {
+    std::unique_ptr<std::basic_streambuf<uint8_t>> streambuf = std::make_unique<Uint8BufferStreamBuf>(std::move(data));
+    override_and_disable_mmap(params);
+    llama_model_loader ml(load_input_variant::buffer_load_input{ streambuf }, params.use_mmap, params.check_tensors,
+                          params.kv_overrides, params.tensor_buft_overrides);
+    return llama_model_load_from_file_impl(ml, params);
 }
 
 namespace {
@@ -268,7 +294,8 @@ struct llama_model * llama_model_load_from_splits(const char ** paths, size_t n_
     if (splits.empty()) {
         return nullptr;
     }
-    return llama_model_load_from_file_impl(splits.front(), splits, params);
+    llama_model_loader ml = create_disk_fileloader(splits.front().c_str(), splits, params);
+    return llama_model_load_from_file_impl(ml, params);
 }
 
 void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
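
For reference, a minimal caller sketch of the new buffer-based entry point. This is an illustration only, not part of the diff: it assumes llama_model_load_from_buffer is exposed to C++ callers alongside the existing llama_model_default_params() and llama_model_free(), and the "model.gguf" path is a placeholder.

// Illustrative usage sketch (not part of the change): load a model from an
// in-memory copy of a GGUF file via the new buffer-based API.
#include <cstdint>
#include <fstream>
#include <iterator>
#include <vector>

#include "llama.h"

int main() {
    // "model.gguf" is a placeholder path; read the whole file into memory.
    std::ifstream file("model.gguf", std::ios::binary);
    std::vector<uint8_t> data((std::istreambuf_iterator<char>(file)),
                               std::istreambuf_iterator<char>());

    llama_model_params params = llama_model_default_params();
    // params.use_mmap is irrelevant here: the loader disables it for buffers
    // and logs a warning.
    llama_model * model = llama_model_load_from_buffer(std::move(data), params);
    if (model == nullptr) {
        return 1;
    }

    llama_model_free(model);
    return 0;
}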