Commit ac7eddf

[mbuffer] Expose single-buffer loading to Llama interface
Add a new C++ function to the Llama main header that loads a model from a single memory buffer, and propagate the change through the internal calls and constructors.
1 parent 81ec564 commit ac7eddf

File tree

5 files changed: +54 -21 lines

include/llama-cpp.h
src/llama-model-loader.cpp
src/llama-model-loader.h
src/llama-quant.cpp
src/llama.cpp

5 files changed

+54
-21
lines changed

include/llama-cpp.h

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,7 @@
 #endif

 #include <memory>
+#include <vector>

 #include "llama.h"

@@ -28,3 +29,5 @@ typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
 typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
 typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
 typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;
+
+struct llama_model * llama_model_load_from_buffer(std::vector<uint8_t> && data, struct llama_model_params params);
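
With this declaration in place, a caller can hand an in-memory GGUF image directly to the library. A minimal usage sketch, not part of the commit: the model path is hypothetical, and llama_model_ptr / llama_model_default_params() come from the existing llama-cpp.h / llama.h API.

// Minimal usage sketch (not part of this commit). Assumes a single-file,
// non-split GGUF model and default model parameters.
#include <cstdio>
#include <fstream>
#include <iterator>
#include <utility>
#include <vector>

#include "llama-cpp.h"

int main() {
    // Hypothetical path; read the whole file into an owned byte buffer.
    std::ifstream in("model.gguf", std::ios::binary);
    std::vector<uint8_t> data((std::istreambuf_iterator<char>(in)),
                              std::istreambuf_iterator<char>());

    // Ownership of the bytes moves into the loader; mmap is disabled internally.
    llama_model_params params = llama_model_default_params();
    llama_model_ptr model(llama_model_load_from_buffer(std::move(data), params));
    if (!model) {
        std::fprintf(stderr, "failed to load model from buffer\n");
        return 1;
    }
    return 0;
}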

src/llama-model-loader.cpp

Lines changed: 10 additions & 7 deletions
@@ -8,6 +8,7 @@
 #include <cinttypes>
 #include <cstring>
 #include <future>
+#include <stdexcept>

 static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
@@ -493,8 +494,7 @@ namespace GGUFMeta {
 }

 llama_model_loader::llama_model_loader(
-        const std::string & fname,
-        std::vector<std::string> & splits,
+        load_input_t load_input,
         bool use_mmap,
         bool check_tensors,
         const llama_model_kv_override * param_overrides_p,
@@ -513,7 +513,7 @@ llama_model_loader::llama_model_loader(
     tensor_buft_overrides = param_tensor_buft_overrides_p;

     struct ggml_context * ctx = NULL;
-    gguf_file_load main_gguf(&ctx, load_input_variant::fname_load_input{fname, splits});
+    gguf_file_load main_gguf(&ctx, load_input);
     process_loaded_gguf(ctx, main_gguf, 0);

     meta = std::move(main_gguf.meta);
@@ -525,18 +525,21 @@ llama_model_loader::llama_model_loader(
     get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

     // Load additional GGML contexts
-    if (n_split > 1) {
+    if (load_input_variant::variant_supports_split_load(load_input) && n_split > 1) {
+        load_input_variant::fname_load_input base_split = load_input_variant::split_name_from_variant(load_input);
+        std::vector<std::string> & splits = base_split.splits;
+
         // make sure the main file is loaded first
         uint16_t idx = 0;
         const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
         get_key(kv_split_no, idx);
         if (idx != 0) {
-            throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
+            throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, base_split.fname.c_str()));
         }

         // generate list of splits if needed
         if (splits.empty()) {
-            splits = llama_get_list_splits(fname, idx, n_split);
+            splits = llama_get_list_splits(base_split.fname, idx, n_split);
         }

         // in case user give a custom list of splits, check if it matches the expected number
@@ -589,7 +592,7 @@ llama_model_loader::llama_model_loader(
     fver = (enum llama_fver) gguf_get_version(meta.get());

     LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
-        __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
+        __func__, n_kv, n_tensors, load_input_variant::identifier(load_input), llama_file_version_name(fver));

     // determine file type based on the number of tensors for each quantization and print meta data
     // TODO: make optional

src/llama-model-loader.h

Lines changed: 1 addition & 2 deletions
@@ -95,8 +95,7 @@ struct llama_model_loader {
     void process_loaded_gguf(struct ggml_context * ctx, gguf_file_load & gguf_load, uint16_t idx);

     llama_model_loader(
-        const std::string & fname,
-        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
+        load_input_t load_input,
         bool use_mmap,
         bool check_tensors,
         const llama_model_kv_override * param_overrides_p,

src/llama-quant.cpp

Lines changed: 2 additions & 1 deletion
@@ -583,7 +583,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }

     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
+    load_input_variant::fname_load_input inp{fname_inp, splits};
+    llama_model_loader ml(inp, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching

     llama_model model(llama_model_default_params());

src/llama.cpp

Lines changed: 38 additions & 11 deletions
@@ -9,18 +9,24 @@

 #include "ggml.h"
 #include "ggml-backend.h"
+#include "uint8-buff-stream.h"

 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <stdexcept>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+#ifdef __cplusplus
+#include "llama-cpp.h"
+#endif
+
 //
 // interface implementation
 //
@@ -84,7 +90,7 @@ int64_t llama_time_us(void) {
 }

 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+static int llama_model_load(llama_model_loader & ml, llama_model & model, llama_model_params & params) {
     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = 0;
@@ -93,8 +99,6 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;

     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
-
         ml.print_info();

         model.hparams.vocab_only = params.vocab_only;
@@ -135,8 +139,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
 }

 static struct llama_model * llama_model_load_from_file_impl(
-        const std::string & path_model,
-        std::vector<std::string> & splits,
+        llama_model_loader& ml,
         struct llama_model_params params) {
     ggml_time_init();

@@ -218,7 +221,7 @@ static struct llama_model * llama_model_load_from_file_impl(
         LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
     }

-    const int status = llama_model_load(path_model, splits, *model, params);
+    const int status = llama_model_load(ml, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
         if (status == -1) {
@@ -241,11 +244,34 @@ struct llama_model * llama_load_model_from_file(
     return llama_model_load_from_file(path_model, params);
 }

-struct llama_model * llama_model_load_from_file(
-        const char * path_model,
-        struct llama_model_params params) {
+static llama_model_loader create_disk_fileloader(const char * path_model, std::vector<std::string> & splits,
+        struct llama_model_params params) {
+    load_input_variant::fname_load_input loader_input{ path_model, splits };
+    return llama_model_loader(loader_input, params.use_mmap, params.check_tensors, params.kv_overrides,
+        params.tensor_buft_overrides);
+}
+
+struct llama_model * llama_model_load_from_file(const char * path_model, struct llama_model_params params) {
     std::vector<std::string> splits = {};
-    return llama_model_load_from_file_impl(path_model, splits, params);
+    llama_model_loader ml = create_disk_fileloader(path_model, splits, params);
+    return llama_model_load_from_file_impl(ml, params);
+}
+
+namespace {
+void override_and_disable_mmap(struct llama_model_params & params) {
+    if (params.use_mmap) {
+        LLAMA_LOG_WARN("Overriding and disabling memory mapping when loading from memory buffer\n");
+        params.use_mmap = false;
+    }
+}
+} // namespace
+
+struct llama_model * llama_model_load_from_buffer(std::vector<uint8_t> && data, struct llama_model_params params) {
+    std::unique_ptr<std::basic_streambuf<uint8_t>> streambuf = std::make_unique<Uint8BufferStreamBuf>(std::move(data));
+    override_and_disable_mmap(params);
+    llama_model_loader ml(load_input_variant::buffer_load_input{ streambuf }, params.use_mmap, params.check_tensors,
+        params.kv_overrides, params.tensor_buft_overrides);
+    return llama_model_load_from_file_impl(ml, params);
 }

 namespace {
@@ -268,7 +294,8 @@ struct llama_model * llama_model_load_from_splits(const char ** paths, size_t n_
     if (splits.empty()) {
         return nullptr;
     }
-    return llama_model_load_from_file_impl(splits.front(), splits, params);
+    llama_model_loader ml = create_disk_fileloader(splits.front().c_str(), splits, params);
+    return llama_model_load_from_file_impl(ml, params);
 }

 void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
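
llama_model_load_from_buffer wraps the moved-in bytes in a Uint8BufferStreamBuf from the new uint8-buff-stream.h header, which this diff does not show. A minimal sketch of what such a stream buffer could look like, assuming a usable std::char_traits<uint8_t> is available (the real header may provide its own traits or a different layout):

// Illustrative sketch only -- the committed uint8-buff-stream.h may differ.
// Owns a byte vector and exposes it as the get area of a basic_streambuf,
// so GGUF metadata can be read from memory instead of from a file.
#include <cstdint>
#include <streambuf>
#include <utility>
#include <vector>

class Uint8BufferStreamBuf : public std::basic_streambuf<uint8_t> {
public:
    explicit Uint8BufferStreamBuf(std::vector<uint8_t> && buf) : data(std::move(buf)) {
        uint8_t * base = data.data();
        setg(base, base, base + data.size()); // whole buffer readable, nothing consumed yet
    }

private:
    std::vector<uint8_t> data;
};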
