Commit bc2b3d2

[fbuffers] Incremental model load
Adapt the loader and the model load path to load split files incrementally and upload each split's tensors as soon as that split's tensors have all been created.
1 parent 8a11951 commit bc2b3d2

File tree

4 files changed: +98 -33 lines

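The gist of the change, as implied by the hunks below: tensor creation is grouped by the split file each tensor comes from, and a split's backend buffers are created (and the split released) as soon as its last tensor has been created, rather than allocating everything after the full tensor pass. A minimal, self-contained sketch of that idea, using only hypothetical stand-in types rather than the llama.cpp API:

// Illustrative sketch only (hypothetical stand-in types, not the llama.cpp API):
// tensors are grouped by the split file they come from; as soon as the last
// tensor of a split has been created, that split's buffers are allocated and
// the split is released, instead of allocating everything after the full pass.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Split {
    std::vector<std::string> tensor_names; // tensors expected from this split
    size_t created = 0;                    // tensors created so far
};

int main() {
    std::map<int, Split> splits;
    splits[0] = {{"tok_embd.weight", "output_norm.weight"}, 0};
    splits[1] = {{"blk.0.attn_q.weight", "blk.0.attn_k.weight"}, 0};

    for (auto & [idx, split] : splits) {
        for (const auto & name : split.tensor_names) {
            (void) name;          // stand-in for creating the tensor metadata
            ++split.created;
            if (split.created == split.tensor_names.size()) {
                // stand-in for create_split_backend_buffers(idx, ...) + release_split(...)
                std::printf("split %d complete -> allocate its buffers and release it\n", idx);
            }
        }
    }
    return 0;
}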

src/llama-model-loader.cpp

Lines changed: 31 additions & 27 deletions

@@ -1,11 +1,12 @@
 #include "llama-model-loader.h"
 
 #include "ggml.h"
-#include "llama-model-load-input.h"
+#include "llama-mmap.h"
 #include "llama-model-load.h"
 
 #include <array>
 #include <cinttypes>
+#include <cstdint>
 #include <cstring>
 #include <future>
 #include <stdexcept>
@@ -512,9 +513,16 @@ llama_model_loader::llama_model_loader(
 
     tensor_buft_overrides = param_tensor_buft_overrides_p;
 
+    std::optional<std::set<std::string>> tensor_list = load_input_variant::parse_tensor_list_from_future(load_input);
+
     struct ggml_context * ctx = NULL;
     gguf_file_load main_gguf(&ctx, load_input);
-    process_loaded_gguf(ctx, main_gguf, 0);
+
+    if (load_input_variant::variant_supports_split_load_from_memory(load_input)) {
+        incremental_splits_tensor_load.emplace(ctx, *this, main_gguf, std::move(*tensor_list));
+    } else {
+        process_loaded_gguf(ctx, main_gguf, 0);
+    }
 
     meta = std::move(main_gguf.meta);
 
@@ -526,8 +534,8 @@ llama_model_loader::llama_model_loader(
 
     // Load additional GGML contexts
     if (load_input_variant::variant_supports_split_load(load_input) && n_split > 1) {
+
         load_input_variant::fname_load_input base_split = load_input_variant::split_name_from_variant(load_input);
-        std::vector<std::string> & splits = base_split.splits;
 
         // make sure the main file is loaded first
         uint16_t idx = 0;
@@ -538,13 +546,13 @@ llama_model_loader::llama_model_loader(
         }
 
         // generate list of splits if needed
-        if (splits.empty()) {
-            splits = llama_get_list_splits(base_split.fname, idx, n_split);
+        if (base_split.splits.empty()) {
+            base_split.splits = llama_get_list_splits(base_split.fname, idx, n_split);
         }
 
         // in case user give a custom list of splits, check if it matches the expected number
-        if (n_split != (uint16_t)splits.size()) {
-            throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
+        if (n_split != (uint16_t)base_split.splits.size()) {
+            throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", base_split.splits.size(), n_split));
         }
 
         if (trace > 0) {
@@ -553,30 +561,20 @@ llama_model_loader::llama_model_loader(
 
         // load other splits
         for (idx = 1; idx < n_split; idx++) {
-            const char * fname_split = splits[idx].c_str();
-
-            gguf_file_load split_gguf(&ctx, load_input_variant::fname_load_input{fname_split, splits});
-            gguf_context_ptr& split_meta = split_gguf.meta;
+            SplitLoad split_load(load_input, base_split, idx, kv_split_no);
 
-            // check idx
-            {
-                const int kid = gguf_find_key(split_meta.get(), kv_split_no.c_str());
-                if (kid < 0) {
-                    throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
-                }
-                int idx_gguf = gguf_get_val_u16(split_meta.get(), kid);
-                if (idx_gguf != idx) {
-                    throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
-                }
+            if(incremental_splits_tensor_load.has_value()) {
+                incremental_splits_tensor_load->add_split(std::move(split_load));
+            }
+            else {
+                split_load.load(*this);
             }
-
-            process_loaded_gguf(ctx, split_gguf, idx);
         }
 
         get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
 
-        // sanity check
-        {
+        // sanity check (the incremental loader does the check after loading the last split)
+        if(!incremental_splits_tensor_load.has_value()) {
            const int n_tensors_loaded = (int) weights_map.size();
            if (n_tensors != n_tensors_loaded) {
                throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
@@ -587,7 +585,13 @@ llama_model_loader::llama_model_loader(
     }
 
     n_kv = gguf_get_n_kv(meta.get());
-    n_tensors = weights_map.size();
+    if (incremental_splits_tensor_load.has_value()) {
+        n_tensors = incremental_splits_tensor_load->expected_n_tensors();
+        LLAMA_LOG_CMAKE_DEBUG("%s: n_tensors (expected from summary list): %d\n", __func__, n_tensors);
+    } else {
+        n_tensors = weights_map.size();
+        LLAMA_LOG_CMAKE_DEBUG("%s: exact n_tensors: %d\n", __func__, n_tensors);
+    }
 
     fver = (enum llama_fver) gguf_get_version(meta.get());
 
@@ -596,7 +600,7 @@ llama_model_loader::llama_model_loader(
 
     // determine file type based on the number of tensors for each quantization and print meta data
     // TODO: make optional
-    {
+    if(!incremental_splits_tensor_load.has_value()) {
        std::map<enum ggml_type, uint32_t> n_type;
 
        uint32_t n_type_max = 0;
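With the incremental path enabled, the constructor above no longer loads every split eagerly; it wraps each split in a SplitLoad and hands it to IncrementalSplitsTensorLoad::add_split, and the model-load pass in src/llama-model.cpp later pulls metadata per tensor via load_tensor_metadata. Those types are not defined in this commit, so the sketch below is only a rough, self-contained analogue of the eager-vs-deferred choice, with hypothetical names:

// Hypothetical analogue only; SplitLoad / IncrementalSplitsTensorLoad are not
// defined in this diff, so the real names and signatures may differ.
#include <cstdio>
#include <functional>
#include <map>
#include <optional>

struct PendingSplits {
    std::map<int, std::function<void()>> loaders; // split idx -> deferred load action

    void add_split(int idx, std::function<void()> load) {
        loaders[idx] = std::move(load);           // record it, do not load yet
    }
    void load_now(int idx) {
        auto it = loaders.find(idx);
        if (it != loaders.end()) {
            it->second();                         // load this split on demand
            loaders.erase(it);
        }
    }
};

int main() {
    const bool incremental = true;                // stand-in for variant_supports_split_load_from_memory()
    std::optional<PendingSplits> pending;
    if (incremental) {
        pending.emplace();
    }

    for (int idx = 1; idx < 3; ++idx) {
        auto load = [idx] { std::printf("loading split %d\n", idx); };
        if (pending.has_value()) {
            pending->add_split(idx, load);        // defer, like add_split(std::move(split_load))
        } else {
            load();                               // eager, like split_load.load(*this)
        }
    }

    if (pending.has_value()) {
        pending->load_now(2);                     // a split is loaded only when first needed
    }
    return 0;
}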

src/llama-model-loader.h

Lines changed: 3 additions & 0 deletions

@@ -79,6 +79,9 @@ struct llama_model_loader {
     llama_mmaps mappings;
 
     std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
+
+    std::optional<IncrementalSplitsTensorLoad> incremental_splits_tensor_load;
+
     std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
     const llama_model_tensor_buft_override * tensor_buft_overrides;
 

src/llama-model.cpp

Lines changed: 59 additions & 6 deletions

@@ -17,10 +17,10 @@
 #include <cassert>
 #include <cmath>
 #include <cfloat>
+#include <cstdint>
 #include <cstring>
 #include <cmath>
 #include <functional>
-#include <map>
 #include <regex>
 #include <sstream>
 #include <stdexcept>
@@ -1643,9 +1643,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
 
     auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
-        ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-
+        const std::string& tensor_name = tn.str();
+        ggml_tensor * t_meta = ml.get_tensor_meta(tensor_name.c_str());
+        std::optional<uint16_t> split_idx;
+        if (!t_meta && (flags & TENSOR_NOT_REQUIRED) &&
+            IncrementalSplitsTensorLoad::tensor_ignored(ml.incremental_splits_tensor_load, tensor_name.c_str())) {
+            return nullptr;
+        }
+        if (ml.incremental_splits_tensor_load.has_value()) {
+            split_idx = ml.incremental_splits_tensor_load->load_tensor_metadata(ml, tn.str().c_str(), &t_meta);
+            LLAMA_LOG_CMAKE_DEBUG("split idx for tensor %s: %d\n", tn.str().c_str(), *split_idx);
+        }
         if (!t_meta) {
+            LLAMA_LOG_ERROR("%s: missing tensor %s\n", __func__ , tn.str().c_str());
             if (flags & TENSOR_NOT_REQUIRED) {
                 return nullptr;
             }
@@ -1758,16 +1768,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
         }
 
-        ggml_context * ctx = ctx_for_buft(buft);
+        ggml_context * ctx =
+            split_idx.has_value() ?
+                ml.incremental_splits_tensor_load->get_model_ctx_for_split_buft(buft, *split_idx, pimpl.get()) :
+                ctx_for_buft(buft);
 
         // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
         if (flags & TENSOR_DUPLICATED) {
-            ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
+            auto tn_str = tn.str();
+            ggml_tensor * t = ggml_get_tensor(ctx, tn_str.c_str());
             if (t) {
                 return t;
             }
+            LLAMA_LOG_WARN("%s: duplicated tensor %s not found on existing context\n", tn_str.c_str(), __func__);
+        }
+        struct ggml_tensor * tensor = ml.create_tensor(ctx, tn, ne, flags);
+
+        if (split_idx.has_value() && ml.incremental_splits_tensor_load->all_tensors_are_loaded(*split_idx)) {
+            // Upload right now.
+            if (!create_split_backend_buffers(*split_idx, ml.incremental_splits_tensor_load->ctx_split_map, ml,
+                                              use_mmap_buffer, use_mlock, n_gpu_layers)) {
+                throw std::runtime_error("Failed to create incremental backend buffers");
+            }
+            IncrementalSplitsTensorLoad::release_split(ml, *split_idx);
         }
-        return ml.create_tensor(ctx, tn, ne, flags);
+
+        return tensor;
     };
 
     layers.resize(n_layer);
@@ -4285,12 +4311,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     ml.done_getting_tensors();
 
+    if (ml.incremental_splits_tensor_load.has_value()) {
+        // Already did incremental load.
+        print_backend_buffers_info(n_gpu_layers);
+        return true;
+    }
+
     ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
     pimpl->mappings.reserve(ml.mappings.size());
 
     return create_backend_buffers(ml.size_data, ctx_map, ml, use_mmap_buffer, use_mlock, n_gpu_layers);
 }
 
+bool llama_model::create_split_backend_buffers(
+    const uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> & ctx_split_map,
+    llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock, const int32_t n_gpu_layers) {
+    // Extract contexts for the given split index from ctx_split_map into a new map
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    for (const auto & [buft_split_idx, ctx] : ctx_split_map) {
+        const auto & [buft, split_idx] = buft_split_idx;
+        if (split_idx == idx) {
+            ctx_map[buft] = ctx;
+        }
+    }
+
+    const std::size_t split_data_size = ml.incremental_splits_tensor_load->get_split_data_size(idx);
+    LLAMA_LOG_CMAKE_DEBUG("%s: creating backend buffers for split %d with size %zu\n", __func__, idx, split_data_size);
+    constexpr bool do_print_backend_buffers_info = false;
+    const bool creation_success = create_backend_buffers(split_data_size, ctx_map, ml, use_mmap_buffer, use_mlock,
+                                                         n_gpu_layers, do_print_backend_buffers_info);
+
+    return creation_success;
+}
+
 bool llama_model::create_backend_buffers(std::size_t size_data,
     const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
     llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock,
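create_split_backend_buffers filters the (buffer type, split index) keyed ctx_split_map down to the plain buffer-type-to-context map that create_backend_buffers expects, then sizes the allocation from get_split_data_size. A self-contained illustration of that map manipulation with stand-in types (Buft and Ctx are hypothetical placeholders for ggml_backend_buffer_type_t and ggml_context *):

// Stand-in types only; in the real code the key is
// std::pair<ggml_backend_buffer_type_t, uint16_t> and the value is ggml_context *.
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <utility>

int main() {
    using Buft = std::string; // stand-in for ggml_backend_buffer_type_t
    using Ctx  = int;         // stand-in for ggml_context *

    std::map<std::pair<Buft, uint16_t>, Ctx> ctx_split_map = {
        {{"CPU",   0}, 10}, {{"CUDA0", 0}, 11},
        {{"CPU",   1}, 20}, {{"CUDA0", 1}, 21},
    };

    const uint16_t idx = 1;   // the split whose tensors just finished loading

    // Collapse the (buft, split) keyed map into a per-split buft -> ctx map.
    std::map<Buft, Ctx> ctx_map;
    for (const auto & [key, ctx] : ctx_split_map) {
        if (key.second == idx) {
            ctx_map[key.first] = ctx;
        }
    }

    for (const auto & [buft, ctx] : ctx_map) {
        std::printf("split %d: buft=%s ctx=%d\n", (int) idx, buft.c_str(), ctx);
    }
    return 0;
}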

src/llama-model.h

Lines changed: 5 additions & 0 deletions

@@ -381,6 +381,11 @@ struct llama_model {
         llama_model_loader & ml, bool use_mmap_buffer, bool use_mlock, int32_t n_gpu_layers,
         bool do_print_backend_buffers_info = true);
 
+    /// @brief Create backend buffers for tensors on a split file identified by `idx`. Removes the split from the map.
+    bool create_split_backend_buffers(
+        uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> & ctx_split_map,
+        llama_model_loader & ml, bool use_mmap_buffer, bool use_mlock, int32_t n_gpu_layers);
+
     void print_backend_buffers_info(int32_t n_gpu_layers);
 
     void load_stats (llama_model_loader & ml);
