@@ -1,11 +1,12 @@
 #include "llama-model-loader.h"
 
 #include "ggml.h"
-#include "llama-model-load-input.h"
+#include "llama-mmap.h"
 #include "llama-model-load.h"
 
 #include <array>
 #include <cinttypes>
+#include <cstdint>
 #include <cstring>
 #include <future>
 #include <stdexcept>
@@ -512,9 +513,16 @@ llama_model_loader::llama_model_loader(
 
     tensor_buft_overrides = param_tensor_buft_overrides_p;
 
+    std::optional<std::set<std::string>> tensor_list = load_input_variant::parse_tensor_list_from_future(load_input);
+
     struct ggml_context * ctx = NULL;
     gguf_file_load main_gguf(&ctx, load_input);
-    process_loaded_gguf(ctx, main_gguf, 0);
+
+    if (load_input_variant::variant_supports_split_load_from_memory(load_input)) {
+        incremental_splits_tensor_load.emplace(ctx, *this, main_gguf, std::move(*tensor_list));
+    } else {
+        process_loaded_gguf(ctx, main_gguf, 0);
+    }
 
     meta = std::move(main_gguf.meta);
 
@@ -526,8 +534,8 @@ llama_model_loader::llama_model_loader(
 
     // Load additional GGML contexts
     if (load_input_variant::variant_supports_split_load(load_input) && n_split > 1) {
+
         load_input_variant::fname_load_input base_split = load_input_variant::split_name_from_variant(load_input);
-        std::vector<std::string> & splits = base_split.splits;
 
         // make sure the main file is loaded first
         uint16_t idx = 0;
@@ -538,13 +546,13 @@ llama_model_loader::llama_model_loader(
         }
 
         // generate list of splits if needed
-        if (splits.empty()) {
-            splits = llama_get_list_splits(base_split.fname, idx, n_split);
+        if (base_split.splits.empty()) {
+            base_split.splits = llama_get_list_splits(base_split.fname, idx, n_split);
         }
 
         // in case user give a custom list of splits, check if it matches the expected number
-        if (n_split != (uint16_t)splits.size()) {
-            throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
+        if (n_split != (uint16_t)base_split.splits.size()) {
+            throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", base_split.splits.size(), n_split));
         }
 
         if (trace > 0) {
@@ -553,30 +561,20 @@ llama_model_loader::llama_model_loader(
 
         // load other splits
         for (idx = 1; idx < n_split; idx++) {
-            const char * fname_split = splits[idx].c_str();
-
-            gguf_file_load split_gguf(&ctx, load_input_variant::fname_load_input{fname_split, splits});
-            gguf_context_ptr& split_meta = split_gguf.meta;
+            SplitLoad split_load(load_input, base_split, idx, kv_split_no);
 
-            // check idx
-            {
-                const int kid = gguf_find_key(split_meta.get(), kv_split_no.c_str());
-                if (kid < 0) {
-                    throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
-                }
-                int idx_gguf = gguf_get_val_u16(split_meta.get(), kid);
-                if (idx_gguf != idx) {
-                    throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
-                }
+            if (incremental_splits_tensor_load.has_value()) {
+                incremental_splits_tensor_load->add_split(std::move(split_load));
+            }
+            else {
+                split_load.load(*this);
             }
-
-            process_loaded_gguf(ctx, split_gguf, idx);
         }
 
         get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
 
-        // sanity check
-        {
+        // sanity check (the incremental loader does the check after loading the last split)
+        if (!incremental_splits_tensor_load.has_value()) {
             const int n_tensors_loaded = (int) weights_map.size();
             if (n_tensors != n_tensors_loaded) {
                 throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
@@ -587,7 +585,13 @@ llama_model_loader::llama_model_loader(
     }
 
     n_kv = gguf_get_n_kv(meta.get());
-    n_tensors = weights_map.size();
+    if (incremental_splits_tensor_load.has_value()) {
+        n_tensors = incremental_splits_tensor_load->expected_n_tensors();
+        LLAMA_LOG_CMAKE_DEBUG("%s: n_tensors (expected from summary list): %d\n", __func__, n_tensors);
+    } else {
+        n_tensors = weights_map.size();
+        LLAMA_LOG_CMAKE_DEBUG("%s: exact n_tensors: %d\n", __func__, n_tensors);
+    }
 
     fver = (enum llama_fver) gguf_get_version(meta.get());
 
@@ -596,7 +600,7 @@ llama_model_loader::llama_model_loader(
 
     // determine file type based on the number of tensors for each quantization and print meta data
     // TODO: make optional
-    {
+    if (!incremental_splits_tensor_load.has_value()) {
         std::map<enum ggml_type, uint32_t> n_type;
 
         uint32_t n_type_max = 0;
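
A note for readers following the diff: the calls above imply a small deferred-loading interface behind the loader's new incremental_splits_tensor_load member. The sketch below reconstructs that interface purely from the call sites visible in this change (emplace(ctx, *this, main_gguf, std::move(*tensor_list)), add_split(std::move(split_load)), split_load.load(*this), and expected_n_tensors()); the class shape, field names, and comments are assumptions for illustration, not the PR's actual declarations.

// Interface sketch only -- reconstructed from the call sites in this diff.
// Anything not literally called above (field names, the pending vector,
// the constructor shape) is an assumption, not the PR's real declaration.
#include <optional>
#include <set>
#include <string>
#include <vector>

struct ggml_context;
struct gguf_file_load;
struct llama_model_loader;

// One split scheduled for loading; load() is invoked eagerly on the
// non-incremental path in the loop above.
class SplitLoad {
  public:
    void load(llama_model_loader & ml);  // reads the split and registers its tensors
    // ...
};

class IncrementalSplitsTensorLoad {
  public:
    // Matches incremental_splits_tensor_load.emplace(ctx, *this, main_gguf,
    // std::move(*tensor_list)): keeps the expected-tensor list parsed from
    // the load input so splits can be validated as they arrive.
    IncrementalSplitsTensorLoad(ggml_context * ctx,
                                llama_model_loader & ml,
                                gguf_file_load & main_gguf,
                                std::set<std::string> tensor_list);

    // Queues a split instead of loading it eagerly; the deferred sanity
    // check runs after the last split has been consumed.
    void add_split(SplitLoad && split);

    // Tensor count promised by the summary list, used to set n_tensors
    // before any split data has actually been read.
    int expected_n_tensors() const { return (int) tensor_list_.size(); }

  private:
    std::set<std::string> tensor_list_;  // expected tensor names across all splits
    std::vector<SplitLoad> pending_;     // splits queued by add_split() (assumption)
};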