@@ -364,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }
 
-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
-
-    llama_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
-        ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
 };
 
 struct llama_load_tensors_map {
@@ -476,13 +394,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;
 
-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(file_idx, tensors_map);
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -539,19 +457,19 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-            llama_load_tensor_shard shard;
+            llama_load_tensor tensor;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-            shard.type = (enum ggml_type) file.read_u32();
-            shard.ne.resize(n_dims);
-            file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+            tensor.type = (enum ggml_type) file.read_u32();
+            tensor.ne.resize(n_dims);
+            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (shard.type) {
+            switch (tensor.type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
@@ -566,30 +484,20 @@ struct llama_file_loader {
                 case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
                 }
             }
 
-            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
-                // skip to the next multiple of 32 bytes
-                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
+            // skip to the next multiple of 32 bytes
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 
-            shard.calc_size();
-            file.seek(shard.size, SEEK_CUR);
+            tensor.file_off = file.tell();
+            tensor.name = name;
+            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            file.seek(tensor.size, SEEK_CUR);
 
-            auto it = tensors_map.name_to_idx.find(name);
-            size_t idx;
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.push_back(tensor);
+            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
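Note on the alignment seek introduced above: for a file position pos, the expression (-pos) & 31 gives the number of bytes needed to round pos up to the next multiple of 32, because two's-complement negation followed by masking keeps only the low five bits of the deficit. A minimal stand-alone sketch of that arithmetic follows; the pad32 helper name is illustrative only and not part of this commit.

    #include <cassert>
    #include <cstddef>

    // Bytes to skip so that `pos` becomes a multiple of 32; mirrors
    // file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR) in the loader.
    static std::size_t pad32(std::size_t pos) {
        return static_cast<std::size_t>(-static_cast<std::ptrdiff_t>(pos) & 31);
    }

    int main() {
        assert(pad32(0)  ==  0);  // already aligned
        assert(pad32(1)  == 31);  // 1 + 31 = 32
        assert(pad32(32) ==  0);  // already aligned
        assert(pad32(45) == 19);  // 45 + 19 = 64
        return 0;
    }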
@@ -659,56 +567,19 @@ struct llama_file_saver {
 };
 
 struct llama_model_loader {
-    std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+    llama_model_loader(const std::string & fname_base, bool use_mmap) {
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
         this->use_mmap = use_mmap;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
-        }
-        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
     }
 
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -774,7 +645,7 @@ struct llama_model_loader {
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
+            mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -830,45 +701,13 @@ struct llama_model_loader {
 
     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-            LLAMA_ASSERT(lt.shards.size() == 1);
-            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
-        } else if (lt.split_type == SPLIT_NONE) {
-            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            llama_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
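For orientation, the two branches that remain in load_data_for differ only in where the bytes end up: the mmap path aliases tensor data directly out of the single mapped model file at lt.file_off, while the fallback does one seek and one contiguous read from the one llama_file. A hedged stand-alone sketch of the same pattern using plain stdio; the helper names are illustrative and not part of llama.cpp.

    #include <cstdint>
    #include <cstdio>

    // mmap-style access: the tensor is just a view into the mapped model file.
    static std::uint8_t * tensor_ptr_mmap(std::uint8_t * mapping_base, std::size_t file_off) {
        return mapping_base + file_off;   // no copy; pages fault in on first touch
    }

    // Fallback: one seek plus one contiguous read into a caller-owned buffer.
    static bool tensor_read(std::FILE * f, long file_off, std::size_t size, std::uint8_t * dst) {
        if (std::fseek(f, file_off, SEEK_SET) != 0) {
            return false;
        }
        return std::fread(dst, 1, size, f) == size;
    }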
@@ -1067,12 +906,12 @@ static void llama_model_load_internal(
 
     model.t_start_us = ggml_time_us();
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
-    vocab = std::move(ml->file_loaders.at(0)->vocab);
-    model.hparams = ml->file_loaders.at(0)->hparams;
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+    llama_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;
 
     {
@@ -1106,7 +945,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
-        fprintf(stderr, "%s: n_parts    = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
     }
 
@@ -2461,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
-                                                                            /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -2897,7 +2734,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
@@ -2915,7 +2752,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 