@@ -4275,20 +4275,34 @@ struct llama_model_loader {

         ggml_tensor * tensor;

-        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
-            const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+            const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
             if (tensor_idx < 0) {
-                throw std::runtime_error(format("tensor '%s' not found in the model", name));
+                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
             }

             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
             if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
-                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
             }
         }
     };
-    std::vector<llama_tensor_weight> weights;

+    // custom comparator to sort weights more nicely by layer
+    struct weight_name_comparer {
+        bool operator()(const std::string & a, const std::string & b) const {
+            int a_layer = -1;
+            int b_layer = -1;
+            sscanf(a.c_str(), "blk.%d.", &a_layer);
+            sscanf(b.c_str(), "blk.%d.", &b_layer);
+            if (a_layer != b_layer) {
+                return a_layer < b_layer;
+            }
+            return a < b;
+        }
+    };
+
+    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

     struct gguf_context * meta = NULL;
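Why the map is keyed with a custom comparator rather than plain std::string ordering: lexicographic comparison would put "blk.10." before "blk.2.", interleaving the layers. weight_name_comparer parses the layer number out of the "blk.%d." prefix (leaving -1 for non-block tensors such as token_embd.weight) and only falls back to name order within a layer. A minimal standalone sketch, not part of the patch, demonstrating the resulting order:

```cpp
#include <cstdio>
#include <map>
#include <string>

struct weight_name_comparer {
    bool operator()(const std::string & a, const std::string & b) const {
        int a_layer = -1;
        int b_layer = -1;
        sscanf(a.c_str(), "blk.%d.", &a_layer); // stays -1 when the prefix is absent
        sscanf(b.c_str(), "blk.%d.", &b_layer);
        if (a_layer != b_layer) {
            return a_layer < b_layer;
        }
        return a < b;
    }
};

int main() {
    std::map<std::string, int, weight_name_comparer> m;
    m["blk.10.attn_q.weight"] = 0;
    m["blk.2.attn_q.weight"]  = 0;
    m["token_embd.weight"]    = 0;

    // prints token_embd.weight, blk.2.attn_q.weight, blk.10.attn_q.weight:
    // non-block tensors (layer -1) first, then layers in numeric order
    for (const auto & it : m) {
        printf("%s\n", it.first.c_str());
    }
    return 0;
}
```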
@@ -4330,7 +4344,14 @@ struct llama_model_loader {
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
+            std::string tensor_name = std::string(cur->name);
+            // make sure there is no duplicated tensor names
+            if (weights_map.find(tensor_name) != weights_map.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+            }
+            n_elements += ggml_nelements(cur);
+            n_bytes += ggml_nbytes(cur);
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta, cur));
         }
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
@@ -4370,7 +4391,14 @@ struct llama_model_loader {

             // Save tensors data offset info of the shard.
             for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
+                std::string tensor_name = std::string(cur->name);
+                // make sure there is no duplicated tensor names
+                if (weights_map.find(tensor_name) != weights_map.end()) {
+                    throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+                }
+                n_elements += ggml_nelements(cur);
+                n_bytes += ggml_nbytes(cur);
+                weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf, cur));
             }

             gguf_free(ctx_gguf);
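Both tensor-indexing loops above (the main file and each subsidiary shard) apply the same guard: look the name up first, reject duplicates, and account for the element and byte totals while inserting. A reduced sketch of that guard, with plain ints standing in for ggml tensors:

```cpp
#include <cstdio>
#include <map>
#include <string>

int main() {
    std::map<std::string, int> weights_map; // name -> placeholder payload
    const char * names[] = { "blk.0.attn_q.weight", "blk.0.attn_k.weight", "blk.0.attn_q.weight" };

    for (const char * name : names) {
        // same guard as the loader: a name that is already indexed is an error
        if (weights_map.find(name) != weights_map.end()) {
            printf("invalid model: tensor '%s' is duplicated\n", name);
            continue; // the loader throws here instead
        }
        weights_map.emplace(name, 0);
    }
    printf("%zu unique tensors\n", weights_map.size()); // 2
    return 0;
}
```

std::map::emplace also reports success through the bool in its returned pair, so the explicit find could be folded into the insert; keeping the two steps separate makes the error path easier to read.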
@@ -4380,7 +4408,7 @@ struct llama_model_loader {

         // sanity check
         {
-            const int n_tensors_loaded = (int) weights.size();
+            const int n_tensors_loaded = (int) weights_map.size();
             if (n_tensors != n_tensors_loaded) {
                 throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
             }
@@ -4390,23 +4418,10 @@ struct llama_model_loader {
         }

         n_kv = gguf_get_n_kv(meta);
-        n_tensors = weights.size();
+        n_tensors = weights_map.size();

         fver = (enum llama_fver) gguf_get_version(meta);

-        std::set<std::string> tensor_names;
-        for (auto & w : weights) {
-            n_elements += ggml_nelements(w.tensor);
-            n_bytes += ggml_nbytes(w.tensor);
-            // make sure there is no duplicated tensor names
-            const std::string name(w.tensor->name);
-            auto found = tensor_names.find(name);
-            if (found != tensor_names.end()) {
-                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
-            }
-            tensor_names.insert(name);
-        }
-
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
                        __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));

@@ -4418,8 +4433,10 @@ struct llama_model_loader {
             uint32_t n_type_max = 0;
             enum ggml_type type_max = GGML_TYPE_F32;

-            for (int i = 0; i < n_tensors; i++) {
-                const ggml_tensor * tensor = weights.at(i).tensor;
+            for (const auto & it : weights_map) {
+                const llama_tensor_weight & w = it.second;
+                const ggml_tensor * tensor = w.tensor;
+
                 enum ggml_type type = tensor->type;

                 n_type[type]++;
@@ -4430,8 +4447,8 @@ struct llama_model_loader {
                 }

                 if (trace > 0) {
-                    const uint16_t sid = weights.at(i).idx;
-                    LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
+                    const uint16_t sid = w.idx;
+                    LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
                 }
             }

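For context on the loop above: n_type counts how many tensors use each ggml_type, while n_type_max/type_max track the most frequent type (the update step itself sits between these two hunks), presumably feeding the reported overall file type. A self-contained sketch of the same counting pattern, with a toy enum standing in for ggml_type:

```cpp
#include <cstdio>
#include <map>

enum toy_type { T_F32, T_F16, T_Q4 }; // stand-in for ggml_type

int main() {
    const toy_type tensors[] = { T_Q4, T_Q4, T_F32, T_Q4, T_F16 };

    std::map<toy_type, unsigned> n_type; // per-type tensor count
    unsigned n_type_max = 0;
    toy_type type_max   = T_F32;

    for (toy_type type : tensors) {
        n_type[type]++;
        if (n_type_max < n_type[type]) {
            n_type_max = n_type[type];
            type_max   = type;
        }
    }
    printf("dominant type: %d with %u tensors\n", (int) type_max, n_type_max); // T_Q4, 3
    return 0;
}
```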
@@ -4695,21 +4712,13 @@ struct llama_model_loader {
         return llm_kv.arch;
     }

-    const char * get_tensor_name(int i) const {
-        return weights.at(i).tensor->name;
-    }
-
     const llama_tensor_weight * get_weight(const char * name) const {
-        for (const auto & weight : weights) {
-            if (strcmp(name, weight.tensor->name) == 0) {
-                return &weight;
-            }
+        auto pos = weights_map.find(name);
+        if (pos != weights_map.end()) {
+            return &pos->second;
         }
-        return nullptr;
-    }

-    const llama_tensor_weight * get_weight(int i) const {
-        return get_weight(get_tensor_name(i));
+        return nullptr;
     }

     const llama_tensor_weight & require_weight(const char * name) const {
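The net effect on lookups: the old get_weight scanned the whole vector with strcmp for every query, while the new one is a single O(log n) map find, and the integer-indexed get_weight/get_tensor_name overloads disappear because callers now iterate the map directly. A miniature sketch of the new shape, with a hypothetical weight struct in place of llama_tensor_weight:

```cpp
#include <cstdio>
#include <map>
#include <string>

struct weight { int idx; }; // hypothetical stand-in for llama_tensor_weight

static const weight * get_weight(const std::map<std::string, weight> & weights_map,
                                 const char * name) {
    auto pos = weights_map.find(name);
    if (pos != weights_map.end()) {
        return &pos->second;
    }
    return nullptr;
}

int main() {
    std::map<std::string, weight> m;
    m.emplace("output.weight", weight{ 0 });

    const weight * hit  = get_weight(m, "output.weight");  // found
    const weight * miss = get_weight(m, "no.such.tensor"); // nullptr
    printf("hit=%p miss=%p\n", (const void *) hit, (const void *) miss);
    return 0;
}
```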
@@ -4736,10 +4745,6 @@ struct llama_model_loader {
         return tensor;
     }

-    struct ggml_tensor * get_tensor_meta(int i) const {
-        return get_tensor_meta(get_tensor_name(i));
-    }
-
     const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
         const struct ggml_tensor * cur = get_tensor_meta(name.c_str());

@@ -4846,8 +4851,8 @@ struct llama_model_loader {
         }

         // compute the total size of all tensors for progress reporting
-        for (auto & w : weights) {
-            size_data += ggml_nbytes(w.tensor);
+        for (const auto & it : weights_map) {
+            size_data += ggml_nbytes(it.second.tensor);
         }
     }

@@ -18607,10 +18612,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        }
    }

-    for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
+    for (const auto & it : ml.weights_map) {
+        const struct ggml_tensor * tensor = it.second.tensor;

-        const std::string name = ggml_get_name(meta);
+        const std::string name = ggml_get_name(tensor);

        // TODO: avoid hardcoded tensor names - use the TN_* constants
        if (name.find("attn_v.weight") != std::string::npos ||
@@ -18648,20 +18653,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    std::vector<no_init<float>> f32_conv_buf;

    uint16_t n_split = 1;
+    const auto & weights_map = ml.weights_map;
+
    // Assume split index is continuous
    if (params->keep_split) {
-        for (int i = 0; i < ml.n_tensors; ++i) {
-            n_split = std::max(uint16_t(ml.get_weight(i)->idx + 1), n_split);
+        for (const auto & it : weights_map) {
+            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
        }
+
    }
    std::vector<gguf_context*> ctx_outs(n_split, NULL);
    ctx_outs[0] = ctx_out;

    // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        uint16_t i_split = params->keep_split ? weight->idx : 0;
-        struct ggml_tensor * tensor = weight->tensor;
+    for (const auto & it : weights_map) {
+        uint16_t i_split = params->keep_split ? it.second.idx : 0;
+        struct ggml_tensor * tensor = it.second.tensor;
        if (ctx_outs[i_split] == NULL) {
            ctx_outs[i_split] = gguf_init_empty();
        }
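With keep_split enabled, the quantizer derives n_split from the largest split index recorded in the map (the comment above assumes indices are continuous from 0) and then lazily creates one output gguf context per split. A hedged sketch of that bookkeeping, with plain bools standing in for gguf contexts:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    // tensor name -> split index, as the loader would have recorded it
    const std::map<std::string, uint16_t> weights_map = {
        { "blk.0.attn_q.weight", 0 },
        { "blk.1.attn_q.weight", 1 },
        { "output.weight",       1 },
    };

    // n_split = max recorded index + 1 (indices assumed continuous from 0)
    uint16_t n_split = 1;
    for (const auto & it : weights_map) {
        n_split = std::max<uint16_t>(it.second + 1, n_split);
    }

    // one output slot per split, created lazily on first use
    std::vector<bool> ctx_outs(n_split, false); // false stands in for a NULL gguf context
    for (const auto & it : weights_map) {
        const uint16_t i_split = it.second;
        if (!ctx_outs[i_split]) {
            ctx_outs[i_split] = true; // the quantizer calls gguf_init_empty() here
        }
    }
    printf("n_split = %u\n", (unsigned) n_split); // 2
    return 0;
}
```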
@@ -18708,12 +18715,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

    const auto tn = LLM_TN(model.arch);
    new_ofstream(0);
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        struct ggml_tensor * tensor = weight->tensor;
-        if (weight->idx != cur_split && params->keep_split) {
+    for (const auto & it : weights_map) {
+        const auto & weight = it.second;
+        struct ggml_tensor * tensor = weight.tensor;
+        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
-            new_ofstream(weight->idx);
+            new_ofstream(weight.idx);
        }

        const std::string name = ggml_get_name(tensor);