@@ -4271,20 +4271,34 @@ struct llama_model_loader {
 
         ggml_tensor * tensor;
 
-        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
-            const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+            const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
             if (tensor_idx < 0) {
-                throw std::runtime_error(format("tensor '%s' not found in the model", name));
+                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
             }
 
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
             if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
-                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
             }
         }
     };
-    std::vector<llama_tensor_weight> weights;
 
+    // custom comparator to sort weights more nicely by layer
+    struct weight_name_comparer {
+        bool operator()(const std::string & a, const std::string & b) const {
+            int a_layer = -1;
+            int b_layer = -1;
+            sscanf(a.c_str(), "blk.%d.", &a_layer);
+            sscanf(b.c_str(), "blk.%d.", &b_layer);
+            if (a_layer != b_layer) {
+                return a_layer < b_layer;
+            }
+            return a < b;
+        }
+    };
+
+    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
     std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
     struct gguf_context * meta = NULL;
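The comparator above is the interesting part of this hunk: a plain lexicographic `std::map` would order `blk.10.*` before `blk.2.*`, whereas the `sscanf` on the `blk.%d.` prefix compares the numeric layer index first; names without that prefix (e.g. `token_embd.weight`) keep the default layer `-1` and sort ahead of all blocks. A minimal standalone sketch, with hypothetical tensor names, showing the resulting iteration order:

```cpp
#include <cstdio>
#include <map>
#include <string>

// same comparator as in the patch above
struct weight_name_comparer {
    bool operator()(const std::string & a, const std::string & b) const {
        int a_layer = -1;
        int b_layer = -1;
        sscanf(a.c_str(), "blk.%d.", &a_layer); // leaves -1 if the prefix is absent
        sscanf(b.c_str(), "blk.%d.", &b_layer);
        if (a_layer != b_layer) {
            return a_layer < b_layer;
        }
        return a < b;
    }
};

int main() {
    std::map<std::string, int, weight_name_comparer> weights;
    weights["blk.10.ffn_up.weight"] = 0;
    weights["blk.2.attn_q.weight"]  = 0;
    weights["token_embd.weight"]    = 0;

    // iteration follows layer order, not plain string order:
    // token_embd.weight, blk.2.attn_q.weight, blk.10.ffn_up.weight
    for (const auto & it : weights) {
        printf("%s\n", it.first.c_str());
    }
    return 0;
}
```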
@@ -4326,7 +4340,14 @@ struct llama_model_loader {
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
+            std::string tensor_name = std::string(cur->name);
+            // make sure there are no duplicated tensor names
+            if (weights_map.find(tensor_name) != weights_map.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+            }
+            n_elements += ggml_nelements(cur);
+            n_bytes    += ggml_nbytes(cur);
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta, cur));
         }
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
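A side note on the `find()`-then-`emplace()` pair introduced here: `std::map::emplace` already returns an `{iterator, bool}` pair whose `bool` is `false` for a duplicate key, so in principle the lookup could be folded away. One plausible reason to keep the explicit `find()` is that `emplace` evaluates its arguments unconditionally, so the `llama_tensor_weight` constructor (and its file-bounds checks) would run even for a name that is about to be rejected. A self-contained sketch of the `emplace` behavior, with made-up names:

```cpp
#include <cstdio>
#include <map>
#include <string>

int main() {
    std::map<std::string, int> weights;

    // emplace() reports through its bool result whether the key was
    // actually inserted, so it can double as a duplicate check
    auto first  = weights.emplace("blk.0.attn_q.weight", 1);
    auto second = weights.emplace("blk.0.attn_q.weight", 2);

    printf("inserted: %d then %d\n", (int) first.second, (int) second.second); // 1 then 0
    printf("stored value: %d\n", weights.at("blk.0.attn_q.weight"));           // 1, the original
    return 0;
}
```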
@@ -4366,7 +4387,14 @@ struct llama_model_loader {
 
                 // Save tensors data offset info of the shard.
                 for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                    weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
+                    std::string tensor_name = std::string(cur->name);
+                    // make sure there are no duplicated tensor names
+                    if (weights_map.find(tensor_name) != weights_map.end()) {
+                        throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+                    }
+                    n_elements += ggml_nelements(cur);
+                    n_bytes    += ggml_nbytes(cur);
+                    weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf, cur));
                 }
 
                 gguf_free(ctx_gguf);
@@ -4376,7 +4404,7 @@ struct llama_model_loader {
 
             // sanity check
             {
-                const int n_tensors_loaded = (int) weights.size();
+                const int n_tensors_loaded = (int) weights_map.size();
                 if (n_tensors != n_tensors_loaded) {
                     throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
                 }
@@ -4386,23 +4414,10 @@ struct llama_model_loader {
         }
 
         n_kv      = gguf_get_n_kv(meta);
-        n_tensors = weights.size();
+        n_tensors = weights_map.size();
 
         fver = (enum llama_fver) gguf_get_version(meta);
 
-        std::set<std::string> tensor_names;
-        for (auto & w : weights) {
-            n_elements += ggml_nelements(w.tensor);
-            n_bytes    += ggml_nbytes(w.tensor);
-            // make sure there is no duplicated tensor names
-            const std::string name(w.tensor->name);
-            auto found = tensor_names.find(name);
-            if (found != tensor_names.end()) {
-                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
-            }
-            tensor_names.insert(name);
-        }
-
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
                 __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
 
@@ -4414,8 +4429,10 @@ struct llama_model_loader {
             uint32_t n_type_max = 0;
             enum ggml_type type_max = GGML_TYPE_F32;
 
-            for (int i = 0; i < n_tensors; i++) {
-                const ggml_tensor * tensor = weights.at(i).tensor;
+            for (const auto & it : weights_map) {
+                const llama_tensor_weight & w = it.second;
+                const ggml_tensor * tensor = w.tensor;
+
                 enum ggml_type type = tensor->type;
 
                 n_type[type]++;
@@ -4426,8 +4443,8 @@ struct llama_model_loader {
                 }
 
                 if (trace > 0) {
-                    const uint16_t sid = weights.at(i).idx;
-                    LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
+                    const uint16_t sid = w.idx;
+                    LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
                 }
             }
 
@@ -4691,21 +4708,13 @@ struct llama_model_loader {
         return llm_kv.arch;
     }
 
-    const char * get_tensor_name(int i) const {
-        return weights.at(i).tensor->name;
-    }
-
     const llama_tensor_weight * get_weight(const char * name) const {
-        for (const auto & weight : weights) {
-            if (strcmp(name, weight.tensor->name) == 0) {
-                return &weight;
-            }
+        auto pos = weights_map.find(name);
+        if (pos != weights_map.end()) {
+            return &pos->second;
         }
-        return nullptr;
-    }
 
-    const llama_tensor_weight * get_weight(int i) const {
-        return get_weight(get_tensor_name(i));
+        return nullptr;
     }
 
     const llama_tensor_weight & require_weight(const char * name) const {
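With the container change, `get_weight` drops from a linear `strcmp` scan to an O(log n) tree lookup, and the index-based accessors (`get_tensor_name(int)`, `get_weight(int)`, and `get_tensor_meta(int)` below) disappear, since a `std::map` has no stable integer index. A self-contained sketch of the resulting nullable/throwing lookup pair, with `llama_tensor_weight` stubbed down to its split index and a simplified error message (both hypothetical):

```cpp
#include <map>
#include <stdexcept>
#include <string>

// stub: the real struct also carries the data offset and tensor pointer
struct llama_tensor_weight { int idx = 0; };

static std::map<std::string, llama_tensor_weight> weights_map;

// nullable lookup: returns nullptr for optional tensors that are absent
static const llama_tensor_weight * get_weight(const char * name) {
    auto pos = weights_map.find(name);
    if (pos != weights_map.end()) {
        return &pos->second;
    }
    return nullptr;
}

// throwing lookup: for tensors the model cannot load without
static const llama_tensor_weight & require_weight(const char * name) {
    const llama_tensor_weight * weight = get_weight(name);
    if (!weight) {
        throw std::runtime_error(std::string("required tensor not found: ") + name);
    }
    return *weight;
}

int main() {
    weights_map.emplace("output.weight", llama_tensor_weight{});
    return (get_weight("output.weight") != nullptr &&
            get_weight("blk.0.missing") == nullptr &&
            require_weight("output.weight").idx == 0) ? 0 : 1;
}
```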
@@ -4732,10 +4741,6 @@ struct llama_model_loader {
         return tensor;
     }
 
-    struct ggml_tensor * get_tensor_meta(int i) const {
-        return get_tensor_meta(get_tensor_name(i));
-    }
-
     const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
         const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
 
@@ -4842,8 +4847,8 @@ struct llama_model_loader {
         }
 
         // compute the total size of all tensors for progress reporting
-        for (auto & w : weights) {
-            size_data += ggml_nbytes(w.tensor);
+        for (const auto & it : weights_map) {
+            size_data += ggml_nbytes(it.second.tensor);
         }
     }
 
@@ -18598,10 +18603,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
+    for (const auto & it : ml.weights_map) {
+        const struct ggml_tensor * tensor = it.second.tensor;
 
-        const std::string name = ggml_get_name(meta);
+        const std::string name = ggml_get_name(tensor);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos ||
@@ -18639,20 +18644,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;
 
     uint16_t n_split = 1;
+    const auto & weights_map = ml.weights_map;
+
     // Assume split index is continuous
     if (params->keep_split) {
-        for (int i = 0; i < ml.n_tensors; ++i) {
-            n_split = std::max(uint16_t(ml.get_weight(i)->idx + 1), n_split);
+        for (const auto & it : weights_map) {
+            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
         }
+
     }
     std::vector<gguf_context*> ctx_outs(n_split, NULL);
     ctx_outs[0] = ctx_out;
 
     // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        uint16_t i_split = params->keep_split ? weight->idx : 0;
-        struct ggml_tensor * tensor = weight->tensor;
+    for (const auto & it : weights_map) {
+        uint16_t i_split = params->keep_split ? it.second.idx : 0;
+        struct ggml_tensor * tensor = it.second.tensor;
         if (ctx_outs[i_split] == NULL) {
             ctx_outs[i_split] = gguf_init_empty();
         }
@@ -18699,12 +18706,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        struct ggml_tensor * tensor = weight->tensor;
-        if (weight->idx != cur_split && params->keep_split) {
+    for (const auto & it : weights_map) {
+        const auto & weight = it.second;
+        struct ggml_tensor * tensor = weight.tensor;
+        if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
-            new_ofstream(weight->idx);
+            new_ofstream(weight.idx);
         }
 
         const std::string name = ggml_get_name(tensor);
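One property this loop quietly relies on: with `params->keep_split`, the writer only rotates output files when `weight.idx` changes, so tensors belonging to the same split must be visited contiguously. That holds in practice because split files carry contiguous layer ranges and `weight_name_comparer` iterates in layer order. A minimal sketch of the stream-switching shape (file I/O stubbed with `printf`, indices and names made up):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    struct entry { uint16_t idx; const char * name; };

    // tensors arrive in container order, already grouped by split index
    std::vector<entry> entries = {
        {0, "token_embd.weight"},
        {0, "blk.0.attn_q.weight"},
        {1, "blk.1.attn_q.weight"},
        {1, "blk.1.ffn_up.weight"},
    };

    int cur_split = -1;
    for (const auto & e : entries) {
        if (e.idx != cur_split) { // rotate the output "file" on a new split
            if (cur_split >= 0) printf("close split %d\n", cur_split);
            printf("open split %u\n", (unsigned) e.idx);
            cur_split = e.idx;
        }
        printf("  write %s\n", e.name);
    }
    if (cur_split >= 0) printf("close split %d\n", cur_split);
    return 0;
}
```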