@@ -4860,19 +4860,12 @@ struct llama_model_loader {
         *last = 0;
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            try {
-                const auto * weight = get_weight(ggml_get_name(tensor));
-                if (!weight) {
-                    continue;
-                }
-                if (weight->idx != idx) {
-                    continue;
-                }
-                *first = std::min(*first, weight->offs);
-                *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
-            } catch(...) {
-                // the tensor is not in the model
+            const auto * weight = get_weight(ggml_get_name(tensor));
+            if (!weight || weight->idx != idx) {
+                continue;
             }
+            *first = std::min(*first, weight->offs);
+            *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
         }
     }
 
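Note on this hunk: the try/catch evidently guarded a get_weight() that threw for tensor names missing from the model; the rewritten loop instead relies on a lookup that returns nullptr, which the !weight check handles. A minimal sketch of that assumed contract, using illustrative stand-in names rather than the loader's actual members:

// Sketch only: assumed contract of get_weight(), with hypothetical names.
#include <cstddef>
#include <map>
#include <string>

struct tensor_weight_sketch {
    int    idx;  // which split file the tensor lives in
    size_t offs; // byte offset of the tensor data within that file
};

// Returns nullptr when the name is not in the model, so callers can filter
// with a plain branch instead of wrapping the lookup in try/catch.
static const tensor_weight_sketch * get_weight_sketch(
        const std::map<std::string, tensor_weight_sketch> & weights,
        const std::string & name) {
    auto pos = weights.find(name);
    return pos == weights.end() ? nullptr : &pos->second;
}
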
@@ -5049,7 +5042,6 @@ struct llama_model_loader {
                     ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
-                GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
@@ -18603,8 +18595,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
+    // make a list of weights
+    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
+    tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
-        const struct ggml_tensor * tensor = it.second.tensor;
+        tensors.push_back(&it.second);
+    }
+
+    // keep_split requires that the weights are sorted by split index
+    if (params->keep_split) {
+        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
+            if (a->idx == b->idx) {
+                return a->offs < b->offs;
+            }
+            return a->idx < b->idx;
+        });
+    }
+
+    for (const auto * it : tensors) {
+        const struct ggml_tensor * tensor = it->tensor;
 
         const std::string name = ggml_get_name(tensor);
 
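The comparator sorts split-major and offset-minor, so with --keep-split each output shard receives its tensors in file order. An equivalent formulation with std::tie, on a toy stand-in type (weight_ref is illustrative, not the real llama_tensor_weight):

#include <algorithm>
#include <cstddef>
#include <tuple>
#include <vector>

struct weight_ref { int idx; size_t offs; }; // stand-in for llama_tensor_weight

int main() {
    std::vector<weight_ref> tensors = {{1, 0}, {0, 512}, {0, 0}, {1, 256}};
    std::sort(tensors.begin(), tensors.end(), [](const weight_ref & a, const weight_ref & b) {
        // lexicographic (idx, offs) comparison, same ordering as the lambda above
        return std::tie(a.idx, a.offs) < std::tie(b.idx, b.offs);
    });
    // resulting order: (0,0) (0,512) (1,0) (1,256)
    return 0;
}
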
@@ -18644,22 +18653,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;
 
     uint16_t n_split = 1;
-    const auto & weights_map = ml.weights_map;
 
     // Assume split index is continuous
     if (params->keep_split) {
-        for (const auto & it : weights_map) {
-            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
+        for (const auto * it : tensors) {
+            n_split = std::max(uint16_t(it->idx + 1), n_split);
         }
-
     }
     std::vector<gguf_context*> ctx_outs(n_split, NULL);
     ctx_outs[0] = ctx_out;
 
     // populate the original tensors so we get an initial meta data
-    for (const auto & it : weights_map) {
-        uint16_t i_split = params->keep_split ? it.second.idx : 0;
-        struct ggml_tensor * tensor = it.second.tensor;
+    for (const auto * it : tensors) {
+        uint16_t i_split = params->keep_split ? it->idx : 0;
+        struct ggml_tensor * tensor = it->tensor;
         if (ctx_outs[i_split] == NULL) {
             ctx_outs[i_split] = gguf_init_empty();
         }
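n_split is computed as max(idx) + 1 over all weights, hence the "Assume split index is continuous" comment: indices are expected to run 0..n-1 with no gaps. Since the tensors are already sorted when keep_split is set, the last element alone would give the same count; a toy check:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct weight_ref { int idx; size_t offs; };

int main() {
    std::vector<weight_ref> tensors = {{0, 0}, {1, 0}, {2, 0}}; // sorted, gap-free
    uint16_t n_split = 1;
    for (const auto & it : tensors) {
        n_split = std::max(uint16_t(it.idx + 1), n_split);
    }
    // equivalent under the sorted/contiguous assumption:
    //   n_split = tensors.empty() ? 1 : uint16_t(tensors.back().idx + 1);
    std::printf("n_split = %d\n", (int) n_split); // 3
    return 0;
}
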
@@ -18706,8 +18713,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (const auto & it : weights_map) {
-        const auto & weight = it.second;
+    for (const auto * it : tensors) {
+        const auto & weight = *it;
         struct ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
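Finally, the write loop rotates output files whenever the split index changes, which is only correct because the tensors now arrive pre-sorted by index. A schematic of that rotation, with printf standing in for the commit's new_ofstream/close_ofstream helpers (their bodies are outside this diff):

#include <cstddef>
#include <cstdio>
#include <vector>

struct weight_ref { int idx; size_t offs; };

int main() {
    // assumes the (idx, offs) sort from the earlier hunk has already run
    std::vector<weight_ref> tensors = {{0, 0}, {0, 64}, {1, 0}, {2, 0}};
    int cur_split = 0;
    std::printf("open split %d\n", cur_split);          // new_ofstream(0)
    for (const auto & weight : tensors) {
        if (weight.idx != cur_split) {
            std::printf("close split %d\n", cur_split); // close_ofstream()
            cur_split = weight.idx;
            std::printf("open split %d\n", cur_split);  // open the next shard
        }
        // ... quantize and write the tensor to the current split ...
    }
    std::printf("close split %d\n", cur_split);
    return 0;
}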