@@ -4864,19 +4864,12 @@ struct llama_model_loader {
         *last = 0;
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            try {
-                const auto * weight = get_weight(ggml_get_name(tensor));
-                if (!weight) {
-                    continue;
-                }
-                if (weight->idx != idx) {
-                    continue;
-                }
-                *first = std::min(*first, weight->offs);
-                *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
-            } catch(...) {
-                // the tensor is not in the model
+            const auto * weight = get_weight(ggml_get_name(tensor));
+            if (!weight || weight->idx != idx) {
+                continue;
             }
+            *first = std::min(*first, weight->offs);
+            *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
         }
     }

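Note on this hunk: the rewrite appears to rely on get_weight() returning a null pointer for tensor names that are not in the model, which is why the try/catch can collapse into a plain null check combined with the split-index test. A minimal sketch of that lookup pattern, using simplified stand-in types rather than the actual llama.cpp declarations:

    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <string>

    // simplified stand-ins for llama.cpp's internal types
    struct llama_tensor_weight {
        uint16_t idx;  // index of the split file holding this tensor
        size_t   offs; // byte offset of the tensor data within that file
    };

    struct loader {
        std::map<std::string, llama_tensor_weight> weights_map;

        // returns nullptr instead of throwing when the name is unknown,
        // so callers can filter with a null check as in the hunk above
        const llama_tensor_weight * get_weight(const std::string & name) const {
            auto it = weights_map.find(name);
            return it == weights_map.end() ? nullptr : &it->second;
        }
    };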
@@ -5053,7 +5046,6 @@ struct llama_model_loader {
                     ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
-                GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
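The dropped GGML_ASSERT was redundant: std::vector::at() is itself bounds-checked and throws std::out_of_range on a bad index, so an invalid split index cannot slip through silently. A self-contained illustration of that failure mode:

    #include <cstdio>
    #include <stdexcept>
    #include <vector>

    int main() {
        std::vector<int> files = {1, 2, 3};
        try {
            (void) files.at(7); // out of range: at() throws instead of invoking UB
        } catch (const std::out_of_range & e) {
            std::fprintf(stderr, "bad split index: %s\n", e.what());
        }
        return 0;
    }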
@@ -18632,8 +18624,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }

+    // make a list of weights
+    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
+    tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
-        const struct ggml_tensor * tensor = it.second.tensor;
+        tensors.push_back(&it.second);
+    }
+
+    // keep_split requires that the weights are sorted by split index
+    if (params->keep_split) {
+        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
+            if (a->idx == b->idx) {
+                return a->offs < b->offs;
+            }
+            return a->idx < b->idx;
+        });
+    }
+
+    for (const auto * it : tensors) {
+        const struct ggml_tensor * tensor = it->tensor;

         const std::string name = ggml_get_name(tensor);

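The comparator introduced here orders weights by split index first and by offset within the split second, so that when params->keep_split is set, tensors are emitted in the same order they appear in the source files. The same lexicographic ordering can be written more compactly with std::tie; a sketch using a simplified stand-in struct, not the patch's actual types:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <tuple>
    #include <vector>

    struct weight_ref {
        uint16_t idx;  // split file index
        size_t   offs; // offset within that split
    };

    // equivalent to the lambda in the hunk above: (idx, offs) lexicographic order
    static bool weight_less(const weight_ref * a, const weight_ref * b) {
        return std::tie(a->idx, a->offs) < std::tie(b->idx, b->offs);
    }

    // usage sketch: std::sort(tensors.begin(), tensors.end(), weight_less);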
@@ -18673,22 +18682,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;

     uint16_t n_split = 1;
-    const auto & weights_map = ml.weights_map;

     // Assume split index is continuous
     if (params->keep_split) {
-        for (const auto & it : weights_map) {
-            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
+        for (const auto * it : tensors) {
+            n_split = std::max(uint16_t(it->idx + 1), n_split);
         }
-
     }
     std::vector<gguf_context*> ctx_outs(n_split, NULL);
     ctx_outs[0] = ctx_out;

     // populate the original tensors so we get an initial meta data
-    for (const auto & it : weights_map) {
-        uint16_t i_split = params->keep_split ? it.second.idx : 0;
-        struct ggml_tensor * tensor = it.second.tensor;
+    for (const auto * it : tensors) {
+        uint16_t i_split = params->keep_split ? it->idx : 0;
+        struct ggml_tensor * tensor = it->tensor;
         if (ctx_outs[i_split] == NULL) {
             ctx_outs[i_split] = gguf_init_empty();
         }
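Side note on n_split: since split indices are assumed continuous, the number of output splits is just the highest index plus one, hence the max-scan. Under keep_split the list was already sorted by idx at this point, so the back element would give the same answer; a small self-contained sketch comparing the two (simplified types, not patch code):

    #include <algorithm>
    #include <cstdio>
    #include <cstdint>
    #include <vector>

    struct weight_ref { uint16_t idx; };

    int main() {
        // assume already sorted by idx, as the sort in the earlier hunk guarantees
        weight_ref w0{0}, w1{1}, w2{2};
        std::vector<const weight_ref *> tensors = {&w0, &w1, &w1, &w2};

        // max-scan, as in the patch
        uint16_t n_split = 1;
        for (const auto * it : tensors) {
            n_split = std::max(uint16_t(it->idx + 1), n_split);
        }

        // equivalent shortcut once the list is sorted (hypothetical alternative)
        uint16_t n_split2 = tensors.empty() ? uint16_t(1) : uint16_t(tensors.back()->idx + 1);

        std::printf("%d %d\n", n_split, n_split2); // prints "3 3"
        return 0;
    }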
@@ -18735,8 +18742,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (const auto & it : weights_map) {
-        const auto & weight = it.second;
+    for (const auto * it : tensors) {
+        const auto & weight = *it;
         struct ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
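This write loop is where the earlier sort pays off: tensors arrive grouped by split index, so the single test weight.idx != cur_split is enough to close the current output file and rotate to the next one. A generic sketch of that group-by-key-over-a-sorted-sequence pattern, with hypothetical open_split/close_split helpers standing in for the patch's new_ofstream/close_ofstream:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct item { uint16_t idx; };

    // hypothetical helpers, placeholders for new_ofstream/close_ofstream
    static void open_split(uint16_t idx) { std::printf("open split %d\n", idx); }
    static void close_split()            { std::printf("close split\n"); }

    static void write_splits(const std::vector<item> & sorted_items) {
        int cur_split = -1; // no file open yet
        for (const auto & it : sorted_items) {
            if (int(it.idx) != cur_split) {
                if (cur_split >= 0) {
                    close_split();
                }
                open_split(it.idx);
                cur_split = it.idx;
            }
            // ... append this tensor's data to the current split here ...
        }
        if (cur_split >= 0) {
            close_split();
        }
    }

    int main() {
        write_splits({{0}, {0}, {1}, {2}}); // opens and closes three splits
        return 0;
    }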