@@ -4864,19 +4864,12 @@ struct llama_model_loader {
         *last = 0;
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            try {
-                const auto * weight = get_weight(ggml_get_name(tensor));
-                if (!weight) {
-                    continue;
-                }
-                if (weight->idx != idx) {
-                    continue;
-                }
-                *first = std::min(*first, weight->offs);
-                *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
-            } catch(...) {
-                // the tensor is not in the model
+            const auto * weight = get_weight(ggml_get_name(tensor));
+            if (!weight || weight->idx != idx) {
+                continue;
             }
+            *first = std::min(*first, weight->offs);
+            *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
         }
     }
 
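For context on the simplification: `get_weight()` in the new code signals a missing tensor by returning `nullptr` rather than throwing, so the `try`/`catch` wrapper becomes dead weight. A minimal sketch of that lookup contract, with simplified types that stand in for the actual loader struct:

```cpp
// Sketch only: assumes weights_map is a std::map<std::string, llama_tensor_weight>;
// the real llama_model_loader carries more state than shown here.
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>

struct llama_tensor_weight {
    uint16_t idx;  // index of the source split file
    size_t   offs; // offset of the tensor data in that file
};

struct loader_sketch {
    std::map<std::string, llama_tensor_weight> weights_map;

    // nullptr means "tensor not in the model" -- callers can just skip it,
    // so no exception handling is needed at the call site
    const llama_tensor_weight * get_weight(const char * name) const {
        auto it = weights_map.find(name);
        return it == weights_map.end() ? nullptr : &it->second;
    }
};
```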
@@ -5053,7 +5046,6 @@ struct llama_model_loader {
                     ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
-                GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
@@ -18631,8 +18623,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
+    // make a list of weights
+    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
+    tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
-        const struct ggml_tensor * tensor = it.second.tensor;
+        tensors.push_back(&it.second);
+    }
+
+    // keep_split requires that the weights are sorted by split index
+    if (params->keep_split) {
+        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
+            if (a->idx == b->idx) {
+                return a->offs < b->offs;
+            }
+            return a->idx < b->idx;
+        });
+    }
+
+    for (const auto * it : tensors) {
+        const struct ggml_tensor * tensor = it->tensor;
 
         const std::string name = ggml_get_name(tensor);
 
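The comparator orders weights by `(idx, offs)` lexicographically: split index first, file offset as the tie-breaker. An equivalent formulation with `std::tie`, shown only as a sketch of the intent (`weight_ref` is a stand-in for `llama_tensor_weight`, not a proposed change):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <tuple>
#include <vector>

struct weight_ref { uint16_t idx; size_t offs; }; // stand-in for llama_tensor_weight

// Same ordering as the lambda in the diff: by split index, then by offset,
// expressed via tuple comparison instead of an explicit branch.
static void sort_by_split(std::vector<const weight_ref *> & tensors) {
    std::sort(tensors.begin(), tensors.end(), [](const weight_ref * a, const weight_ref * b) {
        return std::tie(a->idx, a->offs) < std::tie(b->idx, b->offs);
    });
}
```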
@@ -18672,22 +18681,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;
 
     uint16_t n_split = 1;
-    const auto & weights_map = ml.weights_map;
 
     // Assume split index is continuous
     if (params->keep_split) {
-        for (const auto & it : weights_map) {
-            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
+        for (const auto * it : tensors) {
+            n_split = std::max(uint16_t(it->idx + 1), n_split);
         }
-
     }
     std::vector<gguf_context*> ctx_outs(n_split, NULL);
     ctx_outs[0] = ctx_out;
 
     // populate the original tensors so we get an initial meta data
-    for (const auto & it : weights_map) {
-        uint16_t i_split = params->keep_split ? it.second.idx : 0;
-        struct ggml_tensor * tensor = it.second.tensor;
+    for (const auto * it : tensors) {
+        uint16_t i_split = params->keep_split ? it->idx : 0;
+        struct ggml_tensor * tensor = it->tensor;
         if (ctx_outs[i_split] == NULL) {
             ctx_outs[i_split] = gguf_init_empty();
         }
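`n_split` is still computed with a max-scan, which works whether or not `tensors` is sorted; when `keep_split` is set, the same value could also be read off the last element of the now-sorted vector. A sketch under the hunk's own "split index is continuous" assumption, reusing the hypothetical `weight_ref` from above:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct weight_ref { uint16_t idx; size_t offs; }; // stand-in, as above

// Number of output splits = highest split index + 1; the scan does not
// depend on ordering, so it also covers the !keep_split case safely.
static uint16_t count_splits(const std::vector<const weight_ref *> & tensors) {
    uint16_t n_split = 1;
    for (const auto * w : tensors) {
        n_split = std::max(uint16_t(w->idx + 1), n_split);
    }
    return n_split;
}
```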
@@ -18734,8 +18741,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (const auto & it : weights_map) {
-        const auto & weight = it.second;
+    for (const auto * it : tensors) {
+        const auto & weight = *it;
         struct ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
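The write loop switches output files whenever the split index changes, which is only correct because `tensors` was sorted by `idx` beforehand. A hypothetical skeleton of that rollover pattern (names simplified, not the actual quantize code):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct weight_ref { uint16_t idx; size_t offs; }; // stand-in, as above

// With tensors sorted by idx, each split index appears as one contiguous run,
// so a single "close current, open next" check per tensor is sufficient.
static void write_splits(const std::vector<const weight_ref *> & tensors, bool keep_split) {
    int cur_split = 0;
    for (const auto * w : tensors) {
        if (keep_split && w->idx != cur_split) {
            // close the current output file, open the one for split w->idx
            cur_split = w->idx;
            std::printf("switching to split %d\n", cur_split);
        }
        // ... quantize and write the tensor to the current split ...
    }
}
```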