@@ -16567,7 +16567,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
                     difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
-            else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
@@ -16710,7 +16710,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS :
                     difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             }
-            else difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         ++qs.i_attention_wq;
     } else if (name.find("attn_output.weight") != std::string::npos) {
@@ -16737,7 +16737,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_Q5_K :
                     difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             }
-            else difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         }
         else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)
@@ -16787,7 +16787,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
                     difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             }
-            else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         ++qs.i_attention_wv;
     } else if (name.find("ffn_gate") != std::string::npos) {
@@ -16983,7 +16983,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
                     difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             }
-            else difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
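All five hunks fix the same class of bug: in each else branch the ternary expression was written as a bare statement, so its result was computed and then discarded, and new_type silently kept whatever value an earlier branch had assigned instead of the intended fallback quantization type. Prepending "new_type =" makes the selection take effect. Below is a minimal standalone sketch of the pattern; the enum values and the difquant_* helper are hypothetical stand-ins for illustration, not the actual llama.cpp definitions.

#include <cstdio>

// Hypothetical stand-ins, assumed for this sketch only.
enum ggml_type { GGML_TYPE_Q5_K, GGML_TYPE_Q6_K };

// Stand-in for the difquant_* selectors: true for roughly 3/8 of the layers.
static bool difquant_three_eights_tensors(int i_layer, int n_layer) {
    return 8 * i_layer < 3 * n_layer;
}

int main() {
    ggml_type new_type = GGML_TYPE_Q5_K;
    const int i_layer = 1, n_layer = 8;

    // Before the fix: a bare ternary statement. The expression is evaluated,
    // its result is thrown away, and new_type is never updated.
    // Most compilers flag this with -Wunused-value.
    difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
    std::printf("without assignment: %d\n", (int) new_type); // still Q5_K (0)

    // After the fix: the result is assigned, so the fallback type takes effect.
    new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
    std::printf("with assignment:    %d\n", (int) new_type); // now Q6_K (1)
    return 0;
}

The original no-op form compiles cleanly apart from an unused-value warning, which is why this kind of bug is easy to miss in a long if/else chain that otherwise assigns new_type on every path.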