@@ -15508,7 +15508,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
                 new_type = GGML_TYPE_Q4_0;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q8_0 ;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_IQ4_XS ;
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
@@ -15568,8 +15568,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
-                    use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
+                    use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
         }
         ++qs.i_attention_wv;
@@ -15622,7 +15622,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K :
-                    use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS ;
+                    use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K ;
             }
         }
         ++qs.i_attention_wk;
@@ -15704,7 +15704,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K :
+                new_type = i_layer < n_layer/8 ? GGML_TYPE_IQ4_XS :
                     use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
             }
         }
@@ -15737,7 +15737,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_Q5_K :
+                new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_IQ4_XS :
                     use_more_bits(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
             }
         }
@@ -15783,7 +15783,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K;
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
@@ -15799,7 +15798,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K;
         ++qs.i_ffn_up;
     }
 
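For reference, every hunk above keys off the same layer-bucketing idiom: the first eighth of layers gets one type, layers selected by use_more_bits get another, and the rest fall through to a default. Below is a minimal standalone sketch of that pattern, assuming the use_more_bits definition from upstream llama.cpp of this era; the fake_type enum and pick_attn_v_type wrapper are illustrative stand-ins, not code from this patch.

#include <cstdio>

// use_more_bits as defined in upstream llama.cpp (assumed unchanged in this fork):
// true for the first eighth of layers, the last eighth, and every third layer
// in between -- the layers judged most sensitive to quantization error.
static bool use_more_bits(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
}

// Illustrative stand-in for the ggml_type constants used in the patch.
enum fake_type { Q5_K, Q6_K };

// Mirrors the post-patch attn_v.weight ternary for IQ4_XSR:
// first eighth -> Q5_K, use_more_bits layers -> Q6_K, everything else -> Q5_K.
static fake_type pick_attn_v_type(int i_layer, int n_layers) {
    return i_layer < n_layers/8 ? Q5_K
         : use_more_bits(i_layer, n_layers) ? Q6_K
         : Q5_K;
}

int main() {
    const int n_layers = 32;
    for (int i = 0; i < n_layers; ++i) {
        printf("layer %2d -> %s\n", i, pick_attn_v_type(i, n_layers) == Q6_K ? "Q6_K" : "Q5_K");
    }
    return 0;
}

Note the ordering: because the i_layer < n_layers/8 arm is tested first, the first eighth of layers lands on Q5_K even though use_more_bits would also match there, so after this commit only the last eighth and every third middle layer are promoted to Q6_K.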