@@ -4489,6 +4489,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.70 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S mix - 3.95 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XSR: return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
@@ -15347,10 +15348,17 @@ struct quantize_state_internal {
     const llama_model_quantize_params * params;
 
     int n_attention_wv = 0;
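+    // counters for the K/Q/O projections, mirroring n_attention_wv; the IQ4_XSR
+    // rules in llama_tensor_get_type use them to vary the type across layers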
+    int n_attention_wk = 0;
+    int n_attention_wq = 0;
+    int n_attention_wo = 0;
     int n_ffn_down = 0;
     int n_ffn_gate = 0;
     int n_ffn_up = 0;
+
     int i_attention_wv = 0;
+    int i_attention_wk = 0;
+    int i_attention_wq = 0;
+    int i_attention_wo = 0;
     int i_ffn_down = 0;
     int i_ffn_gate = 0;
     int i_ffn_up = 0;
@@ -15505,6 +15513,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
                 new_type = GGML_TYPE_Q4_0;
             }
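+            // IQ4_XSR: use Q8_0 for the output tensor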
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q8_0;
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
@@ -15556,9 +15565,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
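+        // IQ4_XSR: on GQA/MoE models, give attn_v Q6_K in the first 1/8 of layers and Q5_K elsewhere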
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            }
+        }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
@@ -15606,6 +15624,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
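+        // IQ4_XSR: on GQA/MoE models, give attn_k Q5_K in the first 1/8 of layers and IQ4_XS elsewhere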
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            }
+        }
+        ++qs.i_attention_wk;
     } else if (name.find("attn_q.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
@@ -15618,6 +15643,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K;
         }
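+        // IQ4_XSR: keep attn_q at IQ4_XS on GQA/MoE models, drop it to IQ3_S otherwise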
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = GGML_TYPE_IQ4_XS;
+            } else {
+                new_type = GGML_TYPE_IQ3_S;
+            }
+        }
+        ++qs.i_attention_wq;
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
@@ -15674,6 +15707,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
             new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
         }
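+        // IQ4_XSR: on GQA/MoE models, give ffn_down Q5_K in the first 1/8 of layers and IQ4_XS elsewhere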
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            }
+        }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
@@ -15682,6 +15721,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                     ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
@@ -15700,10 +15740,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
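+                // IQ4_XSR: on GQA/MoE models, give attn_output Q5_K in the first 1/8 of layers and IQ4_XS elsewhere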
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+                    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                        new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+                    }
+                }
             }
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
         }
+        ++qs.i_attention_wo;
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -15723,8 +15770,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
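+        // fused QKV tensors are counted in n_attention_wv, so advance that counter here as well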
+        ++qs.i_attention_wv;
     }
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
@@ -15739,6 +15788,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K;
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
@@ -15754,6 +15804,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K;
         ++qs.i_ffn_up;
     }
 
@@ -15900,6 +15951,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ1_XL: default_type = GGML_TYPE_IQ1_M; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
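+        // IQ4_XSR starts from IQ4_XS; the per-tensor rules in llama_tensor_get_type raise selected tensors above that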
+        case LLAMA_FTYPE_MOSTLY_IQ4_XSR: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XL: default_type = GGML_TYPE_IQ3_S; break;
@@ -15998,6 +16050,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos ||
             name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
+        } else if (name.find("attn_k.weight") != std::string::npos) {
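+            // tally per-projection totals for the layer-dependent IQ4_XSR rules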
+            ++qs.n_attention_wk;
+        } else if (name.find("attn_q.weight") != std::string::npos) {
+            ++qs.n_attention_wq;
+        } else if (name.find("attn_output.weight") != std::string::npos) {
+            ++qs.n_attention_wo;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
@@ -16012,6 +16070,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models
     //
     GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
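+    // the new per-projection counts should follow the same pattern: 0, n_layer, or 3 * n_layer for encoder-decoder models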
+    GGML_ASSERT((qs.n_attention_wk == 0 || qs.n_attention_wk == (int)model.hparams.n_layer || qs.n_attention_wk == 3 * (int)model.hparams.n_layer) && "n_attention_wk is unexpected");
+    GGML_ASSERT((qs.n_attention_wq == 0 || qs.n_attention_wq == (int)model.hparams.n_layer || qs.n_attention_wq == 3 * (int)model.hparams.n_layer) && "n_attention_wq is unexpected");
+    GGML_ASSERT((qs.n_attention_wo == 0 || qs.n_attention_wo == (int)model.hparams.n_layer || qs.n_attention_wo == 3 * (int)model.hparams.n_layer) && "n_attention_wo is unexpected");
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;