@@ -16621,14 +16621,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (qs.model.hparams.n_gqa() >= 7) {
+        // else if (qs.model.hparams.n_gqa() >= 7) {
             // The Llama 70B models have 8 heads sharing the same attn_v weights (-> GQA 8). As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits.
             // That logic applies also to models like Yi 34B (-> GQA 7) and Mistral Large 123B (-> GQA 12).
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
-                new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
-        }
+            // if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
+            //     new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
+        // }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
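The comment block above carries the rationale for the (now disabled) blanket GQA bump: under grouped-query attention, attn_v.weight is n_gqa() times smaller than attn_q.weight, so spending extra bits on it is nearly free. A minimal, self-contained sketch of that size arithmetic follows; the head counts, embedding width, and bits-per-weight figures are assumptions for a Llama-70B-class layer, not values taken from this patch:

```cpp
// Size arithmetic behind the disabled GQA bump, as a standalone sketch.
// All model figures below are assumed, not read from this commit.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_head    = 64;                 // query heads (assumed)
    const uint32_t n_head_kv = 8;                  // KV heads -> GQA 8
    const uint32_t n_gqa     = n_head / n_head_kv; // mirrors llama_hparams::n_gqa()

    const uint64_t n_embd   = 8192;                      // embedding width (assumed)
    const uint64_t q_params = n_embd * n_embd;           // attn_q.weight elements
    const uint64_t v_params = n_embd * (n_embd / n_gqa); // attn_v.weight has n_gqa x fewer rows

    // Q4_K is ~4.5 bpw and Q5_K ~5.5 bpw, so the bump costs about one extra
    // bit per element, but only on the small tensor.
    const double extra_mib = (double) v_params / 8.0 / (1024.0 * 1024.0);
    printf("GQA %u: attn_v is %llux smaller than attn_q; +1 bpw on it costs ~%.1f MiB per layer\n",
           n_gqa, (unsigned long long) (q_params / v_params), extra_mib);
    return 0;
}
```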
@@ -16650,30 +16650,43 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             else new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+            else new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q4_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) &&
                  (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
                            difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
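Every branch added in this hunk repeats the same three-tier ladder over n_gqa(): a top type at GQA >= 8, a middle type at GQA >= 2 or for MoE models (n_expert >= 2), and a base type otherwise; only the three types vary per ftype. A self-contained sketch of that ladder follows, where the Hparams struct and the gqa_tiered_type helper are illustrative stand-ins rather than code from the patch, which inlines the logic in each branch:

```cpp
// Condensed sketch of the GQA tier ladder used by the added branches.
// Hparams and gqa_tiered_type are stand-ins invented for illustration.
#include <cstdint>
#include <cstdio>

enum class QType { Q4_K, Q5_K, Q6_K };

struct Hparams {
    uint32_t n_head, n_head_kv, n_expert;
    uint32_t n_gqa() const { return n_head / n_head_kv; }
};

static QType gqa_tiered_type(const Hparams & hp, QType top, QType mid, QType base) {
    if (hp.n_gqa() >= 8) return top;                      // e.g. Llama 70B, Mistral Large
    if (hp.n_gqa() >= 2 || hp.n_expert >= 2) return mid;  // moderate GQA, or MoE
    return base;                                          // MHA-style models
}

int main() {
    const Hparams llama70b  = { 64,  8, 0 };  // GQA 8 (assumed head counts)
    const Hparams mha_model = { 32, 32, 0 };  // GQA 1
    // Mirrors the IQ3_XL/IQ3_XXL attn_v branch: Q6_K / Q5_K / Q4_K.
    printf("GQA 8 model -> tier %d\n", (int) gqa_tiered_type(llama70b,  QType::Q6_K, QType::Q5_K, QType::Q4_K));
    printf("MHA model   -> tier %d\n", (int) gqa_tiered_type(mha_model, QType::Q6_K, QType::Q5_K, QType::Q4_K));
    return 0;
}
```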
@@ -16846,41 +16859,49 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K :
                            difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
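The difquant_* predicates used throughout these attn_k branches are helpers specific to this fork: given tensor index i of n, they mark some share of the layers (first/last, three eighths, half, five eighths, six eighths, ...) for the stronger quant type. Their definitions are not part of this hunk, so the sketch below is a guessed reconstruction of a "half" variant only, with the layer mask (first and last quarter) purely assumed:

```cpp
// Guessed reconstruction of one difquant_* selector; the real fork may use
// a different layer mask. Shown only to illustrate the calling pattern.
#include <cstdio>

// Hypothetical: bump the first and last quarter of layers (half of them total).
static bool difquant_half_tensors(int i_layer, int n_layers) {
    return i_layer < n_layers/4 || i_layer >= 3*n_layers/4;
}

int main() {
    const int n_layers = 32;
    for (int i = 0; i < n_layers; ++i) {
        // Mirrors e.g. the IQ3_M attn_k branch: IQ4_XS where the predicate
        // holds, IQ3_S elsewhere.
        printf("layer %2d -> %s\n", i, difquant_half_tensors(i, n_layers) ? "IQ4_XS" : "IQ3_S");
    }
    return 0;
}
```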
@@ -16992,8 +17013,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;