@@ -16443,11 +16443,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1644316443 }
1644416444 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
1644516445 if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
16446+ else if (qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ4_XS;
1644616447 else new_type = GGML_TYPE_Q4_K;
1644716448 }
1644816449 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
1644916450 if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
16450- else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K;
16451+ else if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20 ) new_type = GGML_TYPE_Q4_K;
1645116452 else new_type = GGML_TYPE_Q5_K;
1645216453 }
1645316454 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
@@ -16456,7 +16457,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1645616457 }
1645716458 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
1645816459 if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
16459- else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K;
16460+ else if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20 ) new_type = GGML_TYPE_Q5_K;
1646016461 else new_type = GGML_TYPE_Q6_K;
1646116462 }
1646216463 else if (new_type != GGML_TYPE_Q8_0) {
@@ -16487,17 +16488,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1648716488 new_type = GGML_TYPE_IQ2_S;
1648816489 }
1648916490 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
16490- if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_S;
16491+ if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20 ) new_type = GGML_TYPE_IQ2_S;
1649116492 else new_type = GGML_TYPE_IQ3_XXS;
1649216493 }
16493- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
16494+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ) {
1649416495 new_type = GGML_TYPE_IQ3_XXS;
1649516496 }
16496- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
16497- if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
16497+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
16498+ if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20 ) new_type = GGML_TYPE_IQ3_XXS;
1649816499 else new_type = GGML_TYPE_IQ3_S;
1649916500 }
16500- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
16501+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
16502+ if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ3_S;
16501- new_type = GGML_TYPE_IQ4_XS;
16503+ else new_type = GGML_TYPE_IQ4_XS;
1650216504 }
1650316505 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
@@ -16550,10 +16552,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1655016552 }
1655116553 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1655216554 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16553- new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
16554- else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
16555+ new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
16556+ else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
16557+ }
16558+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
16559+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16560+ new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
16561+ else new_type = GGML_TYPE_Q4_K;
1655516562 }
16556- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
16563+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
1655716564 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
1655816565 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
1655916566 else new_type = GGML_TYPE_Q4_K;
@@ -16650,7 +16657,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1665016657 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1665116658 if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
1665216659 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16653- else new_type = difquant_three_eights_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16660+ else new_type = difquant_fl_more_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1665416661 }
1665516662 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1665616663 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16659,8 +16666,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1665916666 }
1666016667 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
1666116668 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16662- new_type = difquant_fl_more_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
16663- else new_type = difquant_three_eights_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16669+ new_type = difquant_first_last_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
16670+ else new_type = difquant_fl_more_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
1666416671 }
1666516672 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
1666616673 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16863,8 +16870,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1686316870 }
1686416871 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1686516872 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16866- new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16867- else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16873+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1686816874 }
1686916875 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1687016876 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16873,8 +16879,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1687316879 }
1687416880 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
1687516881 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16876- new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16877- else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16882+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
1687816883 }
1687916884 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
1688016885 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16947,8 +16952,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1694716952 }
1694816953 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1694916954 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16950- new_type = (difquant_fl_more_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16951- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16955+ new_type = (difquant_first_last_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16956+ else new_type = (difquant_fl_more_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1695216957 }
1695316958 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1695416959 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16957,8 +16962,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1695716962 }
1695816963 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
1695916964 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16960- new_type = (difquant_fl_more_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16961- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16965+ new_type = (difquant_first_last_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16966+ else new_type = (difquant_fl_more_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
1696216967 }
1696316968 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
1696416969 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17102,8 +17107,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1710217107 }
1710317108 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1710417109 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17105- new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
17106- else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
17110+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1710717111 }
1710817112 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1710917113 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17112,8 +17116,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1711217116 }
1711317117 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
1711417118 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17115- new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17116- else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17119+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
1711717120 }
1711817121 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
1711917122 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
0 commit comments