Skip to content

Commit 16e9c37

Browse files
committed
various corrections on IQ2_S+ and IQ3 quants
1 parent 380b53d commit 16e9c37

File tree

1 file changed

+28
-25
lines changed

1 file changed

+28
-25
lines changed

src/llama.cpp

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -16443,11 +16443,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1644316443
}
1644416444
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
1644516445
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
16446+
else if (qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ4_XS;
1644616447
else new_type = GGML_TYPE_Q4_K;
1644716448
}
1644816449
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
1644916450
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
16450-
else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K;
16451+
else if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_Q4_K;
1645116452
else new_type = GGML_TYPE_Q5_K;
1645216453
}
1645316454
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
@@ -16456,7 +16457,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1645616457
}
1645716458
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
1645816459
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
16459-
else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K;
16460+
else if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_Q5_K;
1646016461
else new_type = GGML_TYPE_Q6_K;
1646116462
}
1646216463
else if (new_type != GGML_TYPE_Q8_0) {
@@ -16487,17 +16488,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1648716488
new_type = GGML_TYPE_IQ2_S;
1648816489
}
1648916490
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
16490-
if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_S;
16491+
if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ2_S;
1649116492
else new_type = GGML_TYPE_IQ3_XXS;
1649216493
}
16493-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
16494+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1649416495
new_type = GGML_TYPE_IQ3_XXS;
1649516496
}
16496-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
16497-
if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
16497+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
16498+
if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ3_XXS;
1649816499
else new_type = GGML_TYPE_IQ3_S;
1649916500
}
16500-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
16501+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
16502+
if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ3_S;
1650116503
new_type = GGML_TYPE_IQ4_XS;
1650216504
}
1650316505
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
@@ -16550,10 +16552,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1655016552
}
1655116553
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1655216554
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16553-
new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
16554-
else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
16555+
new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
16556+
else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
16557+
}
16558+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
16559+
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16560+
new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
16561+
else new_type = GGML_TYPE_Q4_K;
1655516562
}
16556-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
16563+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
1655716564
ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
1655816565
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
1655916566
else new_type = GGML_TYPE_Q4_K;
@@ -16650,7 +16657,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1665016657
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1665116658
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
1665216659
new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16653-
else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16660+
else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1665416661
}
1665516662
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1665616663
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16659,8 +16666,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1665916666
}
1666016667
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
1666116668
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16662-
new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
16663-
else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16669+
new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
16670+
else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
1666416671
}
1666516672
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
1666616673
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16863,8 +16870,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1686316870
}
1686416871
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1686516872
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16866-
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16867-
else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16873+
new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1686816874
}
1686916875
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1687016876
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16873,8 +16879,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1687316879
}
1687416880
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
1687516881
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16876-
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16877-
else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16882+
new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
1687816883
}
1687916884
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
1688016885
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16947,8 +16952,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1694716952
}
1694816953
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1694916954
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16950-
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16951-
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16955+
new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16956+
else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1695216957
}
1695316958
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1695416959
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16957,8 +16962,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1695716962
}
1695816963
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
1695916964
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16960-
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16961-
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16965+
new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16966+
else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
1696216967
}
1696316968
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
1696416969
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17102,8 +17107,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1710217107
}
1710317108
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1710417109
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17105-
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
17106-
else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
17110+
new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1710717111
}
1710817112
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1710917113
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17112,8 +17116,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1711217116
}
1711317117
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
1711417118
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17115-
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17116-
else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17119+
new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
1711717120
}
1711817121
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
1711917122
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)

0 commit comments

Comments
 (0)