Commit 8c9017b

Simplify IQ4_XSR
But leave in place, as a "demo", the more complex template set by Ikawrakow to customize the per-layer quants, extended with the attn_q, attn_k, and attn_output tensors.
1 parent 8c10533 commit 8c9017b
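
The "template" in question is a recurring ternary pattern in llama_tensor_get_type(): the first eighth of a tensor's layers get one quant type, use_more_bits() promotes a subset of the remaining layers, and everything else falls back to a base type. Below is a minimal standalone sketch of that pattern. pick_attn_v_type() is a hypothetical stand-in for the real logic (which returns ggml_type values and uses quantize_state_internal bookkeeping), and use_more_bits() is reproduced as defined in mainline llama.cpp at the time; the fork may differ.

// Sketch of the Ikawrakow-style per-layer quant template this commit keeps as a demo.
#include <cstdio>

// As in mainline llama.cpp: favor the first and last eighth of the layers,
// plus every third layer in between.
static bool use_more_bits(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
}

// Post-commit attn_v rule for IQ4_XSR (when n_gqa() >= 2 or n_expert >= 2):
// first n/8 layers -> Q5_K, use_more_bits() layers -> Q6_K, the rest -> Q5_K.
static const char * pick_attn_v_type(int i_layer, int n_layers) {
    return i_layer < n_layers/8             ? "Q5_K"
         : use_more_bits(i_layer, n_layers) ? "Q6_K"
         :                                    "Q5_K";
}

int main() {
    for (int i = 0; i < 32; ++i) { // e.g. a 32-layer model
        std::printf("attn_v layer %2d -> %s\n", i, pick_attn_v_type(i, 32));
    }
    return 0;
}

On a 32-layer model this assigns Q6_K to every third layer from layer 6 onward and to the last four layers, with Q5_K everywhere else.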

1 file changed

src/llama.cpp

Lines changed: 6 additions & 8 deletions
@@ -15508,7 +15508,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
                 new_type = GGML_TYPE_Q4_0;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q8_0;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_IQ4_XS;
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
@@ -15568,8 +15568,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
-                           use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
+                           use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
         }
         ++qs.i_attention_wv;
@@ -15622,7 +15622,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K :
-                           use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+                           use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
             }
         }
         ++qs.i_attention_wk;
@@ -15704,7 +15704,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K :
+                new_type = i_layer < n_layer/8 ? GGML_TYPE_IQ4_XS :
                            use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
             }
         }
@@ -15737,7 +15737,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_Q5_K :
+                new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_IQ4_XS :
                            use_more_bits(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
             }
         }
@@ -15783,7 +15783,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K;
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
@@ -15799,7 +15798,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K;
         ++qs.i_ffn_up;
     }
 
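
Net effect for LLAMA_FTYPE_MOSTLY_IQ4_XSR, reading the hunks together: the output tensor drops from Q8_0 to IQ4_XS; attn_v keeps a Q5_K/Q6_K bump pattern (the retained demo of the template); attn_k flattens to a uniform Q5_K; attn_output and the first eighth of the layer-indexed hunk at 15704 (ffn_down, judging by its position between attn_k and attn_output) fall back to IQ4_XS; and the Q5_K early-layer overrides for ffn_gate and ffn_up are removed outright.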
