
Commit bd575f0

Revert tensors quantization tree edits
1 parent fc4ed23 commit bd575f0

File tree: 1 file changed (+111, -143 lines)

src/llama.cpp: 111 additions & 143 deletions
@@ -15381,179 +15381,147 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (qs.params->attn_v_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_v_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
-                new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-                new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-            }
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
-                new_type = GGML_TYPE_Q5_K;
-            }
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                    use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-            if (qs.model.type == MODEL_70B) {
-                // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                // nearly negligible increase in model size by quantizing this tensor with more bits:
-                if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-            }
-            if (qs.model.hparams.n_expert == 8) {
-                // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
-                // TODO: explore better strategies
-                new_type = GGML_TYPE_Q8_0;
-            }
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+        }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        if (qs.model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        if (qs.model.hparams.n_expert == 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.params->attn_k_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_k_type;
-        } else {
-            if (qs.model.hparams.n_expert == 8) {
-                // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
-                // TODO: explore better strategies
-                new_type = GGML_TYPE_Q8_0;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-                new_type = GGML_TYPE_IQ3_XXS;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-                new_type = GGML_TYPE_IQ2_S;
-            }
+        if (qs.model.hparams.n_expert == 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("attn_q.weight") != std::string::npos) {
-        if (qs.params->attn_q_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_q_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-                new_type = GGML_TYPE_IQ3_XXS;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-                new_type = GGML_TYPE_IQ2_S;
-            }
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (qs.params->ffn_down_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->ffn_down_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-                if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
-                new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
-                         : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
-                         : GGML_TYPE_Q3_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
-                        (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                if (arch == LLM_ARCH_FALCON) {
-                    new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
-                               use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                } else {
-                    if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
-                }
-            }
-            else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
-                new_type = GGML_TYPE_Q5_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
-                new_type = GGML_TYPE_Q5_K;
-            }
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
-                    && qs.has_imatrix && i_layer < n_layer/8) {
-                // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
-                // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
-                // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
+            new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
+                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+                && qs.has_imatrix && i_layer < n_layer/8) {
+            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+        }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (qs.params->attn_output_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_output_type;
-        } else {
-            if (arch != LLM_ARCH_FALCON) {
-                if (qs.model.hparams.n_expert == 8) {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                        ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
-                        ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                        ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
-                        new_type = GGML_TYPE_Q5_K;
-                    }
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
+        if (arch != LLM_ARCH_FALCON) {
+            if (qs.model.hparams.n_expert == 8) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
+                    new_type = GGML_TYPE_Q5_K;
                 }
             } else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
             }
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (qs.params->attn_qkv_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_qkv_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (qs.params->ffn_gate_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->ffn_gate_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
-                new_type = GGML_TYPE_IQ3_XXS;
-            }
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (qs.params->ffn_up_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->ffn_up_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
-                new_type = GGML_TYPE_IQ3_XXS;
-            }
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_up;
     }
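Every changed region above follows the same shape: the tree being reverted had wrapped each tensor branch in a guard of the form if (qs.params-><tensor>_type < GGML_TYPE_COUNT) { use the explicitly requested type } else { ... }, and the revert drops that guard and restores the original ftype-driven heuristics at their previous indentation. Below is a minimal, self-contained sketch of that guard pattern for the attn_v.weight case; the my_* names are hypothetical stand-ins for illustration, not the actual llama.cpp or ggml declarations.

// Sketch of the per-tensor override pattern removed by this revert.
// All names here are hypothetical; only the control flow mirrors the diff.
#include <cstdio>

enum my_type { MY_TYPE_Q3_K, MY_TYPE_Q4_K, MY_TYPE_Q5_K, MY_TYPE_COUNT };

struct my_quant_params {
    // MY_TYPE_COUNT doubles as the sentinel for "no override requested",
    // mirroring the comparison against GGML_TYPE_COUNT in the removed code.
    my_type attn_v_type = MY_TYPE_COUNT;
};

// An explicit override wins; otherwise fall back to a GQA-based heuristic,
// which is the shape of the code the revert restores.
static my_type pick_attn_v_type(const my_quant_params & params, int n_gqa) {
    if (params.attn_v_type < MY_TYPE_COUNT) {
        return params.attn_v_type;                        // override path (removed)
    }
    return n_gqa >= 4 ? MY_TYPE_Q4_K : MY_TYPE_Q3_K;      // heuristic path (kept)
}

int main() {
    my_quant_params params;
    printf("heuristic: %d\n", pick_attn_v_type(params, 8)); // MY_TYPE_Q4_K
    params.attn_v_type = MY_TYPE_Q5_K;
    printf("override:  %d\n", pick_attn_v_type(params, 8)); // MY_TYPE_Q5_K
    return 0;
}

After the revert, only the heuristic path remains in llama_tensor_get_type for these tensors, so the chosen type again depends solely on the requested ftype and the model and layer properties visible in the diff.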

0 commit comments
