88 changes: 53 additions & 35 deletions src/llama.cpp
@@ -15769,44 +15769,64 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
}
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS) {
ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS) { // TODO: Possibly spread this tree of FTYPEs into their respective tensor categories.
if (name.find("attn_v.weight") != std::string::npos) {
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_K;
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_K;
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ4_K;
else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_K;
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
++qs.i_attention_wv;
}
else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
new_type = GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_expert >= 2 && name.find("attn_k.weight") != std::string::npos) {
new_type = GGML_TYPE_IQ4_KS;
}
else if (qs.model.hparams.n_expert >= 2 && name.find("attn_q.weight") != std::string::npos) {
new_type = GGML_TYPE_IQ3_XXS;
}
else if (name.find("attn_qkv.weight") != std::string::npos) {
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_K;
}
else if (name.find("ffn_down") != std::string::npos) {
if (qs.i_ffn_down < qs.n_ffn_down/8) {
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
if (qs.i_ffn_down < qs.n_ffn_down/4 || qs.i_ffn_down >= 7*qs.n_ffn_down/8) {
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
}
++qs.i_ffn_down;
}
else if (name.find("attn_output.weight") != std::string::npos) {
if (qs.model.hparams.n_expert == 8) {
if (qs.model.hparams.n_expert >= 4) {
new_type = GGML_TYPE_Q5_K;
}
else if (qs.model.hparams.n_expert >= 2) {
new_type = GGML_TYPE_Q4_K;
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_KS;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS) new_type = GGML_TYPE_IQ2_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
}
}
} else if (name.find("attn_v.weight") != std::string::npos) {
if (qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
else if (qs.model.type == MODEL_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
}
else if (qs.model.hparams.n_expert >= 4) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
else if (qs.model.hparams.n_expert >= 2 || new_type != GGML_TYPE_IQ6_K) {
new_type = GGML_TYPE_Q6_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_K) {
new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ4_K : GGML_TYPE_IQ3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ3_K
: !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
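Not part of the diff: a minimal sketch of the size arithmetic behind the GQA comment above ("8 heads sharing the same attn_v weights"), assuming Llama-2-70B-like dimensions. The hidden size and head counts below are assumptions for illustration, not values read from this patch.

// Illustrative only (assumed Llama-2-70B-like shapes, not part of the patch).
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd      = 8192;                     // hidden size (assumed)
    const int64_t n_head      = 64;                       // query heads (assumed)
    const int64_t n_head_kv   = 8;                        // KV heads shared via GQA (assumed)
    const int64_t n_embd_head = n_embd / n_head;          // 128 dims per head
    const int64_t n_embd_gqa  = n_embd_head * n_head_kv;  // 1024

    const int64_t q_elems = n_embd * n_embd;              // attn_q.weight elements
    const int64_t v_elems = n_embd * n_embd_gqa;          // attn_v.weight elements
    // Prints 8: attn_v.weight holds n_head/n_head_kv times fewer weights than
    // attn_q.weight, so giving it extra bits barely moves the total file size.
    std::printf("attn_q / attn_v element ratio: %lld\n", (long long)(q_elems / v_elems));
    return 0;
}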
@@ -15840,19 +15860,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.type == MODEL_8B || qs.model.type == MODEL_70B))
new_type = GGML_TYPE_Q6_K;
}
if (qs.model.type == MODEL_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
}
if (qs.model.hparams.n_expert == 8) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
else if (qs.model.hparams.n_gqa() >= 4) {
else if (qs.model.hparams.n_gqa() >= 4) { // TODO: Possibly reintegrate this into FTYPE conditionality
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
else if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_IQ3_S ) new_type = GGML_TYPE_Q4_K;
else if (new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
@@ -15862,23 +15871,29 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
++qs.i_attention_wv;
} else if (name.find("attn_k.weight") != std::string::npos) {
if (qs.params->attn_k_type < GGML_TYPE_COUNT) new_type = qs.params->attn_k_type;
else if (qs.model.hparams.n_expert == 8) {
else if (qs.model.hparams.n_expert >= 4) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
else if (qs.model.hparams.n_expert >= 2 || new_type != GGML_TYPE_IQ6_K) {
new_type = GGML_TYPE_Q6_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
new_type = GGML_TYPE_IQ3_XXS;
if (qs.model.hparams.n_gqa() <= 2) new_type = GGML_TYPE_IQ3_XXS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ2_S;
if (qs.model.hparams.n_gqa() <= 2) new_type = GGML_TYPE_IQ2_S;
}
} else if (name.find("attn_q.weight") != std::string::npos) {
if (qs.params->attn_q_type < GGML_TYPE_COUNT) new_type = qs.params->attn_q_type;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && qs.model.hparams.n_expert <= 2) {
new_type = GGML_TYPE_IQ3_XXS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_expert <= 2) {
new_type = GGML_TYPE_IQ2_S;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
@@ -15891,7 +15906,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
if (i_layer < n_layer/8 || i_layer >= 7*n_layer/8) new_type = GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -15902,7 +15917,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
: GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
(qs.model.hparams.n_expert >= 2 && use_more_bits(i_layer, n_layer)))) {
new_type = GGML_TYPE_IQ4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -15938,7 +15953,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
} else if (name.find("attn_output.weight") != std::string::npos) {
if (qs.params->attn_output_type < GGML_TYPE_COUNT) new_type = qs.params->attn_output_type;
else if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert >= 8) {
if (qs.model.hparams.n_expert >= 2) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
@@ -15961,6 +15976,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
}
else if (name.find("attn_qkv.weight") != std::string::npos) {
if (qs.params->attn_qkv_type < GGML_TYPE_COUNT) new_type = qs.params->attn_qkv_type;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
new_type = GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
new_type = GGML_TYPE_Q4_K;
}
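Also not part of the diff: a rough check of the "~128MB" figure quoted in the expert-model comments, assuming Mixtral-8x7B-like dimensions and a jump from roughly 4.5 bits per weight (Q4_K-class) to 8.5 bits per weight (Q8_0) on attn_k.weight and attn_v.weight. Every constant below is an assumption for illustration.

// Illustrative only: estimated extra file size from bumping attn_k + attn_v to Q8_0
// in an 8-expert, Mixtral-8x7B-like model (all dimensions assumed).
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd     = 4096;                           // hidden size (assumed)
    const int64_t n_head     = 32;                             // query heads (assumed)
    const int64_t n_head_kv  = 8;                              // KV heads (assumed)
    const int64_t n_layer    = 32;                             // layers (assumed)
    const int64_t n_embd_gqa = (n_embd / n_head) * n_head_kv;  // 1024

    // attn_k.weight and attn_v.weight are each n_embd x n_embd_gqa per layer.
    const int64_t elems = 2 * n_embd * n_embd_gqa * n_layer;   // ~268M weights

    const double bpw_before = 4.5;  // Q4_K-class estimate (assumed baseline)
    const double bpw_after  = 8.5;  // Q8_0: 8 bits plus a per-block scale
    const double extra_mib  = elems * (bpw_after - bpw_before) / 8.0 / (1024.0 * 1024.0);
    std::printf("extra size: ~%.0f MiB\n", extra_mib);         // prints ~128 MiB
    return 0;
}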