Skip to content

Commit 7889d1b

Browse files
committed
A bit more weight to shared experts for larger sizes
1 parent a5c7f9e commit 7889d1b

File tree

1 file changed

+24
-0
lines changed

1 file changed

+24
-0
lines changed

src/llama-quant.cpp

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -443,6 +443,30 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
443 443
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
444 444
new_type = GGML_TYPE_IQ2_S;
445 445
}
446+
} else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
447+
new_type = GGML_TYPE_Q5_K;
448+
if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
449+
new_type = GGML_TYPE_Q8_0;
450+
}
451+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
452+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
453+
++qs.i_ffn_down_shexp;
454+
} else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
455+
new_type = GGML_TYPE_Q5_K;
456+
if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
457+
new_type = GGML_TYPE_Q8_0;
458+
}
459+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
460+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
461+
++qs.i_ffn_gate_shexp;
462+
} else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
463+
new_type = GGML_TYPE_Q5_K;
464+
if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
465+
new_type = GGML_TYPE_Q8_0;
466+
}
467+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
468+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
469+
++qs.i_ffn_up_shexp;
446 470
} else if (name.find("ffn_down") != std::string::npos) {
447 471
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
448 472
int i_layer = info.first, n_layer = info.second;

0 commit comments

Comments (0)