Skip to content

Commit 71ab742

Browse files
committed
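A bit more weight to shared experts for larger sizes (the shared-expert FFN tensors `ffn_down_shexp`, `ffn_gate_shexp`, and `ffn_up_shexp` are given higher-precision quantization types — Q5_K, Q6_K, or Q8_0 — when the model has at least 8 experts)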
1 parent feae28b commit 71ab742

File tree

1 file changed

+24
-0
lines changed

1 file changed

+24
-0
lines changed

src/llama-quant.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,30 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
376376
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
377377
new_type = GGML_TYPE_IQ2_S;
378378
}
379+
} else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
380+
new_type = GGML_TYPE_Q5_K;
381+
if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
382+
new_type = GGML_TYPE_Q8_0;
383+
}
384+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
385+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
386+
++qs.i_ffn_down_shexp;
387+
} else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
388+
new_type = GGML_TYPE_Q5_K;
389+
if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
390+
new_type = GGML_TYPE_Q8_0;
391+
}
392+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
393+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
394+
++qs.i_ffn_gate_shexp;
395+
} else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
396+
new_type = GGML_TYPE_Q5_K;
397+
if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
398+
new_type = GGML_TYPE_Q8_0;
399+
}
400+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
401+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
402+
++qs.i_ffn_up_shexp;
379403
} else if (name.find("ffn_down") != std::string::npos) {
380404
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
381405
int i_layer = info.first, n_layer = info.second;

0 commit comments

Comments (0)