@@ -443,6 +443,30 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_down_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_gate_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_up_shexp;
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
0 commit comments