@@ -376,6 +376,30 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_down_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_gate_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_up_shexp;
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
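
For readers skimming the hunk: each new branch promotes the shared-expert FFN tensors (`ffn_down_shexp`, `ffn_gate_shexp`, `ffn_up_shexp`) of MoE models with at least 8 experts from a Q5_K baseline, using `use_more_bits()` to pick which tensors in the sequence get the full Q8_0 bump. Below is a minimal standalone sketch of that selection pattern, assuming `use_more_bits()` keeps its upstream llama.cpp shape (first and last eighth of the sequence, plus every third tensor in between, get the extra bits); the tensor count used here is hypothetical.

```cpp
// Sketch: how use_more_bits() spreads higher-bit quantization across a
// tensor sequence. The helper body mirrors the upstream llama.cpp
// definition as best recalled; treat the exact thresholds as an assumption.
#include <cstdio>

static bool use_more_bits(int i_layer, int n_layers) {
    // First eighth, last eighth, and every third layer in between
    // are quantized with more bits.
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
}

int main() {
    const int n_ffn_down_shexp = 24; // hypothetical shared-expert tensor count
    for (int i = 0; i < n_ffn_down_shexp; ++i) {
        // Q5_K is the baseline; use_more_bits() promotes selected tensors
        // to Q8_0, matching the pattern in the diff above.
        std::printf("ffn_down_shexp[%2d] -> %s\n", i,
                    use_more_bits(i, n_ffn_down_shexp) ? "Q8_0" : "Q5_K");
    }
    return 0;
}
```

Note that the ftype-specific overrides in the diff (Q6_K for Q4_K_M and Q5_K_S, Q8_0 for Q5_K_M) sit in `else if` branches, so they apply only to tensors for which `use_more_bits()` returned false.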