@@ -8829,6 +8829,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
88298829 auto use_more_bits = [](int i_layer, int num_layers) -> bool { // predicate on a layer index: true for the first eighth, the last eighth, and every third layer in between
88308830 return i_layer < num_layers/8 || i_layer >= 7 *num_layers/8 || (i_layer - num_layers/8 )%3 == 2 ; // the "every third" phase is counted from the end of the first eighth (num_layers/8)
88318831 };
8832+ const int n_expert = std::max (1 , (int )qs.model .hparams .n_expert );
8833+ auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { // returns the (layer index, layer count) pair to use for the tensor called `name`
8834+ if (n_expert > 1 ) { // multi-expert model: the caller-supplied i_layer is ignored and re-derived below
8835+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
8836+ // sprinkled in the model. Hence, simply dividing the running tensor counter (e.g. i_ffn_down) by n_expert
8837+ // does not work for getting the current layer as I initially thought, and we need to resort to parsing the
8838+ // tensor name.
8839+ n_layer /= n_expert; // presumably the incoming count includes one tensor per expert - TODO confirm against the caller
8840+ if (sscanf (name, " blk.%d." , &i_layer) != 1 ) { // overwrite i_layer with the block index parsed from the tensor name
8841+ throw std::runtime_error (format (" Failed to determine layer for tensor %s" , name));
8842+ }
8843+ if (i_layer < 0 || i_layer >= n_layer) { // parsed index must fall inside [0, n_layer) after the division above
8844+ throw std::runtime_error (format (" Bad layer %d for tensor %s. Must be in [0, %d)" , i_layer, name, n_layer));
8845+ }
8846+ }
8847+ return std::make_pair (i_layer, n_layer); // when n_expert <= 1 both inputs are returned unchanged
8848+ };
88328849
88338850 if (name == tn (LLM_TENSOR_OUTPUT, " weight" )) {
88348851 int nx = tensor->ne [0 ];
@@ -8890,24 +8907,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
88908907 new_type = GGML_TYPE_Q2_K;
88918908 }
88928909 } else if (name.find (" ffn_down" ) != std::string::npos) {
8893- const int n_expert = std::max (1 , (int )qs.model .hparams .n_expert );
8894- int i_layer, n_layer;
8895- if (n_expert == 1 ) {
8896- i_layer = qs.i_ffn_down ;
8897- n_layer = qs.n_ffn_down ;
8898- } else {
8899- // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
8900- // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
8901- // for getting the current layer as I initially thought, and we need to resort to parsing the
8902- // tensor name.
8903- n_layer = qs.n_ffn_down / n_expert;
8904- if (sscanf (name.c_str (), " blk.%d.ffn_down" , &i_layer) != 1 ) {
8905- throw std::runtime_error (format (" Failed to determine layer for tensor %s" , name.c_str ()));
8906- }
8907- if (i_layer < 0 || i_layer >= n_layer) {
8908- throw std::runtime_error (format (" Bad layer %d for tensor %s. Must be in [0, %d)" , i_layer, name.c_str (), n_layer));
8909- }
8910- }
8910+ auto info = layer_info (qs.i_ffn_down , qs.n_ffn_down , name.c_str ());
8911+ int i_layer = info.first , n_layer = info.second ;
89118912 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
89128913 else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
89138914 if (i_layer < n_layer/8 ) new_type = GGML_TYPE_Q4_K;
@@ -8963,13 +8964,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
89638964 else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
89648965 }
89658966 else if (name.find (" ffn_gate" ) != std::string::npos) {
8966- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits (qs.i_ffn_gate , qs.n_ffn_gate )) {
8967+ auto info = layer_info (qs.i_ffn_gate , qs.n_ffn_gate , name.c_str ());
8968+ int i_layer = info.first , n_layer = info.second ;
8969+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits (i_layer, n_layer)) {
89678970 new_type = GGML_TYPE_Q2_K;
89688971 }
89698972 ++qs.i_ffn_gate ;
89708973 }
89718974 else if (name.find (" ffn_up" ) != std::string::npos) {
8972- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits (qs.i_ffn_up , qs.n_ffn_up )) {
8975+ auto info = layer_info (qs.i_ffn_up , qs.n_ffn_up , name.c_str ());
8976+ int i_layer = info.first , n_layer = info.second ;
8977+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits (i_layer, n_layer)) {
89738978 new_type = GGML_TYPE_Q2_K;
89748979 }
89758980 ++qs.i_ffn_up ;
0 commit comments