@@ -192,17 +192,47 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
192192 else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
193193 ++qs.i_attention_wv ;
194194 }
195- else if (qs.model .hparams .n_expert == 8 && name.find (" attn_k.weight " ) != std::string::npos) {
195+ else if (qs.model .hparams .n_expert >= 8 && name.find (" attn_k" ) != std::string::npos) {
196196 new_type = GGML_TYPE_Q4_K;
197197 }
198+ else if (qs.model .hparams .n_expert >= 8 && name.find (" attn_q" ) != std::string::npos) {
199+ new_type = GGML_TYPE_Q4_K;
200+ }
201+ else if (qs.model .hparams .n_expert >= 8 && name.find (" ffn_down" ) != std::string::npos) {
202+ if (qs.i_ffn_down < qs.n_ffn_down /16 ) {
203+ new_type = GGML_TYPE_Q4_K;
204+ }
205+ else if (qs.i_ffn_down < qs.n_ffn_down /8 ) {
206+ new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
207+ }
208+ ++qs.i_ffn_down ;
209+ }
210+ else if (qs.model .hparams .n_expert >= 8 && name.find (" ffn_gate" ) != std::string::npos) {
211+ if (qs.i_ffn_gate < qs.n_ffn_gate /16 ) {
212+ new_type = GGML_TYPE_Q4_K;
213+ }
214+ else if (qs.i_ffn_gate < qs.n_ffn_gate /8 || qs.i_ffn_gate >= 7 *qs.n_ffn_gate /8 ) {
215+ new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
216+ }
217+ ++qs.i_ffn_gate ;
218+ }
219+ else if (qs.model .hparams .n_expert >= 8 && name.find (" ffn_up" ) != std::string::npos) {
220+ if (qs.i_ffn_up < qs.n_ffn_up /16 ) {
221+ new_type = GGML_TYPE_Q4_K;
222+ }
223+ else if (qs.i_ffn_up < qs.n_ffn_up /8 ) {
224+ new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
225+ }
226+ ++qs.i_ffn_up ;
227+ }
198228 else if (name.find (" ffn_down" ) != std::string::npos) {
199229 if (qs.i_ffn_down < qs.n_ffn_down /8 ) {
200230 new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
201231 }
202232 ++qs.i_ffn_down ;
203233 }
204234 else if (name.find (" attn_output.weight" ) != std::string::npos) {
205- if (qs.model .hparams .n_expert = = 8 ) {
235+ if (qs.model .hparams .n_expert > = 8 ) {
206236 new_type = GGML_TYPE_Q5_K;
207237 } else {
208238 if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
@@ -313,7 +343,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
313343 ++qs.i_ffn_down ;
314344 } else if (name.find (" attn_output.weight" ) != std::string::npos) {
315345 if (arch != LLM_ARCH_FALCON) {
316- if (qs.model .hparams .n_expert = = 8 ) {
346+ if (qs.model .hparams .n_expert > = 8 ) {
317347 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
318348 ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
319349 ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
@@ -353,6 +383,38 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
353383 new_type = GGML_TYPE_IQ3_XXS;
354384 }
355385 ++qs.i_ffn_up ;
386+ } else if (name.find (" attn_kv_a_mqa" ) != std::string::npos) {
387+ if (qs.model .hparams .n_expert >= 8 ) {
388+ new_type = GGML_TYPE_Q8_0;
389+ }
390+ } else if (name.find (" attn_kv_b.weight" ) != std::string::npos) {
391+ if (qs.model .hparams .n_expert >= 8 ) {
392+ new_type = GGML_TYPE_Q4_K;
393+ if (qs.i_attention_wv < qs.n_attention_wv /16 ) {
394+ new_type = GGML_TYPE_Q8_0;
395+ } else if (use_more_bits (qs.i_attention_wv , qs.n_attention_wv )) {
396+ new_type = GGML_TYPE_Q6_K;
397+ }
398+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
399+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
400+ }
401+ ++qs.i_attention_wv ;
402+ } else if (name.find (" attn_q_b.weight" ) != std::string::npos) {
403+ if (qs.model .hparams .n_expert >= 8 ) {
404+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
405+ new_type = GGML_TYPE_Q4_K;
406+ }
407+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
408+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
409+ }
410+ } else if (name.find (" attn_q_a.weight" ) != std::string::npos) {
411+ if (qs.model .hparams .n_expert >= 8 ) {
412+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
413+ new_type = GGML_TYPE_Q4_K;
414+ }
415+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
416+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
417+ }
356418 }
357419
358420 // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
0 commit comments