@@ -29,19 +29,19 @@ struct quantize_state_impl {
     int n_ffn_down = 0;
     int n_ffn_gate = 0;
     int n_ffn_up = 0;
-    int n_ffn_down_exp = 0;
-    int n_ffn_gate_exp = 0;
-    int n_ffn_up_exp = 0;
+    int n_ffn_down_exps = 0;
+    int n_ffn_gate_exps = 0;
+    int n_ffn_up_exps = 0;
     int n_ffn_down_shexp = 0;
     int n_ffn_gate_shexp = 0;
     int n_ffn_up_shexp = 0;
     int i_attention_wv = 0;
     int i_ffn_down = 0;
     int i_ffn_gate = 0;
     int i_ffn_up = 0;
-    int i_ffn_down_exp = 0;
-    int i_ffn_gate_exp = 0;
-    int i_ffn_up_exp = 0;
+    int i_ffn_down_exps = 0;
+    int i_ffn_gate_exps = 0;
+    int i_ffn_up_exps = 0;
     int i_ffn_down_shexp = 0;
     int i_ffn_gate_shexp = 0;
     int i_ffn_up_shexp = 0;
@@ -138,21 +138,54 @@ static void llama_tensor_dequantize_impl(
     workers.clear();
 }
 
-// Check if ftype is specifically IQ2_S or IQ2_M
-static inline bool is_iq2s_or_iq2m(llama_ftype ftype) {
-    return ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M;
+
+// Returns the appropriate type for expert _exps tensors based on ftype
+static inline ggml_type get_exps_type_low_bpw_bump(llama_ftype ftype, ggml_type new_type) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_M;
+    return new_type;
 }
 
-// Check if ftype belongs to the IQ1 group
-static inline bool is_iq1_group(llama_ftype ftype) {
-    return ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M;
+static inline ggml_type get_exps_type_low_bpw_squash(llama_ftype ftype, ggml_type new_type) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ2_XS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ1_M;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ1_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_S;
+    return new_type;
 }
 
-// Returns the appropriate type for expert _exps tensors based on ftype
-static inline ggml_type get_expert_exps_type(llama_ftype ftype) {
-    if (is_iq1_group(ftype)) return GGML_TYPE_IQ2_XXS;
-    if (is_iq2s_or_iq2m(ftype)) return GGML_TYPE_IQ3_XXS;
-    /* otherwise */ return GGML_TYPE_IQ2_XS;
+static inline ggml_type get_exps_type_high_bpw_bump(llama_ftype ftype, ggml_type new_type, bool has_imatrix) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q5_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q6_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    // Bump I-quants
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q4_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !has_imatrix) new_type = GGML_TYPE_Q5_K;
+
+    return new_type;
+}
+
+static inline ggml_type get_exps_type_high_bpw_squash(llama_ftype ftype, ggml_type new_type, bool has_imatrix) {
+    // Squash K-quants
+    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q2_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q3_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    // Squash I-quants
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+        new_type = has_imatrix ? GGML_TYPE_IQ2_S : GGML_TYPE_Q2_K;
+    }
+    return new_type;
 }
 
 static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
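
Taken together, the four helpers split each expert (*_exps) quant choice into a "bump" type (more bits) and a "squash" type (fewer bits); the code below applies the bump to the first and last eighth of those tensors and the squash to the middle. A minimal sketch of that selection, assuming the helpers above are in scope (`pick_exps_type_low_bpw` and its `i_tensor`/`n_tensor` parameters are hypothetical names, not part of the patch; they mirror the `qs.i_*`/`qs.n_*` counters used below):

    // Hypothetical wrapper showing how the low-bpw helpers are combined per tensor.
    static ggml_type pick_exps_type_low_bpw(llama_ftype ftype, ggml_type new_type, int i_tensor, int n_tensor) {
        if (i_tensor < n_tensor/8 || i_tensor > 7*n_tensor/8) {
            return get_exps_type_low_bpw_bump(ftype, new_type);   // outer eighths: more bits
        }
        return get_exps_type_low_bpw_squash(ftype, new_type);     // middle: fewer bits
    }
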
@@ -211,7 +244,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
             new_type = GGML_TYPE_Q2_K;
         }
-        else if (is_iq2s_or_iq2m(ftype)) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
@@ -225,7 +258,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-            else new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            else new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) {
@@ -239,22 +272,22 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_attention_wv;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) {
-            new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down.weight") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/16) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             else if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
@@ -263,7 +296,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (qs.i_ffn_gate < qs.n_ffn_gate/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_gate;
         }
@@ -272,58 +305,64 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (qs.i_ffn_up < qs.n_ffn_up/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_up;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
-            if (qs.i_ffn_down_exp < qs.n_ffn_down_exp/8) {
-                new_type = get_expert_exps_type(ftype);
+            if (qs.i_ffn_down_exps < qs.n_ffn_down_exps/8 || qs.i_ffn_down_exps > 7*qs.n_ffn_down_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
             }
-            ++qs.i_ffn_down_exp;
+            ++qs.i_ffn_down_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_exps.weight") != std::string::npos) {
-            if (qs.i_ffn_gate_exp < qs.n_ffn_gate_exp/8) {
-                new_type = get_expert_exps_type(ftype);
+            if (qs.i_ffn_gate_exps < qs.n_ffn_gate_exps/8 || qs.i_ffn_gate_exps > 7*qs.n_ffn_gate_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
             }
-            ++qs.i_ffn_gate_exp;
+            ++qs.i_ffn_gate_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_exps.weight") != std::string::npos) {
-            if (qs.i_ffn_up_exp < qs.n_ffn_up_exp/8) {
-                new_type = get_expert_exps_type(ftype);
+            if (qs.i_ffn_up_exps < qs.n_ffn_up_exps/8 || qs.i_ffn_up_exps > 7*qs.n_ffn_up_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
             }
-            ++qs.i_ffn_up_exp;
+            ++qs.i_ffn_up_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             ++qs.i_ffn_down_shexp;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             ++qs.i_ffn_gate_shexp;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             ++qs.i_ffn_up_shexp;
         }
         else if (name.find("ffn_down") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
             if (qs.model.hparams.n_expert >= 8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (is_iq1_group(ftype)) new_type = GGML_TYPE_IQ2_XXS;
-                else if (is_iq2s_or_iq2m(ftype)) new_type = GGML_TYPE_IQ3_S;
+                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
             }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
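
To make the bump window concrete with a hypothetical count: if a model has n_ffn_down_exps = 58 such tensors, integer division gives 58/8 = 7 and 7*58/8 = 50, so tensors 0-6 and 51-57 take the bump type while tensors 7-50 take the squash type, i.e. 14 of the 58 tensors get more bits.
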
@@ -365,7 +404,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs.model.hparams.n_expert >= 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -385,28 +424,46 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
+        // if (qs.i_ffn_down_shexp < qs.n_ffn_down_shexp/8 || qs.i_ffn_down_shexp > 7*qs.n_ffn_down_shexp/8) {
         if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
         ++qs.i_ffn_down_shexp;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
+        // if (qs.i_ffn_gate_shexp < qs.n_ffn_gate_shexp/8 || qs.i_ffn_gate_shexp > 7*qs.n_ffn_gate_shexp/8) {
         if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
         ++qs.i_ffn_gate_shexp;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
+        // if (qs.i_ffn_up_shexp < qs.n_ffn_up_shexp/8 || qs.i_ffn_up_shexp > 7*qs.n_ffn_up_shexp/8) {
         if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
         ++qs.i_ffn_up_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
+        if (use_more_bits(qs.i_ffn_down_exps, qs.n_ffn_down_exps)) {
+            if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) && qs.has_imatrix) {
+                // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+                // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+                // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+            } else {
+                new_type = get_exps_type_high_bpw_bump(ftype, new_type, qs.has_imatrix);
+            }
+        }
+        ++qs.i_ffn_down_exps;
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
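
Unlike the low-bpw branch, this high-bpw path keys off use_more_bits rather than an explicit first/last-eighth window. For reference, upstream llama.cpp defines that helper roughly as below (quoted from memory of llama-quant.cpp, so verify the exact modulus against the tree before relying on it): it selects the first and last eighth of the layers plus every third layer in between.

    // Approximate upstream definition of use_more_bits
    static bool use_more_bits(int i_layer, int n_layers) {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
    }
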
@@ -496,21 +553,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         ++qs.i_ffn_up;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q8_0;
-    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) {
-        new_type = GGML_TYPE_Q4_K;
-        if (qs.i_attention_wv < qs.n_attention_wv/16) {
-            new_type = GGML_TYPE_Q8_0;
-        } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
-            new_type = GGML_TYPE_Q6_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-        ++qs.i_attention_wv;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k_b.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_v_b.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K;
+        new_type = GGML_TYPE_Q4_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
@@ -782,15 +835,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         } else if (name.find("ffn_gate_exps.weight") != std::string::npos) {
-            ++qs.n_ffn_gate_exp;
+            ++qs.n_ffn_gate_exps;
         } else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {
             ++qs.n_ffn_gate_shexp;
         } else if (name.find("ffn_down_exps.weight") != std::string::npos) {
-            ++qs.n_ffn_down_exp;
+            ++qs.n_ffn_down_exps;
         } else if (name.find("ffn_down_shexp.weight") != std::string::npos) {
             ++qs.n_ffn_down_shexp;
         } else if (name.find("ffn_up_exps.weight") != std::string::npos) {
-            ++qs.n_ffn_up_exp;
+            ++qs.n_ffn_up_exps;
         } else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
             ++qs.n_ffn_up_shexp;
         }
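
The renamed n_*_exps counters are filled in this first pass over the tensor names; llama_tensor_get_type then advances the matching i_*_exps counters as tensors are visited in the same order, so i/n measures a tensor's relative depth in the model. A condensed, self-contained sketch of that two-pass pattern (a hypothetical illustration, not code from the patch):

    // Two-pass counter pattern: count per tensor kind, then classify in order.
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> names = { /* tensor names in model order */ };
        int n_ffn_down_exps = 0, i_ffn_down_exps = 0;
        for (const auto & name : names)        // pass 1: totals per kind
            if (name.find("ffn_down_exps.weight") != std::string::npos) ++n_ffn_down_exps;
        for (const auto & name : names) {      // pass 2: per-tensor decision
            if (name.find("ffn_down_exps.weight") != std::string::npos) {
                bool bump = i_ffn_down_exps < n_ffn_down_exps/8 ||
                            i_ffn_down_exps > 7*n_ffn_down_exps/8;
                (void)bump;                    // the real code picks bump vs. squash here
                ++i_ffn_down_exps;
            }
        }
    }
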