Commit 3665704

Update some of the weightings, remove some complication
1 parent ce2c4c7 commit 3665704

File tree

1 file changed: +112 -59 lines changed

src/llama-quant.cpp

Lines changed: 112 additions & 59 deletions
@@ -29,19 +29,19 @@ struct quantize_state_impl {
     int n_ffn_down = 0;
     int n_ffn_gate = 0;
     int n_ffn_up = 0;
-    int n_ffn_down_exp = 0;
-    int n_ffn_gate_exp = 0;
-    int n_ffn_up_exp = 0;
+    int n_ffn_down_exps = 0;
+    int n_ffn_gate_exps = 0;
+    int n_ffn_up_exps = 0;
     int n_ffn_down_shexp = 0;
     int n_ffn_gate_shexp = 0;
     int n_ffn_up_shexp = 0;
     int i_attention_wv = 0;
     int i_ffn_down = 0;
     int i_ffn_gate = 0;
     int i_ffn_up = 0;
-    int i_ffn_down_exp = 0;
-    int i_ffn_gate_exp = 0;
-    int i_ffn_up_exp = 0;
+    int i_ffn_down_exps = 0;
+    int i_ffn_gate_exps = 0;
+    int i_ffn_up_exps = 0;
     int i_ffn_down_shexp = 0;
     int i_ffn_gate_shexp = 0;
     int i_ffn_up_shexp = 0;
@@ -138,21 +138,54 @@ static void llama_tensor_dequantize_impl(
     workers.clear();
 }
 
-// Check if ftype is specifically IQ2_S or IQ2_M
-static inline bool is_iq2s_or_iq2m(llama_ftype ftype) {
-    return ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M;
+
+// Returns the appropriate type for expert _exps tensors based on ftype
+static inline ggml_type get_exps_type_low_bpw_bump(llama_ftype ftype, ggml_type new_type) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_M;
+    return new_type;
 }
 
-// Check if ftype belongs to the IQ1 group
-static inline bool is_iq1_group(llama_ftype ftype) {
-    return ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M;
+static inline ggml_type get_exps_type_low_bpw_squash(llama_ftype ftype, ggml_type new_type) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ2_XS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ1_M;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ1_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_S;
+    return new_type;
 }
 
-// Returns the appropriate type for expert _exps tensors based on ftype
-static inline ggml_type get_expert_exps_type(llama_ftype ftype) {
-    if (is_iq1_group(ftype)) return GGML_TYPE_IQ2_XXS;
-    if (is_iq2s_or_iq2m(ftype)) return GGML_TYPE_IQ3_XXS;
-    /* otherwise */ return GGML_TYPE_IQ2_XS;
+static inline ggml_type get_exps_type_high_bpw_bump(llama_ftype ftype, ggml_type new_type, bool has_imatrix) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q5_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q6_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    // Bump I-quants
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q4_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !has_imatrix) new_type = GGML_TYPE_Q5_K;
+
+    return new_type;
+}
+
+static inline ggml_type get_exps_type_high_bpw_squash(llama_ftype ftype, ggml_type new_type, bool has_imatrix) {
+    // Squash K-quants
+    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q2_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q3_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    // Squash I-quants
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+        new_type = has_imatrix ? GGML_TYPE_IQ2_S : GGML_TYPE_Q2_K;
+    }
+    return new_type;
 }
 
 static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
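
Read together, the two low-bpw helpers form a per-ftype ladder: tensors selected for a bump move one quant type up, the rest move one down. Below is a minimal standalone sketch of the same table-lookup pattern; the FT_*/GT_* enums are hypothetical stand-ins for llama_ftype/ggml_type, not the real llama.h/ggml.h types.

    // sketch_bump.cpp - illustrative only; FT_*/GT_* are hypothetical stand-ins
    // mirroring the mapping in the two low-bpw helpers above.
    #include <cstdio>

    enum FT { FT_IQ1_S, FT_IQ1_M, FT_IQ2_XXS, FT_IQ2_XS, FT_IQ2_S, FT_IQ2_M };
    enum GT { GT_IQ1_S, GT_IQ1_M, GT_IQ2_XXS, GT_IQ2_XS, GT_IQ2_S, GT_IQ3_XXS };

    // One step up the ladder, as in get_exps_type_low_bpw_bump.
    static GT bump(FT ft, GT cur) {
        switch (ft) {
            case FT_IQ2_M:   return GT_IQ3_XXS;
            case FT_IQ2_S:
            case FT_IQ2_XS:  return GT_IQ2_S;
            case FT_IQ2_XXS: return GT_IQ2_XS;
            case FT_IQ1_M:   return GT_IQ2_XXS;
            case FT_IQ1_S:   return GT_IQ1_M;
        }
        return cur;
    }

    // One step down the ladder, as in get_exps_type_low_bpw_squash.
    static GT squash(FT ft, GT cur) {
        switch (ft) {
            case FT_IQ2_M:   return GT_IQ2_XS;
            case FT_IQ2_S:
            case FT_IQ2_XS:  return GT_IQ2_XXS;
            case FT_IQ2_XXS: return GT_IQ1_M;
            case FT_IQ1_M:
            case FT_IQ1_S:   return GT_IQ1_S;
        }
        return cur;
    }

    int main() {
        printf("IQ2_M: bump -> %d, squash -> %d\n",
               (int) bump(FT_IQ2_M, GT_IQ2_S), (int) squash(FT_IQ2_M, GT_IQ2_S));
        return 0;
    }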
@@ -211,7 +244,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         new_type = GGML_TYPE_Q2_K;
     }
-    else if (is_iq2s_or_iq2m(ftype)) {
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
         new_type = GGML_TYPE_IQ3_S;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
@@ -225,7 +258,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-            else new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            else new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) {
@@ -239,22 +272,22 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_attention_wv;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) {
-            new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down.weight") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/16) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             else if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
@@ -263,7 +296,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (qs.i_ffn_gate < qs.n_ffn_gate/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_gate;
         }
@@ -272,58 +305,64 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (qs.i_ffn_up < qs.n_ffn_up/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_up;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
-            if (qs.i_ffn_down_exp < qs.n_ffn_down_exp/8) {
-                new_type = get_expert_exps_type(ftype);
+            if (qs.i_ffn_down_exps < qs.n_ffn_down_exps/8 || qs.i_ffn_down_exps > 7*qs.n_ffn_down_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
             }
-            ++qs.i_ffn_down_exp;
+            ++qs.i_ffn_down_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_exps.weight") != std::string::npos) {
-            if (qs.i_ffn_gate_exp < qs.n_ffn_gate_exp/8) {
-                new_type = get_expert_exps_type(ftype);
+            if (qs.i_ffn_gate_exps < qs.n_ffn_gate_exps/8 || qs.i_ffn_gate_exps > 7*qs.n_ffn_gate_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
             }
-            ++qs.i_ffn_gate_exp;
+            ++qs.i_ffn_gate_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_exps.weight") != std::string::npos) {
-            if (qs.i_ffn_up_exp < qs.n_ffn_up_exp/8) {
-                new_type = get_expert_exps_type(ftype);
+            if (qs.i_ffn_up_exps < qs.n_ffn_up_exps/8 || qs.i_ffn_up_exps > 7*qs.n_ffn_up_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
             }
-            ++qs.i_ffn_up_exp;
+            ++qs.i_ffn_up_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             ++qs.i_ffn_down_shexp;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             ++qs.i_ffn_gate_shexp;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             ++qs.i_ffn_up_shexp;
         }
         else if (name.find("ffn_down") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
             if (qs.model.hparams.n_expert >= 8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (is_iq1_group(ftype)) new_type = GGML_TYPE_IQ2_XXS;
-                else if (is_iq2s_or_iq2m(ftype)) new_type = GGML_TYPE_IQ3_S;
+                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
             }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
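
The new gating condition routes the first and last eighth of each *_exps tensor sequence to the bump helper, and the middle three quarters to the squash helper. A quick standalone check of how that band split falls out; n = 61 is an assumed tensor count for illustration, not a value taken from this commit:

    // band_split.cpp - shows which indices the "i < n/8 || i > 7*n/8" test bumps.
    #include <cstdio>

    int main() {
        const int n = 61; // hypothetical count of ffn_*_exps tensors
        int bumped = 0;
        for (int i = 0; i < n; ++i) {
            if (i < n/8 || i > 7*n/8) {   // integer division, exactly as in the patch
                printf("tensor %2d: bump\n", i);
                ++bumped;
            }
        }
        printf("%d of %d tensors bumped, %d squashed\n", bumped, n, n - bumped);
        return 0;
    }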
@@ -365,7 +404,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs.model.hparams.n_expert >= 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -385,28 +424,46 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
+        //if (qs.i_ffn_down_shexp < qs.n_ffn_down_shexp/8 || qs.i_ffn_down_shexp > 7*qs.n_ffn_down_shexp/8) {
         if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
         ++qs.i_ffn_down_shexp;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
+        //if (qs.i_ffn_gate_shexp < qs.n_ffn_gate_shexp/8 || qs.i_ffn_gate_shexp > 7*qs.n_ffn_gate_shexp/8) {
         if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
         ++qs.i_ffn_gate_shexp;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
+        //if (qs.i_ffn_up_shexp < qs.n_ffn_up_shexp/8 || qs.i_ffn_up_shexp > 7*qs.n_ffn_up_shexp/8) {
         if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
         ++qs.i_ffn_up_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
+        if (use_more_bits(qs.i_ffn_down_exps, qs.n_ffn_down_exps)) {
+            if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) && qs.has_imatrix) {
+                // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+                // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+                // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+            } else {
+                new_type = get_exps_type_high_bpw_bump(ftype, new_type, qs.has_imatrix);
+            }
+        }
+        ++qs.i_ffn_down_exps;
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
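
The shexp branches and the new high-bpw ffn_down_exps branch both key off use_more_bits, a pre-existing helper in llama-quant.cpp. For reference, its long-standing definition is roughly the following (worth verifying against the tree at this commit); it selects the first eighth, the last eighth, and every third layer in between:

    // Pre-existing llama.cpp helper, reproduced here for reference only.
    static bool use_more_bits(int i_layer, int n_layers) {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
    }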
@@ -496,21 +553,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         ++qs.i_ffn_up;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q8_0;
-    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) {
-        new_type = GGML_TYPE_Q4_K;
-        if (qs.i_attention_wv < qs.n_attention_wv/16) {
-            new_type = GGML_TYPE_Q8_0;
-        } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
-            new_type = GGML_TYPE_Q6_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-        ++qs.i_attention_wv;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k_b.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_v_b.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
     } else if (qs.model.hparams.n_expert >= 8 &&name.find("attn_q_b.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K;
+        new_type = GGML_TYPE_Q4_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
@@ -782,15 +835,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         } else if (name.find("ffn_gate_exps.weight") != std::string::npos) {
-            ++qs.n_ffn_gate_exp;
+            ++qs.n_ffn_gate_exps;
         } else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {
             ++qs.n_ffn_gate_shexp;
         } else if (name.find("ffn_down_exps.weight") != std::string::npos) {
-            ++qs.n_ffn_down_exp;
+            ++qs.n_ffn_down_exps;
         } else if (name.find("ffn_down_shexp.weight") != std::string::npos) {
             ++qs.n_ffn_down_shexp;
         } else if (name.find("ffn_up_exps.weight") != std::string::npos) {
-            ++qs.n_ffn_up_exp;
+            ++qs.n_ffn_up_exps;
         } else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
             ++qs.n_ffn_up_shexp;
         }
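
These n_* counters are filled in a first pass over the tensor list so that the per-tensor logic above can position each *_exps tensor within its sequence. A minimal sketch of that count-then-index pattern; the blk.N.* tensor names are hypothetical examples for illustration:

    // count_then_index.cpp - minimal sketch of the two-pass pattern used above:
    // first count matching tensors, then drive a running index against that count.
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> names = { // hypothetical tensor names
            "blk.0.ffn_down_exps.weight", "blk.1.ffn_down_exps.weight",
            "blk.2.ffn_down_exps.weight", "blk.0.attn_v.weight",
        };

        int n_ffn_down_exps = 0;                              // pass 1: count
        for (const auto & name : names)
            if (name.find("ffn_down_exps.weight") != std::string::npos) ++n_ffn_down_exps;

        int i_ffn_down_exps = 0;                              // pass 2: index
        for (const auto & name : names) {
            if (name.find("ffn_down_exps.weight") != std::string::npos) {
                const bool bump = i_ffn_down_exps < n_ffn_down_exps/8 ||
                                  i_ffn_down_exps > 7*n_ffn_down_exps/8;
                printf("%s -> %s\n", name.c_str(), bump ? "bump" : "squash");
                ++i_ffn_down_exps;
            }
        }
        return 0;
    }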
