@@ -189,8 +189,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
189189 ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
190190 bool is_one_bit = (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S);
191191 if (name.find (" attn_v.weight" ) != std::string::npos) {
192- if (qs.model .hparams .n_gqa () >= 4 || qs.model .hparams .n_expert >= 4 ) new_type = GGML_TYPE_Q4_K;
193- else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
192+ new_type = GGML_TYPE_Q4_K;
193+ // if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
194+ // else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
194195 ++qs.i_attention_wv ;
195196 }
196197 else if (qs.model .hparams .n_expert == 8 && name.find (" attn_k.weight" ) != std::string::npos) {
@@ -271,15 +272,29 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
271272 // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
272273 // }
273274 }
275+ else if (name.find (" attn_k.weight" ) != std::string::npos) {
276+ // Leave as 4bit
277+ new_type = GGML_TYPE_Q4_K;
278+ }
279+ else if (name.find (" attn_q.weight" ) != std::string::npos) {
280+ // Leave as 4bit
281+ new_type = GGML_TYPE_Q4_K;
282+ }
283+ else if (name.find (" attn_v.weight" ) != std::string::npos) {
284+ // Leave as 4bit
285+ new_type = GGML_TYPE_Q4_K;
286+ }
274287 } else if (name.find (" attn_v.weight" ) != std::string::npos) {
275288 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
276- new_type = qs.model .hparams .n_gqa () >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
289+ new_type = GGML_TYPE_Q4_K;
290+ // new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
277291 }
278292 else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model .hparams .n_gqa () >= 4 ) {
279293 new_type = GGML_TYPE_Q4_K;
280294 }
281295 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
282- new_type = qs.model .hparams .n_gqa () >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
296+ new_type = GGML_TYPE_Q4_K;
297+ // new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
283298 }
284299 else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model .hparams .n_gqa () >= 4 ) {
285300 new_type = GGML_TYPE_Q4_K;
@@ -316,17 +331,21 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
316331 new_type = GGML_TYPE_Q8_0;
317332 }
318333 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
319- new_type = GGML_TYPE_IQ3_XXS;
334+ // new_type = GGML_TYPE_IQ3_XXS;
335+ new_type = GGML_TYPE_Q4_K;
320336 }
321337 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
322- new_type = GGML_TYPE_IQ2_S;
338+ // new_type = GGML_TYPE_IQ2_S;
339+ new_type = GGML_TYPE_Q4_K;
323340 }
324341 } else if (name.find (" attn_q.weight" ) != std::string::npos) {
325342 if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
326- new_type = GGML_TYPE_IQ3_XXS;
343+ // new_type = GGML_TYPE_IQ3_XXS;
344+ new_type = GGML_TYPE_Q4_K;
327345 }
328346 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
329- new_type = GGML_TYPE_IQ2_S;
347+ // new_type = GGML_TYPE_IQ2_S;
348+ new_type = GGML_TYPE_Q4_K;
330349 }
331350 } else if (name.find (" ffn_down.weight" ) != std::string::npos) {
332351 // First 3 Layers
0 commit comments