@@ -16748,6 +16748,144 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1674816748 else new_type = difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
1674916749 }
1675016750 ++qs.i_attention_wq;
16751+ } else if (name.find("ffn_down") != std::string::npos) {
16752+ auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
16753+ int i_layer = info.first, n_layer = info.second;
16754+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
16755+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16756+ new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
16757+ else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
16758+ }
16759+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
16760+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16761+ new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
16762+ else new_type = GGML_TYPE_Q3_K;
16763+ }
16764+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
16765+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16766+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
16767+ else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
16768+ }
16769+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
16770+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16771+ new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
16772+ else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
16773+ }
16774+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
16775+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16776+ new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
16777+ else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
16778+ }
16779+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
16780+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16781+ new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
16782+ else new_type = GGML_TYPE_Q4_K;
16783+ }
16784+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
16785+ if (arch == LLM_ARCH_FALCON) {
16786+ new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
16787+ difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
16788+ } else {
16789+ if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
16790+ }
16791+ }
16792+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
16793+ new_type = GGML_TYPE_Q5_K;
16794+ }
16795+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
16796+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
16797+ && qs.has_imatrix && i_layer < n_layer/8) {
16798+ // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
16799+ // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
16800+ // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
16801+ new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
16802+ }
16803+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
16804+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16805+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
16806+ else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
16807+ }
16808+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
16809+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16810+ new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
16811+ else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
16812+ }
16813+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
16814+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16815+ new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
16816+ else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
16817+ }
16818+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
16819+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16820+ new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
16821+ else new_type = GGML_TYPE_IQ2_XXS;
16822+ }
16823+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
16824+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16825+ new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
16826+ else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
16827+ }
16828+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
16829+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16830+ new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
16831+ else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
16832+ }
16833+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
16834+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16835+ new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
16836+ else new_type = GGML_TYPE_IQ2_S;
16837+ }
16838+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
16839+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16840+ new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
16841+ else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
16842+ }
16843+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
16844+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16845+ new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
16846+ else new_type = GGML_TYPE_IQ3_XXS;
16847+ }
16848+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
16849+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16850+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16851+ else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16852+ }
16853+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
16854+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16855+ new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16856+ else new_type = GGML_TYPE_IQ3_S;
16857+ }
16858+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
16859+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16860+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16861+ else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16862+ }
16863+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
16864+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16865+ new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16866+ else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16867+ }
16868+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
16869+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16870+ new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16871+ else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16872+ }
16873+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
16874+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16875+ new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16876+ else new_type = GGML_TYPE_IQ4_XS;
16877+ }
16878+ else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
16879+ new_type = GGML_TYPE_Q5_K;
16880+ }
16881+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
16882+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
16883+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
16884+ difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
16885+ }
16886+ else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
16887+ }
16888+ ++qs.i_ffn_down;
1675116889 } else if (name.find("attn_output.weight") != std::string::npos) {
1675216890 if (qs.model.hparams.n_expert >= 4) {
1675316891 if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
@@ -16936,144 +17074,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1693617074 else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
1693717075 }
1693817076 ++qs.i_ffn_gate;
16939- } else if (name.find("ffn_down") != std::string::npos) {
16940- auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
16941- int i_layer = info.first, n_layer = info.second;
16942- if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
16943- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16944- new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
16945- else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
16946- }
16947- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
16948- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16949- new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
16950- else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
16951- }
16952- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
16953- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16954- new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
16955- else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
16956- }
16957- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
16958- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16959- new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
16960- else new_type = GGML_TYPE_IQ2_XXS;
16961- }
16962- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
16963- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16964- new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
16965- else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
16966- }
16967- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
16968- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16969- new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
16970- else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
16971- }
16972- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
16973- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16974- new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
16975- else new_type = GGML_TYPE_IQ2_S;
16976- }
16977- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
16978- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16979- new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
16980- else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
16981- }
16982- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
16983- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16984- new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
16985- else new_type = GGML_TYPE_IQ3_XXS;
16986- }
16987- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
16988- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16989- new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16990- else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16991- }
16992- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
16993- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16994- new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16995- else new_type = GGML_TYPE_IQ3_S;
16996- }
16997- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
16998- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16999- new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17000- else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17001- }
17002- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
17003- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17004- new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17005- else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17006- }
17007- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
17008- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17009- new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17010- else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17011- }
17012- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
17013- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17014- new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17015- else new_type = GGML_TYPE_IQ4_XS;
17016- }
17017- else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
17018- new_type = GGML_TYPE_Q5_K;
17019- }
17020- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
17021- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
17022- new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
17023- difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
17024- }
17025- else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
17026- }
17027- else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
17028- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17029- new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
17030- else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
17031- }
17032- else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
17033- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17034- new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
17035- else new_type = GGML_TYPE_Q3_K;
17036- }
17037- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
17038- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17039- new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
17040- else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
17041- }
17042- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
17043- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17044- new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
17045- else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
17046- }
17047- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
17048- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17049- new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
17050- else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
17051- }
17052- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
17053- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17054- new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
17055- else new_type = GGML_TYPE_Q4_K;
17056- }
17057- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
17058- if (arch == LLM_ARCH_FALCON) {
17059- new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
17060- difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
17061- } else {
17062- if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
17063- }
17064- }
17065- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
17066- new_type = GGML_TYPE_Q5_K;
17067- }
17068- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
17069- else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
17070- && qs.has_imatrix && i_layer < n_layer/8) {
17071- // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
17072- // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
17073- // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
17074- new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
17075- }
17076- ++qs.i_ffn_down;
1707717077 } else if (name.find("ffn_up") != std::string::npos) {
1707817078 auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
1707917079 int i_layer = info.first, n_layer = info.second;