
Commit f63860e

Put back ffn_down tree where it was before.
1 parent 8fc46df commit f63860e

File tree

1 file changed: +138 −138 lines changed


src/llama.cpp

Lines changed: 138 additions & 138 deletions
@@ -16748,6 +16748,144 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 else new_type = difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             }
             ++qs.i_attention_wq;
+    } else if (name.find("ffn_down") != std::string::npos) {
+        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+        int i_layer = info.first, n_layer = info.second;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+            else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+            else new_type = GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                           difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+                && qs.has_imatrix && i_layer < n_layer/8) {
+            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
+            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
+            else new_type = GGML_TYPE_IQ2_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else new_type = GGML_TYPE_IQ2_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = GGML_TYPE_IQ4_XS;
+        }
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
+                           difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            }
+            else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+        }
+        ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
@@ -16936,144 +17074,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         }
         ++qs.i_ffn_gate;
-    } else if (name.find("ffn_down") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
-        int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
-            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
-            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
-            else new_type = GGML_TYPE_IQ2_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
-            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
-            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
-            else new_type = GGML_TYPE_IQ2_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
-            else new_type = GGML_TYPE_IQ3_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-            else new_type = GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = GGML_TYPE_IQ4_XS;
-        }
-        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
-                           difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
-            }
-            else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-            else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-            else new_type = GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (arch == LLM_ARCH_FALCON) {
-                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
-                           difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-            } else {
-                if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
-            }
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
-                && qs.has_imatrix && i_layer < n_layer/8) {
-            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
-            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
-            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
-            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
-        }
-        ++qs.i_ffn_down;
     } else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
        int i_layer = info.first, n_layer = info.second;
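
Note on the helpers used above: the moved branch relies on layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()), which, per its usage here, returns the pair (i_layer, n_layer) for the tensor, and on a family of difquant_*_tensors(i_layer, n_layer) predicates (first_last, fl_more, three_eights, half, five_eights, six_eights) that decide whether a layer's ffn_down tensor is bumped to the higher type of each ternary. Their definitions are outside this diff. The sketch below is a hypothetical illustration of the layer-fraction idea the names suggest; the signatures are taken from the calls in the diff, but every body here is an assumption, not this repository's actual code.

#include <cstdio>
#include <utility>

// Hypothetical sketch, not the fork's real implementation: assume each
// difquant_* predicate marks roughly num/den of the layers, biased toward
// the first and last layers, which are usually the most sensitive to
// aggressive quantization.
static bool difquant_fraction_sketch(int i_layer, int n_layer, int num, int den) {
    const int budget = (n_layer * num + den - 1) / den; // ~num/den of all layers
    const int head   = (budget + 1) / 2;                // marked at the start
    const int tail   = budget - head;                   // marked at the end
    return i_layer < head || i_layer >= n_layer - tail;
}
static bool difquant_three_eights_sketch(int i_layer, int n_layer) { return difquant_fraction_sketch(i_layer, n_layer, 3, 8); }
static bool difquant_half_sketch        (int i_layer, int n_layer) { return difquant_fraction_sketch(i_layer, n_layer, 4, 8); }

// Equally hypothetical stand-in for layer_info: parse the layer index out
// of a "blk.<n>." tensor name, falling back to the running counter.
static std::pair<int, int> layer_info_sketch(int i_counter, int n_total, const char * name) {
    int i_layer = i_counter;
    sscanf(name, "blk.%d.", &i_layer); // i_layer keeps the counter if the prefix does not match
    return {i_layer, n_total};
}

Under that reading, the LLAMA_FTYPE_MOSTLY_Q2_K branch would, for a 32-layer GQA model, lift roughly 12 of the 32 ffn_down tensors (3/8, split between the head and tail of the stack) from Q2_K to Q3_K and leave the middle at the base type. The move itself also looks behavior-neutral: the added and removed blocks contain the same 138 lines, only with the K-quant ftype cases ordered before the IQ cases, and since each branch tests a different ftype, the chain order does not change which branch fires.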
