@@ -15908,9 +15908,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_v_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_v_type;
-        }
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
         else if (qs.model.hparams.n_expert >= 4) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
@@ -15974,9 +15972,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_k_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_k_type;
-        }
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_k_type < GGML_TYPE_COUNT) new_type = qs.params->attn_k_type;
         else if (qs.model.hparams.n_expert >= 4) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
@@ -16030,9 +16026,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         ++qs.i_attention_wk;
     } else if (name.find("attn_q.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_q_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_q_type;
-        }
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_q_type < GGML_TYPE_COUNT) new_type = qs.params->attn_q_type;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
@@ -16055,9 +16049,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_down_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->ffn_down_type;
-        }
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
@@ -16119,9 +16111,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_output_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_output_type;
-        }
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_output_type < GGML_TYPE_COUNT) new_type = qs.params->attn_output_type;
         else if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert >= 4) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -16160,9 +16150,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         ++qs.i_attention_wo;
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_qkv_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_qkv_type;
-        }
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_qkv_type < GGML_TYPE_COUNT) new_type = qs.params->attn_qkv_type;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = GGML_TYPE_Q4_K;
         }
@@ -16188,9 +16176,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_gate_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->ffn_gate_type;
-        }
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_gate_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_gate_type;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
@@ -16206,9 +16192,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_up_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->ffn_up_type;
-        }
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_up_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_up_type;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
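
Every hunk above applies the same one-line pattern: when the file type is LLAMA_FTYPE_CQS (the custom quantization scheme) and the user supplied a per-tensor type strictly below GGML_TYPE_COUNT, that type overrides the heuristic defaults that follow in the chain. The sketch below illustrates only that predicate in isolation; it is not part of the diff, the enum values are stand-ins rather than the real ggml.h/llama.h definitions, and the cqs_override helper name is hypothetical.

// Minimal, self-contained sketch of the override rule used in each branch above.
// The enums are stand-ins; the real definitions live in ggml.h and this fork's llama.h.
#include <cstdio>

enum ggml_type   { GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_COUNT };   // stand-in values
enum llama_ftype { LLAMA_FTYPE_MOSTLY_Q4_K_M, LLAMA_FTYPE_CQS };        // stand-in values

// A per-tensor type below GGML_TYPE_COUNT counts as "explicitly requested" and wins;
// otherwise the caller falls through to the usual ftype-based heuristics.
static bool cqs_override(llama_ftype ftype, ggml_type requested, ggml_type & new_type) {
    if (ftype == LLAMA_FTYPE_CQS && requested < GGML_TYPE_COUNT) {
        new_type = requested;
        return true;
    }
    return false;
}

int main() {
    ggml_type new_type = GGML_TYPE_Q4_K;                     // heuristic default
    if (cqs_override(LLAMA_FTYPE_CQS, GGML_TYPE_Q8_0, new_type)) {
        std::printf("attn_v.weight -> type %d\n", (int) new_type);
    }
    return 0;
}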