@@ -15908,7 +15908,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
+        if (qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
         else if (qs.model.hparams.n_expert >= 4) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
@@ -15972,7 +15972,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_k_type < GGML_TYPE_COUNT) new_type = qs.params->attn_k_type;
+        if (qs.params->attn_k_type < GGML_TYPE_COUNT) new_type = qs.params->attn_k_type;
         else if (qs.model.hparams.n_expert >= 4) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
@@ -16026,7 +16026,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         ++qs.i_attention_wk;
     } else if (name.find("attn_q.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_q_type < GGML_TYPE_COUNT) new_type = qs.params->attn_q_type;
+        if (qs.params->attn_q_type < GGML_TYPE_COUNT) new_type = qs.params->attn_q_type;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
@@ -16049,7 +16049,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
+        if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
@@ -16111,7 +16111,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_output_type < GGML_TYPE_COUNT) new_type = qs.params->attn_output_type;
+        if (qs.params->attn_output_type < GGML_TYPE_COUNT) new_type = qs.params->attn_output_type;
         else if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert >= 4) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -16150,7 +16150,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         ++qs.i_attention_wo;
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_qkv_type < GGML_TYPE_COUNT) new_type = qs.params->attn_qkv_type;
+        if (qs.params->attn_qkv_type < GGML_TYPE_COUNT) new_type = qs.params->attn_qkv_type;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = GGML_TYPE_Q4_K;
         }
@@ -16176,7 +16176,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_gate_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_gate_type;
+        if (qs.params->ffn_gate_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_gate_type;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
@@ -16192,7 +16192,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_up_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_up_type;
+        if (qs.params->ffn_up_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_up_type;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
@@ -16206,10 +16206,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         ++qs.i_ffn_up;
     }
     else if (name.find("attn_norm.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_norm_type < GGML_TYPE_COUNT) new_type = qs.params->attn_norm_type;
+        if (qs.params->attn_norm_type < GGML_TYPE_COUNT) new_type = qs.params->attn_norm_type;
     }
     else if (name.find("ffn_norm") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_norm_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_norm_type;
+        if (qs.params->ffn_norm_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_norm_type;
     }

     // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
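Net effect of the hunks above: each per-tensor type in qs.params (attn_v, attn_k, attn_q, ffn_down, attn_output, attn_qkv, ffn_gate, ffn_up, attn_norm, ffn_norm) now takes precedence whenever it is set to a valid ggml type, not only when the file type is LLAMA_FTYPE_CQS. Below is a minimal standalone sketch of that precedence rule; the names (ggml_type_sketch, sketch_params, pick_attn_v_type) are hypothetical stand-ins, not the real llama.cpp structures.

```cpp
#include <cstdio>

// Stand-in for ggml_type: values below TYPE_COUNT are valid quant types.
enum ggml_type_sketch { TYPE_Q4_K, TYPE_Q6_K, TYPE_Q8_0, TYPE_COUNT };

struct sketch_params {
    // TYPE_COUNT doubles as the "no override requested" sentinel,
    // mirroring how the patch compares against GGML_TYPE_COUNT.
    ggml_type_sketch attn_v_type = TYPE_COUNT;
};

// Before this change: the override was honored only under a specific ftype.
// After: any valid per-tensor override wins over the ftype-based heuristics.
static ggml_type_sketch pick_attn_v_type(const sketch_params & p, ggml_type_sketch ftype_default) {
    if (p.attn_v_type < TYPE_COUNT) return p.attn_v_type; // user override, regardless of ftype
    return ftype_default;                                 // fall through to the ftype defaults
}

int main() {
    sketch_params p;
    printf("no override   -> %d\n", pick_attn_v_type(p, TYPE_Q4_K)); // ftype default (TYPE_Q4_K)
    p.attn_v_type = TYPE_Q8_0;
    printf("with override -> %d\n", pick_attn_v_type(p, TYPE_Q4_K)); // override wins (TYPE_Q8_0)
    return 0;
}
```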