@@ -4478,6 +4478,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
+        case LLAMA_FTYPE_CQS: return "Custom Quantization Scheme";
 
         default: return "unknown, may not work";
     }
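For context: the hunks below read per-tensor overrides through qs.params->attn_v_type, qs.params->attn_k_type, and so on, so the patch also implies additions to llama.h. A minimal sketch of what those declarations could look like follows; only the identifier names are taken from this diff, while the enum value, field order, and placement are assumptions.

// Sketch only, not part of this diff: assumed llama.h additions.
enum llama_ftype {
    /* ... existing LLAMA_FTYPE_MOSTLY_* entries elided ... */
    LLAMA_FTYPE_CQS = 99, // "Custom Quantization Scheme" (assumed numeric value)
};

struct llama_model_quantize_params {
    /* ... existing fields (nthread, ftype, imatrix, ...) elided ... */

    // Per-tensor-class overrides; GGML_TYPE_COUNT means "no override, keep the
    // default heuristics", matching the `< GGML_TYPE_COUNT` guards below.
    enum ggml_type attn_q_type;
    enum ggml_type attn_k_type;
    enum ggml_type attn_v_type;
    enum ggml_type attn_qkv_type;
    enum ggml_type attn_output_type;
    enum ggml_type ffn_gate_type;
    enum ggml_type ffn_down_type;
    enum ggml_type ffn_up_type;
};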
@@ -15381,7 +15382,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_v_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->attn_v_type;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
@@ -15419,7 +15423,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_k_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->attn_k_type;
+        }
+        else if (qs.model.hparams.n_expert == 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -15431,6 +15438,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("attn_q.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_q_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->attn_q_type;
+        }
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
@@ -15440,7 +15450,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_down_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->ffn_down_type;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
@@ -15483,7 +15496,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (arch != LLM_ARCH_FALCON) {
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_output_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->attn_output_type;
+        }
+        else if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
@@ -15503,6 +15519,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_qkv_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->attn_qkv_type;
+        }
         if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
             new_type = GGML_TYPE_Q4_K;
         }
@@ -15512,15 +15531,21 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_gate_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->ffn_gate_type;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+        if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_up_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->ffn_up_type;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_up;
@@ -15671,6 +15696,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
 
+        // Custom Quantization Scheme
+        case LLAMA_FTYPE_CQS: default_type = GGML_TYPE_Q2_K; break;
+
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
 
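A hedged usage sketch of the quantization call, assuming the fields sketched earlier default to GGML_TYPE_COUNT in llama_model_quantize_default_params(); llama_model_quantize() and llama_model_quantize_default_params() are the existing llama.h entry points, and the file names are hypothetical.

// Sketch: quantize a model with a custom per-tensor scheme (CQS).
#include "llama.h"

int main(void) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype         = LLAMA_FTYPE_CQS;  // enables the per-tensor overrides
    params.attn_v_type   = GGML_TYPE_Q6_K;   // keep attention V in higher precision
    params.attn_k_type   = GGML_TYPE_Q4_K;
    params.ffn_down_type = GGML_TYPE_Q5_K;
    // Tensors without an explicit override fall through to the remaining
    // heuristics, with GGML_TYPE_Q2_K as the default type for LLAMA_FTYPE_CQS.

    return llama_model_quantize("model-f16.gguf", "model-cqs.gguf", &params);
}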