@@ -15381,179 +15381,147 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (qs.params->attn_v_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_v_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
-                new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-                new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-            }
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
-                new_type = GGML_TYPE_Q5_K;
-            }
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                    use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-            if (qs.model.type == MODEL_70B) {
-                // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                // nearly negligible increase in model size by quantizing this tensor with more bits:
-                if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-            }
-            if (qs.model.hparams.n_expert == 8) {
-                // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
-                // TODO: explore better strategies
-                new_type = GGML_TYPE_Q8_0;
-            }
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+        }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        if (qs.model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        if (qs.model.hparams.n_expert == 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.params->attn_k_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_k_type;
-        } else {
-            if (qs.model.hparams.n_expert == 8) {
-                // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
-                // TODO: explore better strategies
-                new_type = GGML_TYPE_Q8_0;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-                new_type = GGML_TYPE_IQ3_XXS;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-                new_type = GGML_TYPE_IQ2_S;
-            }
+        if (qs.model.hparams.n_expert == 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("attn_q.weight") != std::string::npos) {
-        if (qs.params->attn_q_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_q_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-                new_type = GGML_TYPE_IQ3_XXS;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-                new_type = GGML_TYPE_IQ2_S;
-            }
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (qs.params->ffn_down_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->ffn_down_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-                if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
-                new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
-                         : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
-                         : GGML_TYPE_Q3_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
-                        (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                if (arch == LLM_ARCH_FALCON) {
-                    new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
-                               use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                } else {
-                    if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
-                }
-            }
-            else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
-                new_type = GGML_TYPE_Q5_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
-                new_type = GGML_TYPE_Q5_K;
-            }
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
-                    && qs.has_imatrix && i_layer < n_layer/8) {
-                // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
-                // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
-                // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
+            new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
+                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+                && qs.has_imatrix && i_layer < n_layer/8) {
+            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+        }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (qs.params->attn_output_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_output_type;
-        } else {
-            if (arch != LLM_ARCH_FALCON) {
-                if (qs.model.hparams.n_expert == 8) {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                        ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
-                        ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                        ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
-                        new_type = GGML_TYPE_Q5_K;
-                    }
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
+        if (arch != LLM_ARCH_FALCON) {
+            if (qs.model.hparams.n_expert == 8) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
+                    new_type = GGML_TYPE_Q5_K;
                 }
             } else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
             }
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (qs.params->attn_qkv_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->attn_qkv_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (qs.params->ffn_gate_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->ffn_gate_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
-                new_type = GGML_TYPE_IQ3_XXS;
-            }
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (qs.params->ffn_up_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->ffn_up_type;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
-                new_type = GGML_TYPE_IQ3_XXS;
-            }
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_up;
     }
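
Note for readers skimming the change: every removed hunk above is the same wrapper, reverted tensor by tensor. A per-tensor type from the quantize params was honoured only when it named a real ggml type, with GGML_TYPE_COUNT (the end marker of enum ggml_type) doubling as "no override"; otherwise the stock ftype heuristics ran. A minimal sketch of that guard, with a hypothetical helper name used only for illustration:

    // Illustrative sketch of the guard this commit removes (pick_type is not a
    // function in the tree). An override wins only when it is a real ggml type;
    // GGML_TYPE_COUNT means "no per-tensor override", so the ftype-based
    // heuristic result is kept instead.
    static ggml_type pick_type(ggml_type override_type, ggml_type heuristic_type) {
        return override_type < GGML_TYPE_COUNT ? override_type : heuristic_type;
    }

After the revert, only the heuristic path remains.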
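
Several of the surviving attn_v rules branch on qs.model.hparams.n_gqa() >= 4. Assuming the usual llama.cpp definition of that helper (attention heads divided by KV heads), a quick worked example of the Q2_K rule:

    #include <cstdint>

    // Hedged sketch: n_gqa() is assumed to be n_head / n_head_kv, as in the
    // hparams helper this code calls.
    // LLaMA-2 70B: n_head = 64, n_head_kv = 8  -> ratio 8 >= 4 -> attn_v becomes GGML_TYPE_Q4_K
    // LLaMA-2 7B:  n_head = 32, n_head_kv = 32 -> ratio 1 <  4 -> attn_v stays GGML_TYPE_Q3_K
    static uint32_t gqa_ratio(uint32_t n_head, uint32_t n_head_kv) {
        return n_head / n_head_kv;
    }

This is also the point of the MODEL_70B comment about 8 heads sharing the same attn_v weights: the tensor is roughly n_gqa() times smaller than attn_q.weight, so spending extra bits on it is cheap.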
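
The ffn_down guard for Q4_0/Q5_0 with an imatrix is easy to misread in diff form, so here it is condensed into a standalone sketch (the helper name and free-standing form are illustrative only; the real logic lives inline in llama_tensor_get_type):

    // Hedged sketch of the early-layer ffn_down guard shown above. With an
    // imatrix present, the first eighth of the layers is bumped from Q4_0/Q5_0
    // to Q4_1/Q5_1, which additionally store a per-block minimum.
    // Example: n_layer == 32 -> layers 0..3 get Q4_1 (or Q5_1), the rest keep Q4_0 (or Q5_0).
    static ggml_type ffn_down_guard(llama_ftype ftype, bool has_imatrix, int i_layer, int n_layer, ggml_type current) {
        const bool early = i_layer < n_layer/8;
        if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) && has_imatrix && early) {
            return ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
        }
        return current; // otherwise keep whatever the earlier rules chose
    }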