@@ -16762,6 +16762,78 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
+static ggml_type change_type_if_necessary(ggml_type new_type, int nx, int ny) {
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
+        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
+        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
+        new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_IQ4_K || new_type == GGML_TYPE_IQ2_K ||
+        new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K || new_type == GGML_TYPE_Q4_K_R4 ||
+        new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ4_KS || new_type == GGML_TYPE_IQ4_XS_R8 ||
+        new_type == GGML_TYPE_IQ2_KS || new_type == GGML_TYPE_IQ4_KSS || new_type == GGML_TYPE_Q6_K_R4 ||
+        new_type == GGML_TYPE_Q5_K_R4 || new_type == GGML_TYPE_Q3_K_R4 || new_type == GGML_TYPE_Q2_K_R4 ||
+        new_type == GGML_TYPE_IQ4_K_R4|| new_type == GGML_TYPE_Q8_K_R8 || new_type == GGML_TYPE_IQ3_K_R4||
+        new_type == GGML_TYPE_IQ2_K_R4|| new_type == GGML_TYPE_IQ5_K_R4|| new_type == GGML_TYPE_IQ4_KS_R4 ||
+        new_type == GGML_TYPE_IQ3_XXS_R4 || new_type == GGML_TYPE_IQ2_XXS_R4 || new_type == GGML_TYPE_IQ2_XS_R4 ||
+        new_type == GGML_TYPE_IQ2_S_R4|| new_type == GGML_TYPE_IQ3_S_R4) {
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (new_type == GGML_TYPE_IQ1_BN || new_type == GGML_TYPE_IQ2_BN || new_type == GGML_TYPE_IQ2_BN_R4) {
+        if (nx % QK_IQ1BN != 0) {
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        switch (new_type) {
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_IQ2_XXS_R4:
+            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_XS_R4:
+            case GGML_TYPE_IQ2_KS:
+            case GGML_TYPE_IQ2_S:
+            case GGML_TYPE_IQ2_S_R4:
+            case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_XXS_R4:
+            case GGML_TYPE_IQ3_S:
+            case GGML_TYPE_IQ3_S_R4:
+            case GGML_TYPE_IQ1_S:
+            case GGML_TYPE_IQ1_M:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q2_K_R4:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_Q3_K_R4:
+            case GGML_TYPE_IQ2_K:
+            case GGML_TYPE_IQ2_K_R4:
+            case GGML_TYPE_IQ3_K:
+            case GGML_TYPE_IQ3_K_R4:
+            case GGML_TYPE_IQ4_KSS:
+            case GGML_TYPE_IQ4_KS:
+            case GGML_TYPE_IQ4_KS_R4:
+            case GGML_TYPE_IQ4_XS_R8:
+            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+            case GGML_TYPE_IQ4_K:
+            case GGML_TYPE_IQ4_K_R4:
+            case GGML_TYPE_Q4_K_R4:
+            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+            case GGML_TYPE_IQ5_K:
+            case GGML_TYPE_IQ5_K_R4:
+            case GGML_TYPE_Q5_K_R4:
+            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q6_0; break;
+            case GGML_TYPE_IQ6_K:
+            case GGML_TYPE_Q6_K_R4:
+            case GGML_TYPE_Q8_K_R8:
+            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+        }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+    }
+    return new_type;
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
@@ -17260,90 +17332,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         LLAMA_LOG_INFO("Using custom type %s for tensor %s\n", ggml_type_name(new_type), name.c_str());
     }
 
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    //}
-    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
-    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    //}
-    // This can be used to reduce the size of the Q5_K_S model.
-    // The associated PPL increase is fully in line with the size reduction
-    //else {
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-    //}
-    bool convert_incompatible_tensor = false;
-    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
-        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
-        new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_IQ4_K || new_type == GGML_TYPE_IQ2_K ||
-        new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K || new_type == GGML_TYPE_Q4_K_R4 ||
-        new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ4_KS || new_type == GGML_TYPE_IQ4_XS_R8 ||
-        new_type == GGML_TYPE_IQ2_KS || new_type == GGML_TYPE_IQ4_KSS || new_type == GGML_TYPE_Q6_K_R4 ||
-        new_type == GGML_TYPE_Q5_K_R4 || new_type == GGML_TYPE_Q3_K_R4 || new_type == GGML_TYPE_Q2_K_R4 ||
-        new_type == GGML_TYPE_IQ4_K_R4|| new_type == GGML_TYPE_Q8_K_R8 || new_type == GGML_TYPE_IQ3_K_R4||
-        new_type == GGML_TYPE_IQ2_K_R4|| new_type == GGML_TYPE_IQ5_K_R4|| new_type == GGML_TYPE_IQ4_KS_R4 ||
-        new_type == GGML_TYPE_IQ3_XXS_R4 || new_type == GGML_TYPE_IQ2_XXS_R4 || new_type == GGML_TYPE_IQ2_XS_R4 ||
-        new_type == GGML_TYPE_IQ2_S_R4|| new_type == GGML_TYPE_IQ3_S_R4) {
-        int nx = tensor->ne[0];
-        int ny = tensor->ne[1];
-        if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
-            convert_incompatible_tensor = true;
-        } else {
-            ++qs.n_k_quantized;
-        }
-    }
-    if (new_type == GGML_TYPE_IQ1_BN || new_type == GGML_TYPE_IQ2_BN || new_type == GGML_TYPE_IQ2_BN_R4) {
-        int nx = tensor->ne[0];
-        if (nx % QK_IQ1BN != 0) {
-            convert_incompatible_tensor = true;
-        }
-    }
-    if (convert_incompatible_tensor) {
-        switch (new_type) {
-            case GGML_TYPE_IQ2_XXS:
-            case GGML_TYPE_IQ2_XXS_R4:
-            case GGML_TYPE_IQ2_XS:
-            case GGML_TYPE_IQ2_XS_R4:
-            case GGML_TYPE_IQ2_KS:
-            case GGML_TYPE_IQ2_S:
-            case GGML_TYPE_IQ2_S_R4:
-            case GGML_TYPE_IQ3_XXS:
-            case GGML_TYPE_IQ3_XXS_R4:
-            case GGML_TYPE_IQ3_S:
-            case GGML_TYPE_IQ3_S_R4:
-            case GGML_TYPE_IQ1_S:
-            case GGML_TYPE_IQ1_M:
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q2_K_R4:
-            case GGML_TYPE_Q3_K:
-            case GGML_TYPE_Q3_K_R4:
-            case GGML_TYPE_IQ2_K:
-            case GGML_TYPE_IQ2_K_R4:
-            case GGML_TYPE_IQ3_K:
-            case GGML_TYPE_IQ3_K_R4:
-            case GGML_TYPE_IQ4_KSS:
-            case GGML_TYPE_IQ4_KS:
-            case GGML_TYPE_IQ4_KS_R4:
-            case GGML_TYPE_IQ4_XS_R8:
-            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
-            case GGML_TYPE_IQ4_K:
-            case GGML_TYPE_IQ4_K_R4:
-            case GGML_TYPE_Q4_K_R4:
-            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
-            case GGML_TYPE_IQ5_K:
-            case GGML_TYPE_IQ5_K_R4:
-            case GGML_TYPE_Q5_K_R4:
-            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q6_0; break;
-            case GGML_TYPE_IQ6_K:
-            case GGML_TYPE_Q6_K_R4:
-            case GGML_TYPE_Q8_K_R8:
-            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
-            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
-        }
-        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+    auto working_type = change_type_if_necessary(new_type, tensor->ne[0], tensor->ne[1]);
+    if (working_type != new_type) {
         ++qs.n_fallback;
+        new_type = working_type;
     }
 
     return new_type;
@@ -17848,7 +17840,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 
             // get more optimal quantization type based on the tensor shape, layer, etc.
-            if (!params->pure && ggml_is_quantized(default_type)) {
+            if (params->pure) {
+                auto working_type = change_type_if_necessary(new_type, tensor->ne[0], tensor->ne[1]);
+                if (working_type != new_type) {
+                    ++qs.n_fallback;
+                    new_type = working_type;
+                }
+            }
+            else if (ggml_is_quantized(default_type)) {
                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
             }
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {