@@ -16762,7 +16762,7 @@ static void llama_tensor_dequantize_internal(
1676216762 workers.clear();
1676316763}
1676416764
16765- static ggml_type change_type_if_necessar (ggml_type new_type, int nx, int ny) {
16765+ static ggml_type change_type_if_necessary (ggml_type new_type, int nx, int ny) {
1676616766 bool convert_incompatible_tensor = false;
1676716767 if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
1676816768 new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
@@ -16834,6 +16834,43 @@ static ggml_type change_type_if_necessar(ggml_type new_type, int nx, int ny) {
1683416834 return new_type;
1683516835}
1683616836
16837+ static std::pair<ggml_type, int> interleaved_properties(ggml_type type) {
16838+ static std::unordered_map<ggml_type, std::pair<ggml_type, int>> k_map = {
16839+ { GGML_TYPE_Q4_0_4_4, { GGML_TYPE_Q4_0, 4} },
16840+ { GGML_TYPE_Q4_0_4_8, { GGML_TYPE_Q4_0, 4} },
16841+ { GGML_TYPE_Q4_0_8_8, { GGML_TYPE_Q4_0, 8} },
16842+ { GGML_TYPE_Q4_0_R8, { GGML_TYPE_Q4_0, 8} },
16843+ { GGML_TYPE_Q5_0_R4, { GGML_TYPE_Q5_0, 4} },
16844+ { GGML_TYPE_Q6_0_R4, { GGML_TYPE_Q6_0, 4} },
16845+ { GGML_TYPE_Q8_0_R8, { GGML_TYPE_Q8_0, 8} },
16846+ { GGML_TYPE_Q2_K_R4, { GGML_TYPE_Q2_K, 4} },
16847+ { GGML_TYPE_Q3_K_R4, { GGML_TYPE_Q3_K, 4} },
16848+ { GGML_TYPE_Q4_K_R4, { GGML_TYPE_Q4_K, 4} },
16849+ { GGML_TYPE_Q5_K_R4, { GGML_TYPE_Q5_K, 4} },
16850+ { GGML_TYPE_Q6_K_R4, { GGML_TYPE_Q6_K, 4} },
16851+ { GGML_TYPE_IQ2_XXS_R4, { GGML_TYPE_IQ2_XXS, 4} },
16852+ { GGML_TYPE_IQ2_XS_R4, { GGML_TYPE_IQ2_XS, 4} },
16853+ { GGML_TYPE_IQ2_S_R4, { GGML_TYPE_IQ2_S, 4} },
16854+ { GGML_TYPE_IQ3_XXS_R4, { GGML_TYPE_IQ3_XXS, 4} },
16855+ { GGML_TYPE_IQ3_S_R4, { GGML_TYPE_IQ3_S, 4} },
16856+ { GGML_TYPE_IQ4_XS_R8, { GGML_TYPE_IQ4_XS, 8} },
16857+ { GGML_TYPE_IQ4_NL_R4, { GGML_TYPE_IQ4_NL, 4} },
16858+ { GGML_TYPE_IQ1_S_R4, { GGML_TYPE_IQ1_S, 4} },
16859+ { GGML_TYPE_IQ1_M_R4, { GGML_TYPE_IQ1_M, 4} },
16860+ { GGML_TYPE_IQ2_BN_R4, { GGML_TYPE_IQ2_BN, 4} },
16861+ { GGML_TYPE_IQ2_K_R4, { GGML_TYPE_IQ2_K, 4} },
16862+ { GGML_TYPE_IQ3_K_R4, { GGML_TYPE_IQ3_K, 4} },
16863+ { GGML_TYPE_IQ4_K_R4, { GGML_TYPE_IQ4_K, 4} },
16864+ { GGML_TYPE_IQ4_KS_R4, { GGML_TYPE_IQ4_KS, 4} },
16865+ { GGML_TYPE_IQ5_K_R4, { GGML_TYPE_IQ5_K, 4} },
16866+ { GGML_TYPE_Q8_KV_R8, { GGML_TYPE_Q8_KV, 8} },
16867+ { GGML_TYPE_Q8_K_R8, { GGML_TYPE_Q8_K, 8} },
16868+ { GGML_TYPE_BF16_R16, { GGML_TYPE_BF16, 16} },
16869+ };
16870+ if (auto it = k_map.find(type); it != k_map.end()) return it->second;
16871+ return {type, 1};
16872+ }
16873+
1683716874static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
1683816875 const std::string name = ggml_get_name(tensor);
1683916876
@@ -16939,70 +16976,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1693916976 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN_R4) {
1694016977 new_type = GGML_TYPE_IQ4_NL;
1694116978 }
16942- else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
16943- new_type == GGML_TYPE_Q4_0_8_8) {
16944- new_type = GGML_TYPE_Q4_0;
16945- }
16946- else if (new_type == GGML_TYPE_IQ4_NL_R4) {
16947- new_type = GGML_TYPE_IQ4_NL;
16948- }
16949- else if (new_type == GGML_TYPE_IQ4_XS_R8) {
16950- new_type = GGML_TYPE_IQ4_XS;
16951- }
16952- else if (new_type == GGML_TYPE_Q2_K_R4) {
16953- new_type = GGML_TYPE_Q2_K;
16954- }
16955- else if (new_type == GGML_TYPE_Q3_K_R4) {
16956- new_type = GGML_TYPE_Q3_K;
16957- }
16958- else if (new_type == GGML_TYPE_Q4_K_R4) {
16959- new_type = GGML_TYPE_Q4_K;
16960- }
16961- else if (new_type == GGML_TYPE_Q5_K_R4) {
16962- new_type = GGML_TYPE_Q5_K;
16963- }
16964- else if (new_type == GGML_TYPE_Q6_K_R4) {
16965- new_type = GGML_TYPE_Q6_K;
16966- }
16967- else if (new_type == GGML_TYPE_Q8_K_R8) {
16968- new_type = GGML_TYPE_Q8_0;
16969- }
16970- else if (new_type == GGML_TYPE_Q8_KV_R8) {
16971- new_type = GGML_TYPE_Q8_0;
16972- }
16973- else if (new_type == GGML_TYPE_IQ2_K_R4) {
16974- new_type = GGML_TYPE_IQ2_K;
16975- }
16976- else if (new_type == GGML_TYPE_IQ3_K_R4) {
16977- new_type = GGML_TYPE_IQ3_K;
16978- }
16979- else if (new_type == GGML_TYPE_IQ3_S_R4) {
16980- new_type = GGML_TYPE_IQ3_S;
16981- }
16982- else if (new_type == GGML_TYPE_IQ4_K_R4) {
16983- new_type = GGML_TYPE_IQ4_K;
16984- }
16985- else if (new_type == GGML_TYPE_IQ5_K_R4) {
16986- new_type = GGML_TYPE_IQ5_K;
16987- }
16988- else if (new_type == GGML_TYPE_IQ4_KS_R4) {
16989- new_type = GGML_TYPE_IQ4_KS;
16990- }
16991- else if (new_type == GGML_TYPE_Q4_0_R8) {
16992- new_type = GGML_TYPE_Q4_0;
16993- }
16994- else if (new_type == GGML_TYPE_Q5_0_R4) {
16995- new_type = GGML_TYPE_Q5_0;
16996- }
16997- else if (new_type == GGML_TYPE_Q6_0_R4) {
16998- new_type = GGML_TYPE_Q6_0;
16999- }
17000- else if (new_type == GGML_TYPE_Q8_0_R8) {
17001- new_type = GGML_TYPE_Q8_0;
17002- }
17003- else if (new_type == GGML_TYPE_BF16_R16) {
17004- new_type = GGML_TYPE_BF16;
17005- }
1700616979 }
1700716980 } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4) {
1700816981 if (name.find("attn_v.weight") != std::string::npos) {
@@ -17332,12 +17305,21 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1733217305 LLAMA_LOG_INFO("Using custom type %s for tensor %s\n", ggml_type_name(new_type), name.c_str());
1733317306 }
1733417307
17335- auto working_type = change_type_if_necessar (new_type, tensor->ne[0], tensor->ne[1]);
17308+ auto working_type = change_type_if_necessary (new_type, tensor->ne[0], tensor->ne[1]);
1733617309 if (working_type != new_type) {
1733717310 ++qs.n_fallback;
1733817311 new_type = working_type;
1733917312 }
1734017313
17314+ if (name == "token_embd.weight") {
17315+ auto working_type = interleaved_properties(new_type).first;
17316+ if (working_type != new_type) {
17317+ printf("\n============ Token embeddings cannot be quantized with row-interleaved quants\n");
17318+ printf("---> Changed %s to %s\n", ggml_type_name(new_type), ggml_type_name(working_type));
17319+ new_type = working_type;
17320+ }
17321+ }
17322+
1734117323 return new_type;
1734217324}
1734317325
@@ -17834,14 +17816,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1783417816 }
1783517817
1783617818 if (quantize) {
17819+
1783717820 new_type = default_type;
17838- if (new_type == GGML_TYPE_BF16_R16 && strcmp(tensor->name, "token_embd.weight") == 0) {
17839- new_type = GGML_TYPE_BF16;
17840- }
1784117821
1784217822 // get more optimal quantization type based on the tensor shape, layer, etc.
1784317823 if (params->pure) {
17844- auto working_type = change_type_if_necessar (new_type, tensor->ne[0], tensor->ne[1]);
17824+ auto working_type = change_type_if_necessary (new_type, tensor->ne[0], tensor->ne[1]);
1784517825 if (working_type != new_type) {
1784617826 ++qs.n_fallback;
1784717827 new_type = working_type;
@@ -17881,6 +17861,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1788117861 new_type = params->ffn_up_type;
1788217862 }
1788317863
17864+ if (strcmp(tensor->name, "token_embd.weight") == 0) {
17865+ // token embeddings cannot be quantized with row-interleaved quants
17866+ auto working_type = interleaved_properties(new_type).first;
17867+ if (working_type != new_type) {
17868+ printf("\n============ Token embeddings cannot be quantized with row-interleaved quants\n");
17869+ printf("---> Changed %s to %s\n", ggml_type_name(new_type), ggml_type_name(working_type));
17870+ new_type = working_type;
17871+ }
17872+ }
17873+
1788417874 // If we've decided to quantize to the same type the tensor is already
1788517875 // in then there's nothing to do.
1788617876 quantize = tensor->type != new_type;
@@ -17965,119 +17955,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1796517955 }
1796617956
1796717957 int chunk_size_multiplier = 1;
17968- if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
17969- if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
17970- else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
17971- if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
17972- else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
17973- }
17974- else if (new_type == GGML_TYPE_IQ4_NL_R4) {
17975- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_NL;
17976- else chunk_size_multiplier = 4;
17977- }
17978- else if (new_type == GGML_TYPE_IQ4_XS_R8) {
17979- if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_IQ4_XS;
17980- else chunk_size_multiplier = 8;
17981- }
17982- else if (new_type == GGML_TYPE_Q4_0_R8) {
17983- if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q4_0;
17984- else chunk_size_multiplier = 8;
17985- }
17986- else if (new_type == GGML_TYPE_Q5_0_R4) {
17987- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_0;
17988- else chunk_size_multiplier = 4;
17989- }
17990- else if (new_type == GGML_TYPE_Q6_0_R4) {
17991- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_0;
17992- else chunk_size_multiplier = 4;
17993- }
17994- else if (new_type == GGML_TYPE_Q8_0_R8) {
17995- if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
17996- else chunk_size_multiplier = 8;
17997- }
17998- else if (new_type == GGML_TYPE_Q2_K_R4) {
17999- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q2_K;
18000- else chunk_size_multiplier = 4;
18001- }
18002- else if (new_type == GGML_TYPE_Q3_K_R4) {
18003- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q3_K;
18004- else chunk_size_multiplier = 4;
18005- }
18006- else if (new_type == GGML_TYPE_Q4_K_R4) {
18007- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_K;
18008- else chunk_size_multiplier = 4;
18009- }
18010- else if (new_type == GGML_TYPE_Q5_K_R4) {
18011- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_K;
18012- else chunk_size_multiplier = 4;
18013- }
18014- else if (new_type == GGML_TYPE_Q6_K_R4) {
18015- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_K;
18016- else chunk_size_multiplier = 4;
18017- }
18018- else if (new_type == GGML_TYPE_Q8_K_R8) {
18019- if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
18020- else chunk_size_multiplier = 8;
18021- }
18022- else if (new_type == GGML_TYPE_Q8_KV_R8) {
18023- if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
18024- else chunk_size_multiplier = 8;
18025- }
18026- else if (new_type == GGML_TYPE_IQ2_BN_R4) {
18027- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_BN;
18028- else chunk_size_multiplier = 4;
18029- }
18030- else if (new_type == GGML_TYPE_IQ2_K_R4) {
18031- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_K;
18032- else chunk_size_multiplier = 4;
18033- }
18034- else if (new_type == GGML_TYPE_IQ3_K_R4) {
18035- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_K;
18036- else chunk_size_multiplier = 4;
18037- }
18038- else if (new_type == GGML_TYPE_IQ4_K_R4) {
18039- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_K;
18040- else chunk_size_multiplier = 4;
18041- }
18042- else if (new_type == GGML_TYPE_IQ5_K_R4) {
18043- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ5_K;
18044- else chunk_size_multiplier = 4;
18045- }
18046- else if (new_type == GGML_TYPE_IQ4_KS_R4) {
18047- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_KS;
18048- else chunk_size_multiplier = 4;
18049- }
18050- else if (new_type == GGML_TYPE_IQ2_XXS_R4) {
18051- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_XXS;
18052- else chunk_size_multiplier = 4;
18053- }
18054- else if (new_type == GGML_TYPE_IQ2_XS_R4) {
18055- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_XS;
18056- else chunk_size_multiplier = 4;
18057- }
18058- else if (new_type == GGML_TYPE_IQ2_S_R4) {
18059- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_S;
18060- else chunk_size_multiplier = 4;
18061- }
18062- else if (new_type == GGML_TYPE_IQ3_XXS_R4) {
18063- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_XXS;
18064- else chunk_size_multiplier = 4;
18065- }
18066- else if (new_type == GGML_TYPE_IQ3_S_R4) {
18067- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_S;
18068- else chunk_size_multiplier = 4;
18069- }
18070- else if (new_type == GGML_TYPE_IQ1_S_R4) {
18071- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_S;
18072- else chunk_size_multiplier = 4;
18073- }
18074- else if (new_type == GGML_TYPE_IQ1_M_R4) {
18075- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_M;
18076- else chunk_size_multiplier = 4;
18077- }
18078- else if (new_type == GGML_TYPE_BF16_R16) {
18079- if (tensor->ne[1] % 16 != 0) new_type = GGML_TYPE_BF16;
18080- else chunk_size_multiplier = 16;
17958+ auto [working_type, num_rows] = interleaved_properties(new_type);
17959+ if (tensor->ne[1] % num_rows != 0) {
17960+ new_type = working_type;
17961+ } else {
17962+ chunk_size_multiplier = num_rows;
1808117963 }
1808217964
1808317965 LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
0 commit comments