
Commit b07a337

ikawrakow and Iwan Kawrakow authored
Additional guards for interleaved quants (#299)
* Make sure no interleaved quants are being used for token embeddings also with `--pure` and/or `--custom-q`.

* Simplify

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 6e5156c commit b07a337

1 file changed: +65 -183 lines changed

src/llama.cpp

Lines changed: 65 additions & 183 deletions
@@ -16762,7 +16762,7 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
-static ggml_type change_type_if_necessar(ggml_type new_type, int nx, int ny) {
+static ggml_type change_type_if_necessary(ggml_type new_type, int nx, int ny) {
     bool convert_incompatible_tensor = false;
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
         new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
@@ -16834,6 +16834,43 @@ static ggml_type change_type_if_necessar(ggml_type new_type, int nx, int ny) {
     return new_type;
 }
 
+static std::pair<ggml_type, int> interleaved_properties(ggml_type type) {
+    static std::unordered_map<ggml_type, std::pair<ggml_type, int>> k_map = {
+        { GGML_TYPE_Q4_0_4_4, { GGML_TYPE_Q4_0, 4} },
+        { GGML_TYPE_Q4_0_4_8, { GGML_TYPE_Q4_0, 4} },
+        { GGML_TYPE_Q4_0_8_8, { GGML_TYPE_Q4_0, 8} },
+        { GGML_TYPE_Q4_0_R8, { GGML_TYPE_Q4_0, 8} },
+        { GGML_TYPE_Q5_0_R4, { GGML_TYPE_Q5_0, 4} },
+        { GGML_TYPE_Q6_0_R4, { GGML_TYPE_Q6_0, 4} },
+        { GGML_TYPE_Q8_0_R8, { GGML_TYPE_Q8_0, 8} },
+        { GGML_TYPE_Q2_K_R4, { GGML_TYPE_Q2_K, 4} },
+        { GGML_TYPE_Q3_K_R4, { GGML_TYPE_Q3_K, 4} },
+        { GGML_TYPE_Q4_K_R4, { GGML_TYPE_Q4_K, 4} },
+        { GGML_TYPE_Q5_K_R4, { GGML_TYPE_Q5_K, 4} },
+        { GGML_TYPE_Q6_K_R4, { GGML_TYPE_Q6_K, 4} },
+        { GGML_TYPE_IQ2_XXS_R4, { GGML_TYPE_IQ2_XXS, 4} },
+        { GGML_TYPE_IQ2_XS_R4, { GGML_TYPE_IQ2_XS, 4} },
+        { GGML_TYPE_IQ2_S_R4, { GGML_TYPE_IQ2_S, 4} },
+        { GGML_TYPE_IQ3_XXS_R4, { GGML_TYPE_IQ3_XXS, 4} },
+        { GGML_TYPE_IQ3_S_R4, { GGML_TYPE_IQ3_S, 4} },
+        { GGML_TYPE_IQ4_XS_R8, { GGML_TYPE_IQ4_XS, 8} },
+        { GGML_TYPE_IQ4_NL_R4, { GGML_TYPE_IQ4_NL, 4} },
+        { GGML_TYPE_IQ1_S_R4, { GGML_TYPE_IQ1_S, 4} },
+        { GGML_TYPE_IQ1_M_R4, { GGML_TYPE_IQ1_M, 4} },
+        { GGML_TYPE_IQ2_BN_R4, { GGML_TYPE_IQ2_BN, 4} },
+        { GGML_TYPE_IQ2_K_R4, { GGML_TYPE_IQ2_K, 4} },
+        { GGML_TYPE_IQ3_K_R4, { GGML_TYPE_IQ3_K, 4} },
+        { GGML_TYPE_IQ4_K_R4, { GGML_TYPE_IQ4_K, 4} },
+        { GGML_TYPE_IQ4_KS_R4, { GGML_TYPE_IQ4_KS, 4} },
+        { GGML_TYPE_IQ5_K_R4, { GGML_TYPE_IQ5_K, 4} },
+        { GGML_TYPE_Q8_KV_R8, { GGML_TYPE_Q8_KV, 8} },
+        { GGML_TYPE_Q8_K_R8, { GGML_TYPE_Q8_K, 8} },
+        { GGML_TYPE_BF16_R16, { GGML_TYPE_BF16, 16} },
+    };
+    if (auto it = k_map.find(type); it != k_map.end()) return it->second;
+    return {type, 1};
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
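The helper added above centralizes what the row-interleaved ("_R4"/"_R8"/"_R16") types have in common: each maps to a plain base type plus the number of rows packed together. The following is a hypothetical usage sketch, not part of the commit; it assumes this fork's ggml.h for the GGML_TYPE_* enums and ggml_type_name(), and trims the map to a few entries so it stays self-contained. It shows how a caller can use the returned pair both to fall back to the base type and to check row divisibility.

// Hypothetical usage sketch, not code from the commit. Assumes this fork's ggml.h
// (for the GGML_TYPE_* enums and ggml_type_name()) and reproduces a trimmed-down
// copy of the interleaved_properties() lookup so the example is self-contained.
#include <cstdio>
#include <unordered_map>
#include <utility>
#include "ggml.h"

static std::pair<ggml_type, int> interleaved_properties_sketch(ggml_type type) {
    // Abbreviated map: interleaved type -> { base type, rows packed per block }
    static const std::unordered_map<ggml_type, std::pair<ggml_type, int>> k_map = {
        { GGML_TYPE_Q4_0_R8,  { GGML_TYPE_Q4_0,  8} },
        { GGML_TYPE_Q8_0_R8,  { GGML_TYPE_Q8_0,  8} },
        { GGML_TYPE_IQ4_K_R4, { GGML_TYPE_IQ4_K, 4} },
        { GGML_TYPE_BF16_R16, { GGML_TYPE_BF16, 16} },
    };
    if (auto it = k_map.find(type); it != k_map.end()) return it->second;
    return {type, 1};  // non-interleaved types map to themselves with a row count of 1
}

int main() {
    // Q8_0_R8 packs 8 rows: a tensor with 4096 rows can keep it, one with 6 rows cannot.
    auto [base, rows] = interleaved_properties_sketch(GGML_TYPE_Q8_0_R8);
    printf("Q8_0_R8 -> base %s, %d rows; ok for ne[1]=4096: %d, ok for ne[1]=6: %d\n",
           ggml_type_name(base), rows, 4096 % rows == 0, 6 % rows == 0);
    return 0;
}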
@@ -16939,70 +16976,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN_R4) {
                 new_type = GGML_TYPE_IQ4_NL;
             }
-            else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
-                     new_type == GGML_TYPE_Q4_0_8_8) {
-                new_type = GGML_TYPE_Q4_0;
-            }
-            else if (new_type == GGML_TYPE_IQ4_NL_R4) {
-                new_type = GGML_TYPE_IQ4_NL;
-            }
-            else if (new_type == GGML_TYPE_IQ4_XS_R8) {
-                new_type = GGML_TYPE_IQ4_XS;
-            }
-            else if (new_type == GGML_TYPE_Q2_K_R4) {
-                new_type = GGML_TYPE_Q2_K;
-            }
-            else if (new_type == GGML_TYPE_Q3_K_R4) {
-                new_type = GGML_TYPE_Q3_K;
-            }
-            else if (new_type == GGML_TYPE_Q4_K_R4) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (new_type == GGML_TYPE_Q5_K_R4) {
-                new_type = GGML_TYPE_Q5_K;
-            }
-            else if (new_type == GGML_TYPE_Q6_K_R4) {
-                new_type = GGML_TYPE_Q6_K;
-            }
-            else if (new_type == GGML_TYPE_Q8_K_R8) {
-                new_type = GGML_TYPE_Q8_0;
-            }
-            else if (new_type == GGML_TYPE_Q8_KV_R8) {
-                new_type = GGML_TYPE_Q8_0;
-            }
-            else if (new_type == GGML_TYPE_IQ2_K_R4) {
-                new_type = GGML_TYPE_IQ2_K;
-            }
-            else if (new_type == GGML_TYPE_IQ3_K_R4) {
-                new_type = GGML_TYPE_IQ3_K;
-            }
-            else if (new_type == GGML_TYPE_IQ3_S_R4) {
-                new_type = GGML_TYPE_IQ3_S;
-            }
-            else if (new_type == GGML_TYPE_IQ4_K_R4) {
-                new_type = GGML_TYPE_IQ4_K;
-            }
-            else if (new_type == GGML_TYPE_IQ5_K_R4) {
-                new_type = GGML_TYPE_IQ5_K;
-            }
-            else if (new_type == GGML_TYPE_IQ4_KS_R4) {
-                new_type = GGML_TYPE_IQ4_KS;
-            }
-            else if (new_type == GGML_TYPE_Q4_0_R8) {
-                new_type = GGML_TYPE_Q4_0;
-            }
-            else if (new_type == GGML_TYPE_Q5_0_R4) {
-                new_type = GGML_TYPE_Q5_0;
-            }
-            else if (new_type == GGML_TYPE_Q6_0_R4) {
-                new_type = GGML_TYPE_Q6_0;
-            }
-            else if (new_type == GGML_TYPE_Q8_0_R8) {
-                new_type = GGML_TYPE_Q8_0;
-            }
-            else if (new_type == GGML_TYPE_BF16_R16) {
-                new_type = GGML_TYPE_BF16;
-            }
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4) {
         if (name.find("attn_v.weight") != std::string::npos) {
@@ -17332,12 +17305,21 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         LLAMA_LOG_INFO("Using custom type %s for tensor %s\n", ggml_type_name(new_type), name.c_str());
     }
 
-    auto working_type = change_type_if_necessar(new_type, tensor->ne[0], tensor->ne[1]);
+    auto working_type = change_type_if_necessary(new_type, tensor->ne[0], tensor->ne[1]);
     if (working_type != new_type) {
         ++qs.n_fallback;
         new_type = working_type;
     }
 
+    if (name == "token_embd.weight") {
+        auto working_type = interleaved_properties(new_type).first;
+        if (working_type != new_type) {
+            printf("\n============ Token embeddings cannot be quantized with row-interleaved quants\n");
+            printf("---> Changed %s to %s\n", ggml_type_name(new_type), ggml_type_name(working_type));
+            new_type = working_type;
+        }
+    }
+
     return new_type;
 }
 
@@ -17834,14 +17816,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
 
         if (quantize) {
+
             new_type = default_type;
-            if (new_type == GGML_TYPE_BF16_R16 && strcmp(tensor->name, "token_embd.weight") == 0) {
-                new_type = GGML_TYPE_BF16;
-            }
 
             // get more optimal quantization type based on the tensor shape, layer, etc.
             if (params->pure) {
-                auto working_type = change_type_if_necessar(new_type, tensor->ne[0], tensor->ne[1]);
+                auto working_type = change_type_if_necessary(new_type, tensor->ne[0], tensor->ne[1]);
                 if (working_type != new_type) {
                     ++qs.n_fallback;
                     new_type = working_type;
@@ -17881,6 +17861,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 new_type = params->ffn_up_type;
             }
 
+            if (strcmp(tensor->name, "token_embd.weight") == 0) {
+                // token embeddings cannot be quantized with row-interleaved quants
+                auto working_type = interleaved_properties(new_type).first;
+                if (working_type != new_type) {
+                    printf("\n============ Token embeddings cannot be quantized with row-interleaved quants\n");
+                    printf("---> Changed %s to %s\n", ggml_type_name(new_type), ggml_type_name(working_type));
+                    new_type = working_type;
+                }
+            }
+
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             quantize = tensor->type != new_type;
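This second guard sits in llama_model_quantize_internal, right before the chosen type is committed, so the `--pure` path and the explicit per-tensor type overrides (which do not go through llama_tensor_get_type's own check) are covered as well. The following is a toy illustration of that situation, not the committed code; it assumes this fork's ggml.h, and interleaved_base() is a stand-in for interleaved_properties(new_type).first.

// Toy illustration, not the committed code. With --pure the per-tensor heuristics are
// skipped and every tensor starts from the user-requested default type, so the
// embedding tensor needs its own check. Assumes this fork's ggml.h for the
// GGML_TYPE_* enums and ggml_type_name().
#include <cstdio>
#include <cstring>
#include "ggml.h"

static ggml_type interleaved_base(ggml_type t) {
    return t == GGML_TYPE_Q4_K_R4 ? GGML_TYPE_Q4_K : t;   // abbreviated stand-in
}

static ggml_type pick_type_pure(const char * tensor_name, ggml_type default_type) {
    ggml_type new_type = default_type;                     // --pure: no heuristics
    if (strcmp(tensor_name, "token_embd.weight") == 0) {
        new_type = interleaved_base(new_type);             // the new guard
    }
    return new_type;
}

int main() {
    // With an interleaved default type, ordinary weights keep it but the embeddings fall back.
    printf("blk.0.ffn_up.weight -> %s\n", ggml_type_name(pick_type_pure("blk.0.ffn_up.weight", GGML_TYPE_Q4_K_R4)));
    printf("token_embd.weight   -> %s\n", ggml_type_name(pick_type_pure("token_embd.weight",   GGML_TYPE_Q4_K_R4)));
    return 0;
}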
@@ -17965,119 +17955,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
 
             int chunk_size_multiplier = 1;
-            if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
-                if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
-                else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
-                if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
-                else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ4_NL_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_NL;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ4_XS_R8) {
-                if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_IQ4_XS;
-                else chunk_size_multiplier = 8;
-            }
-            else if (new_type == GGML_TYPE_Q4_0_R8) {
-                if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q4_0;
-                else chunk_size_multiplier = 8;
-            }
-            else if (new_type == GGML_TYPE_Q5_0_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_0;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q6_0_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_0;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q8_0_R8) {
-                if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
-                else chunk_size_multiplier = 8;
-            }
-            else if (new_type == GGML_TYPE_Q2_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q2_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q3_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q3_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q4_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q5_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q6_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q8_K_R8) {
-                if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
-                else chunk_size_multiplier = 8;
-            }
-            else if (new_type == GGML_TYPE_Q8_KV_R8) {
-                if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
-                else chunk_size_multiplier = 8;
-            }
-            else if (new_type == GGML_TYPE_IQ2_BN_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_BN;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ2_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ3_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ4_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ5_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ5_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ4_KS_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_KS;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ2_XXS_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_XXS;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ2_XS_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_XS;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ2_S_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_S;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ3_XXS_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_XXS;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ3_S_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_S;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ1_S_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_S;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ1_M_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_M;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_BF16_R16) {
-                if (tensor->ne[1] % 16 != 0) new_type = GGML_TYPE_BF16;
-                else chunk_size_multiplier = 16;
+            auto [working_type, num_rows] = interleaved_properties(new_type);
+            if (tensor->ne[1] % num_rows != 0) {
+                new_type = working_type;
+            } else {
+                chunk_size_multiplier = num_rows;
             }
 
             LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
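The long chain above collapses into the same rule expressed once: look up the base type and row-group size, fall back to the base type when the tensor's row count ne[1] is not a multiple of the group, otherwise scale the quantization chunk size by the group. Below is a small self-contained sketch of that rule, not the committed code; Kind is a stand-in for ggml_type and props() stands in for interleaved_properties().

// Self-contained sketch of the consolidated fallback rule, not the committed code.
#include <cstdio>
#include <utility>

enum class Kind { Q4_0, Q4_0_R8, BF16, BF16_R16 };   // stand-ins for a few ggml types

// Stand-in for interleaved_properties(): { base kind, rows packed per block }.
static std::pair<Kind, int> props(Kind k) {
    switch (k) {
        case Kind::Q4_0_R8:  return {Kind::Q4_0,  8};   // 8 rows interleaved
        case Kind::BF16_R16: return {Kind::BF16, 16};   // 16 rows interleaved
        default:             return {k, 1};             // not interleaved
    }
}

int main() {
    const std::pair<Kind, long> cases[] = {
        {Kind::Q4_0_R8, 4096}, {Kind::Q4_0_R8, 100}, {Kind::BF16_R16, 32},
    };
    for (auto [kind, ne1] : cases) {
        int chunk_size_multiplier = 1;
        auto [working_type, num_rows] = props(kind);
        Kind new_type = kind;
        if (ne1 % num_rows != 0) new_type = working_type;          // rows don't divide: drop to base type
        else chunk_size_multiplier = num_rows;                     // otherwise scale chunks by the row-group size
        printf("ne[1]=%4ld  fell_back=%d  chunk_size_multiplier=%d\n",
               ne1, (int)(new_type != kind), chunk_size_multiplier);
    }
    return 0;
}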
