
Commit 23b0add

ikawrakow and Iwan Kawrakow authored
Make sure tensor row size is multiple of block size also when quantizing with --pure (#294)
* WIP - not working
* q8_0 without bells and whistles works
* It works for q8_0
* Use bf16 instead of f16,int16
* q4_0_r8
* q5_0_r4
* q6_0_r4
* Also q4_1 and q5_1
* Add check if selected type is possible with --pure

  I often want to quantize with --pure to see quantization performance without quantization mixes. But for models where there are tensors with row sizes that are not a multiple of 256, this results in a crash for k- and i-quants. Hence, let's add a check whether the quant selected via --pure is applicable, and change it if not.

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent d0b5207 commit 23b0add
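
Background for the crash described in the commit message: ggml's k- and i-quants pack each tensor row into super-blocks of QK_K = 256 values, so a row whose length is not an exact multiple of 256 cannot be encoded with those types at all. The following minimal, standalone C++ sketch (not code from this commit; the row sizes are made-up examples) illustrates the divisibility constraint:

    // Minimal standalone sketch of the constraint behind this commit.
    // QK_K = 256 matches ggml's k-/i-quant super-block size; the row sizes
    // below are hypothetical examples, not taken from any particular model.
    #include <cstdint>
    #include <cstdio>

    static bool fits_super_blocks(int64_t row_size, int64_t block_size) {
        // A block-quantized row must split into an exact number of blocks.
        return row_size % block_size == 0;
    }

    int main() {
        const int64_t QK_K = 256;
        const int64_t row_sizes[] = { 4096, 7168, 320 };
        for (int64_t nx : row_sizes) {
            std::printf("row size %5lld: %s\n", (long long) nx,
                        fits_super_blocks(nx, QK_K) ? "ok for k-/i-quants"
                                                    : "needs a fallback type");
        }
        return 0;
    }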

File tree

1 file changed: src/llama.cpp (83 additions, 84 deletions)

@@ -16762,6 +16762,78 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
+static ggml_type change_type_if_necessar(ggml_type new_type, int nx, int ny) {
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
+        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
+        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
+        new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_IQ4_K || new_type == GGML_TYPE_IQ2_K ||
+        new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K || new_type == GGML_TYPE_Q4_K_R4 ||
+        new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ4_KS || new_type == GGML_TYPE_IQ4_XS_R8 ||
+        new_type == GGML_TYPE_IQ2_KS || new_type == GGML_TYPE_IQ4_KSS || new_type == GGML_TYPE_Q6_K_R4 ||
+        new_type == GGML_TYPE_Q5_K_R4 || new_type == GGML_TYPE_Q3_K_R4 || new_type == GGML_TYPE_Q2_K_R4 ||
+        new_type == GGML_TYPE_IQ4_K_R4|| new_type == GGML_TYPE_Q8_K_R8 || new_type == GGML_TYPE_IQ3_K_R4||
+        new_type == GGML_TYPE_IQ2_K_R4|| new_type == GGML_TYPE_IQ5_K_R4|| new_type == GGML_TYPE_IQ4_KS_R4 ||
+        new_type == GGML_TYPE_IQ3_XXS_R4 || new_type == GGML_TYPE_IQ2_XXS_R4 || new_type == GGML_TYPE_IQ2_XS_R4 ||
+        new_type == GGML_TYPE_IQ2_S_R4|| new_type == GGML_TYPE_IQ3_S_R4) {
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (new_type == GGML_TYPE_IQ1_BN || new_type == GGML_TYPE_IQ2_BN || new_type == GGML_TYPE_IQ2_BN_R4) {
+        if (nx % QK_IQ1BN != 0) {
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        switch (new_type) {
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_IQ2_XXS_R4:
+            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_XS_R4:
+            case GGML_TYPE_IQ2_KS:
+            case GGML_TYPE_IQ2_S:
+            case GGML_TYPE_IQ2_S_R4:
+            case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_XXS_R4:
+            case GGML_TYPE_IQ3_S:
+            case GGML_TYPE_IQ3_S_R4:
+            case GGML_TYPE_IQ1_S:
+            case GGML_TYPE_IQ1_M:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q2_K_R4:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_Q3_K_R4:
+            case GGML_TYPE_IQ2_K:
+            case GGML_TYPE_IQ2_K_R4:
+            case GGML_TYPE_IQ3_K:
+            case GGML_TYPE_IQ3_K_R4:
+            case GGML_TYPE_IQ4_KSS:
+            case GGML_TYPE_IQ4_KS:
+            case GGML_TYPE_IQ4_KS_R4:
+            case GGML_TYPE_IQ4_XS_R8:
+            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+            case GGML_TYPE_IQ4_K:
+            case GGML_TYPE_IQ4_K_R4:
+            case GGML_TYPE_Q4_K_R4:
+            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+            case GGML_TYPE_IQ5_K:
+            case GGML_TYPE_IQ5_K_R4:
+            case GGML_TYPE_Q5_K_R4:
+            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q6_0; break;
+            case GGML_TYPE_IQ6_K:
+            case GGML_TYPE_Q6_K_R4:
+            case GGML_TYPE_Q8_K_R8:
+            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+        }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+    }
+    return new_type;
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
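
The helper added above gathers the pre-existing compatibility logic into one function: when the requested type needs 256-wide super-blocks (or the BitNet types need QK_IQ1BN-wide blocks) and the row size does not divide evenly, a 32-block type of comparable bit width is substituted. A simplified, self-contained sketch of that mapping, with a stand-in enum instead of the real ggml_type values and only a few representative quants, could look like this:

    // Simplified standalone sketch of the fallback mapping done by the new
    // helper. Quant is a stand-in for ggml_type; the pairings mirror the
    // switch in the diff above, but only a handful of types are shown.
    #include <cstdio>
    #include <stdexcept>

    enum class Quant { Q2_K, Q4_K, Q5_K, Q6_K, IQ4_XS, IQ4_NL, Q5_0, Q6_0, Q8_0 };

    constexpr int kSuperBlock = 256;   // QK_K: super-block size of k-/i-quants

    static bool needs_super_blocks(Quant q) {
        switch (q) {
            case Quant::Q2_K: case Quant::Q4_K: case Quant::Q5_K:
            case Quant::Q6_K: case Quant::IQ4_XS: return true;
            default: return false;   // legacy 32-block types have no such constraint
        }
    }

    static Quant working_quant(Quant requested, int nx) {
        if (!needs_super_blocks(requested) || nx % kSuperBlock == 0) return requested;
        switch (requested) {
            case Quant::Q2_K:
            case Quant::IQ4_XS: return Quant::IQ4_NL;   // low-bit family -> IQ4_NL
            case Quant::Q4_K:   return Quant::Q5_0;     // 4-bit -> legacy 5-bit
            case Quant::Q5_K:   return Quant::Q6_0;     // 5-bit -> legacy 6-bit
            case Quant::Q6_K:   return Quant::Q8_0;     // 6-bit -> legacy 8-bit
            default: throw std::runtime_error("unsupported tensor size");
        }
    }

    int main() {
        // Hypothetical row of 320 values: 320 % 256 != 0, so Q4_K is swapped out.
        Quant q = working_quant(Quant::Q4_K, 320);
        std::printf("Q4_K on a 320-wide row -> %s\n", q == Quant::Q5_0 ? "Q5_0" : "Q4_K");
        return 0;
    }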

@@ -17260,90 +17332,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         LLAMA_LOG_INFO("Using custom type %s for tensor %s\n", ggml_type_name(new_type), name.c_str());
     }
 
-    // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    //}
-    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
-    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
-    // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    //}
-    // This can be used to reduce the size of the Q5_K_S model.
-    // The associated PPL increase is fully in line with the size reduction
-    //else {
-    // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-    //}
-    bool convert_incompatible_tensor = false;
-    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
-        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
-        new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_IQ4_K || new_type == GGML_TYPE_IQ2_K ||
-        new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K || new_type == GGML_TYPE_Q4_K_R4 ||
-        new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ4_KS || new_type == GGML_TYPE_IQ4_XS_R8 ||
-        new_type == GGML_TYPE_IQ2_KS || new_type == GGML_TYPE_IQ4_KSS || new_type == GGML_TYPE_Q6_K_R4 ||
-        new_type == GGML_TYPE_Q5_K_R4 || new_type == GGML_TYPE_Q3_K_R4 || new_type == GGML_TYPE_Q2_K_R4 ||
-        new_type == GGML_TYPE_IQ4_K_R4|| new_type == GGML_TYPE_Q8_K_R8 || new_type == GGML_TYPE_IQ3_K_R4||
-        new_type == GGML_TYPE_IQ2_K_R4|| new_type == GGML_TYPE_IQ5_K_R4|| new_type == GGML_TYPE_IQ4_KS_R4 ||
-        new_type == GGML_TYPE_IQ3_XXS_R4 || new_type == GGML_TYPE_IQ2_XXS_R4 || new_type == GGML_TYPE_IQ2_XS_R4 ||
-        new_type == GGML_TYPE_IQ2_S_R4|| new_type == GGML_TYPE_IQ3_S_R4) {
-        int nx = tensor->ne[0];
-        int ny = tensor->ne[1];
-        if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
-            convert_incompatible_tensor = true;
-        } else {
-            ++qs.n_k_quantized;
-        }
-    }
-    if (new_type == GGML_TYPE_IQ1_BN || new_type == GGML_TYPE_IQ2_BN || new_type == GGML_TYPE_IQ2_BN_R4) {
-        int nx = tensor->ne[0];
-        if (nx % QK_IQ1BN != 0) {
-            convert_incompatible_tensor = true;
-        }
-    }
-    if (convert_incompatible_tensor) {
-        switch (new_type) {
-            case GGML_TYPE_IQ2_XXS:
-            case GGML_TYPE_IQ2_XXS_R4:
-            case GGML_TYPE_IQ2_XS:
-            case GGML_TYPE_IQ2_XS_R4:
-            case GGML_TYPE_IQ2_KS:
-            case GGML_TYPE_IQ2_S:
-            case GGML_TYPE_IQ2_S_R4:
-            case GGML_TYPE_IQ3_XXS:
-            case GGML_TYPE_IQ3_XXS_R4:
-            case GGML_TYPE_IQ3_S:
-            case GGML_TYPE_IQ3_S_R4:
-            case GGML_TYPE_IQ1_S:
-            case GGML_TYPE_IQ1_M:
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q2_K_R4:
-            case GGML_TYPE_Q3_K:
-            case GGML_TYPE_Q3_K_R4:
-            case GGML_TYPE_IQ2_K:
-            case GGML_TYPE_IQ2_K_R4:
-            case GGML_TYPE_IQ3_K:
-            case GGML_TYPE_IQ3_K_R4:
-            case GGML_TYPE_IQ4_KSS:
-            case GGML_TYPE_IQ4_KS:
-            case GGML_TYPE_IQ4_KS_R4:
-            case GGML_TYPE_IQ4_XS_R8:
-            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
-            case GGML_TYPE_IQ4_K:
-            case GGML_TYPE_IQ4_K_R4:
-            case GGML_TYPE_Q4_K_R4:
-            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
-            case GGML_TYPE_IQ5_K:
-            case GGML_TYPE_IQ5_K_R4:
-            case GGML_TYPE_Q5_K_R4:
-            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q6_0; break;
-            case GGML_TYPE_IQ6_K:
-            case GGML_TYPE_Q6_K_R4:
-            case GGML_TYPE_Q8_K_R8:
-            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
-            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
-        }
-        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+    auto working_type = change_type_if_necessar(new_type, tensor->ne[0], tensor->ne[1]);
+    if (working_type != new_type) {
         ++qs.n_fallback;
+        new_type = working_type;
     }
 
     return new_type;
@@ -17848,7 +17840,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
 
         // get more optimal quantization type based on the tensor shape, layer, etc.
-        if (!params->pure && ggml_is_quantized(default_type)) {
+        if (params->pure) {
+            auto working_type = change_type_if_necessar(new_type, tensor->ne[0], tensor->ne[1]);
+            if (working_type != new_type) {
+                ++qs.n_fallback;
+                new_type = working_type;
+            }
+        }
+        else if (ggml_is_quantized(default_type)) {
             new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
         }
         if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
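
This last hunk is the actual --pure fix: previously the compatibility check lived only on the non-pure path inside llama_tensor_get_type, so --pure handed incompatible tensors straight to the quantizer and crashed. Below is a condensed, standalone sketch of the per-tensor selection flow after the change; the tensor names, shapes, and types are illustrative stand-ins, not the real driver loop in llama_model_quantize_internal:

    // Condensed standalone sketch of the per-tensor type selection after this
    // change. Types, tensor names, and shapes are hypothetical stand-ins; the
    // real code uses ggml_type/ggml_tensor and the change_type_if_necessar helper.
    #include <cstdint>
    #include <cstdio>

    enum class Quant { Q6_K, Q8_0 };

    struct TensorInfo { const char * name; int64_t ne0; };   // ne0 = row size

    // Stand-in for the helper: Q6_K needs 256-wide super-blocks, Q8_0 does not.
    static Quant working_quant(Quant requested, int64_t ne0) {
        if (requested == Quant::Q6_K && ne0 % 256 != 0) return Quant::Q8_0;
        return requested;
    }

    int main() {
        const bool  pure         = true;          // as if --pure was passed
        const Quant default_type = Quant::Q6_K;   // requested "pure" quant

        // Hypothetical tensors; the second row size is not divisible by 256.
        const TensorInfo tensors[] = { {"blk.0.ffn_down.weight", 4096},
                                       {"blk.0.attn_odd.weight",  320} };
        int n_fallback = 0;
        for (const TensorInfo & t : tensors) {
            Quant new_type = default_type;
            if (pure) {
                // New behaviour: even with --pure, verify the type fits the row.
                Quant working = working_quant(new_type, t.ne0);
                if (working != new_type) { ++n_fallback; new_type = working; }
            } else {
                // The non-pure path would call llama_tensor_get_type() here instead.
            }
            std::printf("%-24s -> %s\n", t.name, new_type == Quant::Q6_K ? "Q6_K" : "Q8_0");
        }
        std::printf("tensors that needed a fallback: %d\n", n_fallback);
        return 0;
    }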
