Skip to content

Commit ab0cba5

Browse files
committed
Statically quantize tensors where the imatrix size differs, and fix "Missing importance matrix for tensor XXX in a very low-bit quantization" by instead falling back to the nearest static quant type with better quality than the specified target quant
1 parent 91ecc29 commit ab0cba5

File tree

1 file changed

+22
-2
lines changed

1 file changed

+22
-2
lines changed

src/llama-quant.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -846,12 +846,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
846846
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
847847
// tok_embd should be ignored in this case, since it always causes this warning
848848
if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
849-
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
850-
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
849+
LLAMA_LOG_INFO("imatrix size %d is different from tensor size %d for %s", int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
851850
}
852851
}
853852
}
854853
}
854+
if ((new_type == GGML_TYPE_IQ2_XXS ||
855+
new_type == GGML_TYPE_IQ1_S ||
856+
(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight"))) && !imatrix) {
857+
LLAMA_LOG_INFO("\n\n============================================================\n");
858+
LLAMA_LOG_INFO("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
859+
LLAMA_LOG_INFO("The result will be garbage, so using GGML_TYPE_Q2_K\n");
860+
LLAMA_LOG_INFO("============================================================\n\n");
861+
new_type = GGML_TYPE_Q2_K;
862+
//throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
863+
}
864+
865+
if ((new_type == GGML_TYPE_IQ2_XS ||
866+
new_type == GGML_TYPE_IQ2_S ||
867+
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
868+
LLAMA_LOG_INFO("\n\n============================================================\n");
869+
LLAMA_LOG_INFO("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
870+
LLAMA_LOG_INFO("The result will be garbage, so using GGML_TYPE_Q3_K\n");
871+
LLAMA_LOG_INFO("============================================================\n\n");
872+
new_type = GGML_TYPE_Q3_K;
873+
}
874+
855875
if ((new_type == GGML_TYPE_IQ2_XXS ||
856876
new_type == GGML_TYPE_IQ2_XS ||
857877
new_type == GGML_TYPE_IQ2_S ||

0 commit comments

Comments
 (0)