
Commit 0d7245a

fix: Only flatten to Q8_0 if the raw target type is a quantization
Branch: GraniteEmbedQuant
Signed-off-by: Gabe Goodhart <[email protected]>
1 parent: 614c6e6

File tree

1 file changed (+4, -2 lines)

src/llama-quant.cpp

Lines changed: 4 additions & 2 deletions
@@ -148,14 +148,16 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
 
     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
+    const bool is_quantized = ggml_is_quantized(new_type);
     if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
         if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
             new_type = qs.params->output_tensor_type;
         } else {
             const int64_t nx = tensor->ne[0];
             const int64_t qk_k = ggml_blck_size(new_type);
 
-            if (arch == LLM_ARCH_FALCON || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE || nx % qk_k != 0) {
+            if (arch == LLM_ARCH_FALCON || nx % qk_k != 0 ||
+                (is_quantized && (arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE))) {
                 new_type = GGML_TYPE_Q8_0;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -171,7 +173,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
             new_type = qs.params->token_embedding_type;
         } else {
-            if (arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
+            if (is_quantized && (arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE)) {
                 new_type = GGML_TYPE_Q8_0;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
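
For readers following the change: the new guard means the shared token-embedding/output tensor for Granite and Granite MoE is forced to Q8_0 only when the requested target type is itself a quantized block format; a non-quantized target such as F16 or F32 is now left as requested, which is presumably the case motivating the GraniteEmbedQuant branch. Below is a minimal standalone sketch of that decision logic, not llama.cpp code: the mock_type/mock_arch enums, mock_is_quantized(), and pick_embd_type() are illustrative stand-ins for ggml_type, ggml_is_quantized(), and the logic inside llama_tensor_get_type().

// Standalone sketch (illustrative stand-ins, not llama.cpp code): mirrors the
// decision introduced by this commit.
#include <cstdio>

enum mock_type { TYPE_F16, TYPE_F32, TYPE_Q4_K, TYPE_Q8_0 };
enum mock_arch { ARCH_FALCON, ARCH_GRANITE, ARCH_GRANITE_MOE, ARCH_OTHER };

static bool mock_is_quantized(mock_type t) {
    // in ggml, F16/F32/BF16 are not "quantized"; block formats like Q4_K and Q8_0 are
    return t == TYPE_Q4_K || t == TYPE_Q8_0;
}

// decide the type for the shared token-embedding/output tensor (simplified)
static mock_type pick_embd_type(mock_arch arch, mock_type requested) {
    const bool is_quantized = mock_is_quantized(requested);
    // old behavior: Granite arches were always flattened to Q8_0, even for an F16/F32 target
    // new behavior: only flatten when the requested target type really is a quantization
    if (is_quantized && (arch == ARCH_GRANITE || arch == ARCH_GRANITE_MOE)) {
        return TYPE_Q8_0;
    }
    return requested;
}

int main() {
    printf("GRANITE + Q4_K -> %d (flattened to Q8_0)\n", (int) pick_embd_type(ARCH_GRANITE, TYPE_Q4_K));
    printf("GRANITE + F16  -> %d (kept as requested)\n",  (int) pick_embd_type(ARCH_GRANITE, TYPE_F16));
    return 0;
}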
