@@ -148,14 +148,16 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
148148
149149 // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
150150 // with the quantization of the output tensor
151+ const bool is_quantized = ggml_is_quantized (new_type);
151152 if (name == tn (LLM_TENSOR_OUTPUT, " weight" ) || (!qs.has_output && name == tn (LLM_TENSOR_TOKEN_EMBD, " weight" ))) {
152153 if (qs.params ->output_tensor_type < GGML_TYPE_COUNT) {
153154 new_type = qs.params ->output_tensor_type ;
154155 } else {
155156 const int64_t nx = tensor->ne [0 ];
156157 const int64_t qk_k = ggml_blck_size (new_type);
157158
158- if (arch == LLM_ARCH_FALCON || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE || nx % qk_k != 0 ) {
159+ if (arch == LLM_ARCH_FALCON || nx % qk_k != 0 ||
160+ (is_quantized && (arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE))) {
159161 new_type = GGML_TYPE_Q8_0;
160162 }
161163 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -171,7 +173,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
171173 if (qs.params ->token_embedding_type < GGML_TYPE_COUNT) {
172174 new_type = qs.params ->token_embedding_type ;
173175 } else {
174- if (arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
176+ if (is_quantized && ( arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) ) {
175177 new_type = GGML_TYPE_Q8_0;
176178 }
177179 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
0 commit comments