Commit 0d1bbde

Fix dequantization when requantizing (ikawrakow#795)

Authored by ikawrakow (Iwan Kawrakow)
Co-authored-by: Iwan Kawrakow <[email protected]>

1 parent cde2eb5 commit 0d1bbde

File tree: 1 file changed (+9, -2 lines)


src/llama.cpp: 9 additions, 2 deletions

@@ -17788,13 +17788,20 @@ static void llama_tensor_dequantize_internal(
         return;
     }

-    if (nthread < 2) {
+    if (nthread < 2 || (ggml_is_quantized(tensor->type) && qtype.row_meta_size > 0)) {
         if (tensor->type == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
         } else if (tensor->type == GGML_TYPE_BF16) {
             ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor->type)) {
-            qtype.to_float(tensor->data, f32_output, nelements);
+            auto row_size = ggml_row_size(tensor->type, tensor->ne[0]);
+            int nrows = ggml_nrows(tensor);
+            auto qsrc = (const char *)tensor->data;
+            for (int row = 0; row < nrows; ++row) {
+                qtype.to_float(qsrc, f32_output, tensor->ne[0]);
+                qsrc += row_size;
+                f32_output += tensor->ne[0];
+            }
         } else {
             GGML_ABORT("fatal error"); // unreachable
         }
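
The change routes quantized types whose trait table reports per-row metadata (qtype.row_meta_size > 0) down the single-threaded path and dequantizes them one row at a time, advancing a byte pointer by ggml_row_size() per row, instead of calling qtype.to_float once over all nelements. The sketch below is a minimal, self-contained illustration of why per-row metadata forces that row-wise loop; it uses a toy format and invented names (toy_row_size, toy_to_float_row), not code from this repository: each row begins with its own metadata bytes, so a single flat call cannot walk the buffer by element count alone.

// Toy quantized format: each row stores one float scale (the "row metadata")
// followed by NE0 int8 quants. Hypothetical example, not ik_llama.cpp code.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

static constexpr int64_t NE0   = 4;  // elements per row
static constexpr int64_t NROWS = 2;  // rows in the toy tensor

// Bytes per row: per-row metadata (the scale) plus one int8 per element.
static size_t toy_row_size() { return sizeof(float) + NE0 * sizeof(int8_t); }

// Dequantize exactly one row: read its scale, then scale each int8 quant.
static void toy_to_float_row(const void * src, float * dst, int64_t n) {
    float scale;
    std::memcpy(&scale, src, sizeof(scale));
    const int8_t * q = (const int8_t *)((const char *) src + sizeof(float));
    for (int64_t i = 0; i < n; ++i) dst[i] = scale * q[i];
}

int main() {
    // Build the quantized buffer: row 0 with scale 0.5, row 1 with scale 2.0,
    // quants 1..4 in each row.
    std::vector<char> data(NROWS * toy_row_size());
    const float scales[NROWS] = {0.5f, 2.0f};
    for (int64_t r = 0; r < NROWS; ++r) {
        char * row = data.data() + r * toy_row_size();
        std::memcpy(row, &scales[r], sizeof(float));
        int8_t * q = (int8_t *)(row + sizeof(float));
        for (int64_t i = 0; i < NE0; ++i) q[i] = (int8_t)(i + 1);
    }

    // Row-wise dequantization, mirroring the loop added in this commit:
    // call the per-row converter, then step past metadata + quants.
    std::vector<float> out(NROWS * NE0);
    const char * qsrc = data.data();
    float * f32_output = out.data();
    for (int64_t row = 0; row < NROWS; ++row) {
        toy_to_float_row(qsrc, f32_output, NE0);
        qsrc       += toy_row_size();
        f32_output += NE0;
    }
    for (float v : out) printf("%g ", v);
    printf("\n");
    return 0;
}

Built with any C++11 compiler, this prints 0.5 1 1.5 2 2 4 6 8, i.e. each row scaled by its own metadata value; a single flat call over all eight elements would misinterpret row 1's scale bytes as quants, which is the failure mode the commit's per-row loop avoids for types with row_meta_size > 0.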
