Commit 0d1bbde

Fix dequantization when requantizing (ikawrakow#795)

Authored by ikawrakow (Iwan Kawrakow)
Co-authored-by: Iwan Kawrakow <[email protected]>

1 parent cde2eb5 commit 0d1bbde

File tree: 1 file changed (+9, -2 lines)


src/llama.cpp: 9 additions, 2 deletions

@@ -17788,13 +17788,20 @@ static void llama_tensor_dequantize_internal(
         return;
     }

-    if (nthread < 2) {
+    if (nthread < 2 || (ggml_is_quantized(tensor->type) && qtype.row_meta_size > 0)) {
         if (tensor->type == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
         } else if (tensor->type == GGML_TYPE_BF16) {
             ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor->type)) {
-            qtype.to_float(tensor->data, f32_output, nelements);
+            auto row_size = ggml_row_size(tensor->type, tensor->ne[0]);
+            int nrows = ggml_nrows(tensor);
+            auto qsrc = (const char *)tensor->data;
+            for (int row = 0; row < nrows; ++row) {
+                qtype.to_float(qsrc, f32_output, tensor->ne[0]);
+                qsrc += row_size;
+                f32_output += tensor->ne[0];
+            }
         } else {
             GGML_ABORT("fatal error"); // unreachable
         }
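
The change routes quantized types whose trait table reports per-row metadata (qtype.row_meta_size > 0) down the single-threaded path and dequantizes them one row at a time, advancing a byte pointer by ggml_row_size() per row, instead of calling qtype.to_float once over all nelements. The sketch below is a minimal, self-contained illustration of why per-row metadata forces that row-wise loop; it uses a toy format and invented names (toy_row_size, toy_to_float_row), not code from this repository: each row begins with its own metadata bytes, so a single flat call cannot walk the buffer by element count alone.

// Toy quantized format: each row stores one float scale (the "row metadata")
// followed by NE0 int8 quants. Hypothetical example, not ik_llama.cpp code.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

static constexpr int64_t NE0   = 4;  // elements per row
static constexpr int64_t NROWS = 2;  // rows in the toy tensor

// Bytes per row: per-row metadata (the scale) plus one int8 per element.
static size_t toy_row_size() { return sizeof(float) + NE0 * sizeof(int8_t); }

// Dequantize exactly one row: read its scale, then scale each int8 quant.
static void toy_to_float_row(const void * src, float * dst, int64_t n) {
    float scale;
    std::memcpy(&scale, src, sizeof(scale));
    const int8_t * q = (const int8_t *)((const char *) src + sizeof(float));
    for (int64_t i = 0; i < n; ++i) dst[i] = scale * q[i];
}

int main() {
    // Build the quantized buffer: row 0 with scale 0.5, row 1 with scale 2.0,
    // quants 1..4 in each row.
    std::vector<char> data(NROWS * toy_row_size());
    const float scales[NROWS] = {0.5f, 2.0f};
    for (int64_t r = 0; r < NROWS; ++r) {
        char * row = data.data() + r * toy_row_size();
        std::memcpy(row, &scales[r], sizeof(float));
        int8_t * q = (int8_t *)(row + sizeof(float));
        for (int64_t i = 0; i < NE0; ++i) q[i] = (int8_t)(i + 1);
    }

    // Row-wise dequantization, mirroring the loop added in this commit:
    // call the per-row converter, then step past metadata + quants.
    std::vector<float> out(NROWS * NE0);
    const char * qsrc = data.data();
    float * f32_output = out.data();
    for (int64_t row = 0; row < NROWS; ++row) {
        toy_to_float_row(qsrc, f32_output, NE0);
        qsrc       += toy_row_size();
        f32_output += NE0;
    }
    for (float v : out) printf("%g ", v);
    printf("\n");
    return 0;
}

Built with any C++11 compiler, this prints 0.5 1 1.5 2 2 4 6 8, i.e. each row scaled by its own metadata value; a single flat call over all eight elements would misinterpret row 1's scale bytes as quants, which is the failure mode the commit's per-row loop avoids for types with row_meta_size > 0.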
