Fix quantized K cache without FA (ikawrakow#680)

ikawrakow · Iwan Kawrakow · web-flow · commit 41f0d2e5dea8 · 2025-08-08T13:51:14.000+03:00
* Prevent assert with quantized K cache and no FA

* Fix MMQ when running with quantized K cache without FA

---------

Co-authored-by: Iwan Kawrakow &lt;iwan.kawrakow@gmail.com&gt;
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
@@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
     }
 
     const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
-    if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1) {
+    if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1 && ne02 == ne12 && ne02 == dst->ne[2]) {
         //printf("invoking fast path for %s x %s\n", src0->name, src1->name);
         int id = ctx.device;
         char  *  src0_dd_i =  dev[id].src0_dd;
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
@@ -14,7 +14,7 @@ void ggml_cuda_op_mul_mat_q(
     const int64_t src1_padded_row_size, cudaStream_t stream) {
 
     const int64_t ne00 = src0->ne[0];
-    const int64_t nb01 = src0->nb[1];
+    const int64_t nb01 = ggml_row_size(src0->type, ne00);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];

Original file line number	Diff line number	Diff line change
`@@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(`
`1647`	`1647`	`}`
`1648`	`1648`
`1649`	`1649`	`const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;`
`1650`		`- if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1) {`
	`1650`	`+ if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1 && ne02 == ne12 && ne02 == dst->ne[2]) {`
`1651`	`1651`	`//printf("invoking fast path for %s x %s\n", src0->name, src1->name);`
`1652`	`1652`	`int id = ctx.device;`
`1653`	`1653`	`char * src0_dd_i = dev[id].src0_dd;`