Skip to content

Commit 41f0d2e

Browse files
ikawrakow and Iwan Kawrakow authored
Fix quantized K cache without FA (ikawrakow#680)
* Prevent assert with quantized K cache and no FA
* Fix MMQ when running with quantized K cache without FA

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 58f3bda commit 41f0d2e

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

ggml/src/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
16471647
}
16481648

16491649
const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
1650-
if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1) {
1650+
if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1 && ne02 == ne12 && ne02 == dst->ne[2]) {
16511651
//printf("invoking fast path for %s x %s\n", src0->name, src1->name);
16521652
int id = ctx.device;
16531653
char * src0_dd_i = dev[id].src0_dd;

ggml/src/ggml-cuda/mmq.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ void ggml_cuda_op_mul_mat_q(
1414
const int64_t src1_padded_row_size, cudaStream_t stream) {
1515

1616
const int64_t ne00 = src0->ne[0];
17-
const int64_t nb01 = src0->nb[1];
17+
const int64_t nb01 = ggml_row_size(src0->type, ne00);
1818

1919
const int64_t ne10 = src1->ne[0];
2020
const int64_t ne11 = src1->ne[1];

0 commit comments

Comments
 (0)