vulkan: use q8_1_x4 blocks in mul_mmq shader

0cc4m · 0cc4m · commit 7c5f8ded16d8 · 2025-08-31T09:18:39.000Z
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5763,7 +5763,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
 
     if (quantize_y) {
-        to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, false);
+        to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true);
     }
 
     if (dryrun) {
@@ -5780,7 +5780,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
             ctx->prealloc_size_x = x_sz_upd;
         }
         if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
-            ctx->prealloc_size_y = y_sz_upd;
+            ctx->prealloc_size_y = CEIL_DIV(y_sz_upd, 128) * 128;
         }
         if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) {
             ctx->prealloc_size_split_k = split_k_size;
@@ -5871,7 +5871,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
             if (ctx->prealloc_y_need_sync) {
                 ggml_vk_sync_buffers(ctx, subctx);
             }
-            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
+            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
             ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
@@ -28,7 +28,7 @@ layout (binding = 0) readonly buffer A {A_TYPE_PACKED16 data_a[];};
 #if defined(A_TYPE_PACKED32)
 layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
 #endif
-layout (binding = 1) readonly buffer B {block_q8_1_packed32 data_b[];};
+layout (binding = 1) readonly buffer B {block_q8_1_x4_packed128 data_b[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 
 #ifdef MUL_MAT_ID
@@ -98,7 +98,7 @@ shared FLOAT_TYPE_VEC2 buf_b_ds[BN];
 #endif
 
 #define LOAD_VEC_A (4 * QUANT_R)
-#define LOAD_VEC_B 4
+#define LOAD_VEC_B 16
 
 #ifdef MUL_MAT_ID
 shared u16vec2 row_ids[4096];
@@ -270,15 +270,22 @@ void main() {
             const uint iqs = idx & 0x7;
 #else
             const uint ib = pos_b_ib + (loadc_b + l) * p.stride_b / BK;
+            const uint ib_outer = ib / 4;
+            const uint ib_inner = ib % 4;
+
             const uint iqs = loadr_b;
 #endif
 
             const uint buf_ib = loadc_b + l;
 
             if (iqs == 0) {
-                buf_b_ds[buf_ib] = FLOAT_TYPE_VEC2(data_b[ib].ds);
+                buf_b_ds[buf_ib] = FLOAT_TYPE_VEC2(data_b[ib_outer].ds[ib_inner]);
             }
-            buf_b_qs[buf_ib * SHMEM_STRIDE + iqs] = data_b[ib].qs[iqs];
+            const ivec4 values = data_b[ib_outer].qs[ib_inner * 2 + iqs];
+            buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4    ] = values.x;
+            buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4 + 1] = values.y;
+            buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4 + 2] = values.z;
+            buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4 + 3] = values.w;
         }
 
         barrier();

Original file line number	Diff line number	Diff line change
`@@ -5763,7 +5763,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub`
`5763`	`5763`	`GGML_ASSERT(!qy_needs_dequant \|\| to_fp16_vk_1 != nullptr); // NOLINT`
`5764`	`5764`
`5765`	`5765`	`if (quantize_y) {`
`5766`		`- to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, false);`
	`5766`	`+ to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true);`
`5767`	`5767`	`}`
`5768`	`5768`
`5769`	`5769`	`if (dryrun) {`
`@@ -5780,7 +5780,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub`
`5780`	`5780`	`ctx->prealloc_size_x = x_sz_upd;`
`5781`	`5781`	`}`
`5782`	`5782`	`if ((qy_needs_dequant \|\| quantize_y) && ctx->prealloc_size_y < y_sz_upd) {`
`5783`		`- ctx->prealloc_size_y = y_sz_upd;`
	`5783`	`+ ctx->prealloc_size_y = CEIL_DIV(y_sz_upd, 128) * 128;`
`5784`	`5784`	`}`
`5785`	`5785`	`if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) {`
`5786`	`5786`	`ctx->prealloc_size_split_k = split_k_size;`
`@@ -5871,7 +5871,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub`
`5871`	`5871`	`if (ctx->prealloc_y_need_sync) {`
`5872`	`5872`	`ggml_vk_sync_buffers(ctx, subctx);`
`5873`	`5873`	`}`
`5874`		`- ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);`
	`5874`	`+ ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);`
`5875`	`5875`	`ctx->prealloc_y_last_pipeline_used = to_q8_1.get();`
`5876`	`5876`	`ctx->prealloc_y_last_tensor_used = src1;`
`5877`	`5877`	`}`