Remove redundant code, use non-saturating integer dot product, enable all matmul sizes for mmq

0cc4m · 0cc4m · commit a527b9cc5938 · 2025-03-30T17:22:57.000Z
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1899,14 +1899,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
         if (device->mul_mat ## ID ## _s[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true);   \
 
-#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
-        if (device->mul_mat ## ID ## _l[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
-        if (device->mul_mat ## ID ## _m[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
-        if (device->mul_mat ## ID ## _s[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
-
         // Create 2 variants, {f16,f32} accumulator
 #define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
         if (device->coopmat_acc_f16_support) { \
@@ -2013,7 +2005,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
             CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc,  matmul_id_iq4_nl_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
         }
 #undef CREATE_MM2
-#undef CREATE_MMQ
 #undef CREATE_MM
     } else
 #endif  // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
@@ -4151,7 +4142,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
         return aligned ? mmp->a_s : mmp->s;
     }
 
-    if ((ctx->device->mul_mat_s[src0_type] && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_l[src0_type]) || src1_type == GGML_TYPE_Q8_1) {
+    if ((ctx->device->mul_mat_s[src0_type] && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_l[src0_type])) {
         return aligned ? mmp->a_s : mmp->s;
     }
     if ((ctx->device->mul_mat_m[src0_type] && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_l[src0_type]) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
@@ -113,8 +113,8 @@ shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
 #include "mul_mmq_funcs.comp"
 
 void main() {
-#if defined(DATA_A_IQ4_NL)
-    init_iq4nl_shmem();
+#ifdef NEEDS_INIT_IQ_SHMEM
+    init_iq_shmem(gl_WorkGroupSize);
 #endif
 
 #ifdef MUL_MAT_ID
@@ -347,9 +347,8 @@ void main() {
                         const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
                         int32_t q_sum = 0;
                         [[unroll]] for (uint idx_k = 0; idx_k < BK / 4; idx_k++) {
-                            q_sum = dotPacked4x8AccSatEXT(cache_a_qs[cache_a_idx * (BK / 4) + idx_k],
-                                                          cache_b_qs[cc * (BK / 4) + idx_k],
-                                                          q_sum);
+                            q_sum += dotPacked4x8EXT(cache_a_qs[cache_a_idx * (BK / 4) + idx_k],
+                                                    cache_b_qs[cc * (BK / 4) + idx_k]);
                         }
 
                         sums[sums_idx] += mul_q8_1(q_sum, cache_a_dm[cache_a_idx], cache_b_ds[cc]);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp
@@ -40,18 +40,12 @@ i32vec2 repack(uint ib, uint iqs) {
     const u16vec2 quants = u16vec2(data_a[ib].qs[iqs * 2    ],
                                    data_a[ib].qs[iqs * 2 + 1]);
     const uint32_t vui = pack32(quants);
-    const uint32_t qh = (uint32_t(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]) >> (4 * iqs);
-    int32_t v0 = int32_t(vui & 0x0F0F0F0F);
-    v0 |= int32_t((qh <<  4) & 0x00000010); // 0 ->  4
-    v0 |= int32_t((qh << 11) & 0x00001000); // 1 -> 12
-    v0 |= int32_t((qh << 18) & 0x00100000); // 2 -> 20
-    v0 |= int32_t((qh << 25) & 0x10000000); // 3 -> 28
-
-    int32_t v1 = int32_t((vui >> 4) & 0x0F0F0F0F);
-    v1 |= int32_t((qh >> 12) & 0x00000010); // 16 ->  4
-    v1 |= int32_t((qh >>  5) & 0x00001000); // 17 -> 12
-    v1 |= int32_t((qh <<  2) & 0x00100000); // 18 -> 20
-    v1 |= int32_t((qh <<  9) & 0x10000000); // 19 -> 28
+    const int32_t qh = int32_t((uint32_t(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]) >> (4 * iqs));
+    const int32_t v0 = int32_t(vui & 0x0F0F0F0F)
+                     | ((qh & 0xF) * 0x02040810) & 0x10101010; // (0,1,2,3) -> (4,12,20,28)
+
+    const int32_t v1 = int32_t((vui >> 4) & 0x0F0F0F0F)
+                     | (((qh >> 16) & 0xF) * 0x02040810) & 0x10101010; // (16,17,18,19) -> (4,12,20,28)
 
     return i32vec2(v0, v1);
 }
@@ -65,18 +59,12 @@ ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) {
 i32vec2 repack(uint ib, uint iqs) {
     // Use 4-byte loads since a q5_1 block (24 bytes) is divisible by 4
     const uint32_t vui = data_a_packed32[ib].qs[iqs];
-    const uint32_t qh = data_a_packed32[ib].qh >> (4 * iqs);
-    int32_t v0 = int32_t(vui & 0x0F0F0F0F);
-    v0 |= int32_t((qh <<  4) & 0x00000010); // 0 ->  4
-    v0 |= int32_t((qh << 11) & 0x00001000); // 1 -> 12
-    v0 |= int32_t((qh << 18) & 0x00100000); // 2 -> 20
-    v0 |= int32_t((qh << 25) & 0x10000000); // 3 -> 28
-
-    int32_t v1 = int32_t((vui >> 4) & 0x0F0F0F0F);
-    v1 |= int32_t((qh >> 12) & 0x00000010); // 16 ->  4
-    v1 |= int32_t((qh >>  5) & 0x00001000); // 17 -> 12
-    v1 |= int32_t((qh <<  2) & 0x00100000); // 18 -> 20
-    v1 |= int32_t((qh <<  9) & 0x10000000); // 19 -> 28
+    const int32_t qh = int32_t(data_a_packed32[ib].qh >> (4 * iqs));
+    const int32_t v0 = int32_t(vui & 0x0F0F0F0F)
+                     | ((qh & 0xF) * 0x02040810) & 0x10101010; // (0,1,2,3) -> (4,12,20,28)
+
+    const int32_t v1 = int32_t((vui >> 4) & 0x0F0F0F0F)
+                     | (((qh >> 16) & 0xF) * 0x02040810) & 0x10101010; // (16,17,18,19) -> (4,12,20,28)
 
     return i32vec2(v0, v1);
 }