Vulkan: TQ2_0 x Q8_1 MUL_MAT perf improvements

Italo Nicola · Italo Nicola · commit 417679c44c98 · 2025-11-21T16:17:26.000-05:00
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -3148,6 +3148,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
                 const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size;
                 const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4);
 
+                ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_TQ2_0][i], "mul_mat_vec_tq2_0_q8_1_f32", arr_dmmv_tq2_0_q8_1_f32_len[reduc], arr_dmmv_tq2_0_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int);
                 ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_q8_1_f32", arr_dmmv_q4_0_q8_1_f32_len[reduc], arr_dmmv_q4_0_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int);
                 ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_q8_1_f32", arr_dmmv_q4_1_q8_1_f32_len[reduc], arr_dmmv_q4_1_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int);
                 ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_q8_1_f32", arr_dmmv_q5_0_q8_1_f32_len[reduc], arr_dmmv_q5_0_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int);
@@ -4829,6 +4830,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
 
     if (b_type == GGML_TYPE_Q8_1) {
         switch (a_type) {
+            case GGML_TYPE_TQ2_0:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q5_0:
@@ -4891,6 +4893,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
         if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL) {
             dmmv_wg = DMMV_WG_SIZE_SUBGROUP;
         }
+        if (ctx->device->vendor_id == VK_VENDOR_ID_QUALCOMM) {
+            dmmv_wg = DMMV_WG_SIZE_SUBGROUP;
+        }
         return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[dmmv_wg][a_type][num_cols-1];
     }
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
@@ -664,8 +664,12 @@ float16_t dequantFuncTQ2_0(const in decodeBufTQ2_0 bl, const in uint blockCoords
     const float16_t d = bl.block.d;
     const uint idx = coordInBlock[1];
 
-    const uint byte_idx = ((idx >> 7) << 5) + (idx & 31u);
-    const uint qsshift = (((idx & 127u) >> 5) << 1);
+    const uint iqs = idx % 128u;
+    const uint upper = idx / 128u;
+
+    const uint byte_idx = (upper * 32u) + (iqs % 32u);
+
+    const uint qsshift = (iqs / 32u) * 2u;
 
     const uint c = (uint(bl.block.qs[byte_idx]) >> qsshift) & 3u;
     return d * float16_t(float(c) - 1.0f);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
@@ -13,7 +13,7 @@
 
 #include "types.comp"
 
-#ifndef MMQ
+#if !defined(MMQ) || !defined(A_TYPE_PACKED16)
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 #else
 layout (binding = 0) readonly buffer A {A_TYPE_PACKED16 data_a[];};
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_tq2_0.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_tq2_0.comp
@@ -21,15 +21,18 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
         }
     }
 
-    [[unroll]] for (uint i = tid; i < num_blocks_per_row; i += gl_WorkGroupSize.x) {
-
-        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-            const uint ib0 = a_offset / QUANT_K + (first_row + n) * num_blocks_per_row;
-            const float d = float(data_a[ib0 + i].d);
-            
-            [[unroll]] for (uint j = 0; j < 64; j += 32) {
-                [[unroll]] for (uint l = 0; l < 4; ++l) {
-                    [[unroll]] for (uint k = 0; k < 32; ++k) {
+    for (uint n = 0; n < num_rows; ++n) {
+        const uint ib0 = a_offset / QUANT_K + (first_row + n) * num_blocks_per_row;
+        for (uint jcol = 0; jcol < NUM_COLS; ++jcol) {
+            const uint b_base = (jcol * p.batch_stride_b);
+            FLOAT_TYPE acc = 0.0f;
+            for (uint i = tid/8; i < num_blocks_per_row; i += gl_WorkGroupSize.x/8) {
+                const FLOAT_TYPE d = float(data_a[ib0 + i].d);
+                
+                [[unroll]] for (uint j = 0; j < 64; j += 32) {
+                    [[unroll]] for (uint l = 0; l < 4; ++l) {
+                        [[unroll]] for (uint k = tid%8; k < 32; k+=8) {
+                        //uint k = (tid % 8) * 4;
                         // Extract quantized value: ((x[i].qs[j + k] >> (l*2)) & 3) - 1
                         const uint q_byte = uint(data_a[ib0 + i].qs[j + k]);
                         const uint shift = l * 2;
@@ -38,10 +41,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
                         
                         // y-data access pattern: y[i].qs[j*4 + l*32 + k]
                         const uint b_idx = i * QUANT_K + j * 4 + l * 32 + k;
-                        if (b_idx < p.ncols) {
-                            [[unroll]] for (uint jcol = 0; jcol < NUM_COLS; ++jcol) {
-                                temp[jcol][n] += dequant_val * FLOAT_TYPE(data_b[jcol * p.batch_stride_b + b_offset + b_idx]);
-                            }
+                        temp[jcol][n] += dequant_val * FLOAT_TYPE(data_b[b_base + b_offset + b_idx]);
                         }
                     }
                 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_tq2_0_q.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_tq2_0_q.comp
@@ -0,0 +1,95 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_integer_dot_product : require
+
+#define MMQ // must be defined before the include below: mul_mat_vec_base.comp tests MMQ when declaring buffer A
+#define B_TYPE block_q8_1_x4 // element type used for data_b in this shader
+
+#include "mul_mat_vec_base.comp"
+#include "mul_mmq_funcs.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; // x size is taken from specialization constant 0
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; // partial sums indexed [column][row]; not declared shared
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // Fills temp[col][row] for rows [first_row, first_row + num_rows), then calls reduce_result.
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K; // integer division: a trailing partial block would be ignored
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0); // entries for rows >= num_rows are not written again below and stay zero
+        }
+    }
+
+    const uint tid = gl_LocalInvocationID.x;
+    // Invocations work in groups of 8: tid / 8 picks the A block, tid % 8 picks a 4-byte slice of its qs.
+    for (uint jcol = 0; jcol < NUM_COLS; jcol++) {
+        const uint b_base = (jcol * p.batch_stride_b);
+        for (uint n = 0; n < num_rows; ++n) {
+            const uint ib0 = a_offset / QUANT_K + (first_row + n) * num_blocks_per_row; // NOTE(review): mul_mat_vecq.comp adds a_offset to a block index undivided - confirm a_offset's unit
+            FLOAT_TYPE acc = 0.0f;
+            for (uint i = tid/8; i < num_blocks_per_row; i+=gl_WorkGroupSize.x/8) { // the step is 0 (i never advances) if gl_WorkGroupSize.x < 8
+                const float d = float(data_a[ib0 + i].d);
+                [[unroll]] for (uint j = 0; j < 64; j += 32) {
+                    [[unroll]] for (uint l = 0; l < 4; l+=2) { // two 2-bit fields per qs byte (l and l+1) are handled per iteration
+                        uint k = (tid % 8) * 4;
+                        // Field l: one 2-bit value from each of 4 consecutive qs bytes, placed one per byte of a0_packed.
+                        const uint shift0 = l * 2u;
+                        const int c00 = int(((data_a[ib0 + i].qs[j + k]) >> shift0) & 3u);
+                        const int c01 = int(((data_a[ib0 + i].qs[j + k + 1]) >> shift0) & 3u);
+                        const int c02 = int(((data_a[ib0 + i].qs[j + k + 2]) >> shift0) & 3u);
+                        const int c03 = int(((data_a[ib0 + i].qs[j + k + 3]) >> shift0) & 3u);
+                        const int32_t a0_packed = c00 | (c01 << 8) | (c02 << 16) | (c03 << 24);
+                        const uint b0_idx = i * QUANT_K + j * 4 + l * 32;
+                        // Field l + 1: same 4 bytes, next 2-bit value; the matching B elements start 32 further on.
+                        const uint shift1 = (l+1) * 2u;
+                        const int c10 = int(((data_a[ib0 + i].qs[j + k]) >> shift1) & 3u);
+                        const int c11 = int(((data_a[ib0 + i].qs[j + k + 1]) >> shift1) & 3u);
+                        const int c12 = int(((data_a[ib0 + i].qs[j + k + 2]) >> shift1) & 3u);
+                        const int c13 = int(((data_a[ib0 + i].qs[j + k + 3]) >> shift1) & 3u);
+                        const int32_t a1_packed = c10 | (c11 << 8) | (c12 << 16) | (c13 << 24);
+                        const uint b1_idx = i * QUANT_K + j * 4 + (l+1) * 32;
+                        // b_offset is applied exactly once, in the flat block index; outer/inner then split that index by 4.
+                        // Not checking for OOB since we're guaranteed to be multiple of 256
+                        const uint b0_block_idx = b_offset + (b_base + b0_idx) / QUANT_K_Q8_1;
+                        const uint b1_block_idx = b_offset + (b_base + b1_idx) / QUANT_K_Q8_1;
+                        const uint b0_block_idx_outer = b0_block_idx / 4;
+                        const uint b1_block_idx_outer = b1_block_idx / 4;
+                        const uint b0_block_idx_inner = b0_block_idx % 4;
+                        const uint b1_block_idx_inner = b1_block_idx % 4;
+                        vec2 ds0 = vec2(data_b[b0_block_idx_outer].ds[b0_block_idx_inner]);
+                        vec2 ds1 = vec2(data_b[b1_block_idx_outer].ds[b1_block_idx_inner]);
+                        // vec_idx (= tid % 8) selects this invocation's 32-bit word among the 8 addressed per inner block.
+                        const uint vec_idx = k / 4;
+                        int32_t b0_packed = data_b[b0_block_idx_outer].qs[b0_block_idx_inner * 8 + vec_idx];
+                        int32_t b1_packed = data_b[b1_block_idx_outer].qs[b1_block_idx_inner * 8 + vec_idx];
+                        // Each product subtracts ds.y / 8; presumably the -1 ternary offset split over the 8 invocations sharing a B block - confirm.
+                        int32_t q0_sum = dotPacked4x8EXT(a0_packed, b0_packed);
+                        int32_t q1_sum = dotPacked4x8EXT(a1_packed, b1_packed);
+                        acc += ACC_TYPE(d * (FLOAT_TYPE(q0_sum) * ds0.x - FLOAT_TYPE(1.0f / 8) * ds0.y));
+                        acc += ACC_TYPE(d * (FLOAT_TYPE(q1_sum) * ds1.x - FLOAT_TYPE(1.0f / 8) * ds1.y));
+                    }
+                }
+            }
+            temp[jcol][n] = acc;
+        }
+    }
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() { // Each workgroup covers NUM_ROWS consecutive output rows; the base row is derived from workgroup x and z.
+    const uint row_base = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    if (row_base + NUM_ROWS <= p.stride_d) { // full tile: pass the constant NUM_ROWS
+        compute_outputs(row_base, NUM_ROWS);
+        return;
+    }
+    if (row_base >= p.stride_d) { // tile starts at or beyond stride_d: nothing to compute
+        return;
+    }
+    compute_outputs(row_base, p.stride_d - row_base); // partial tile: only the remaining rows
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp
@@ -30,7 +30,7 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
         const uint b_block_idx_inner = b_block_idx % 4;
         cache_b_ds = vec2(data_b[b_block_idx_outer].ds[b_block_idx_inner]);
 
-#if QUANT_R == 2
+#if QUANT_R == 2 || QUANT_R == 4
         cache_b_qs[0] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx];
         cache_b_qs[1] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx + 4];
 #else
@@ -40,12 +40,19 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
 
         uint ibi = first_row*p.ncols;
         [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-            const uint a_block_idx = (ibi + col)/QUANT_K + a_offset;
-            ibi += p.ncols;
+            const uint cur_ibi = ibi;
+            const uint a_block_idx = (cur_ibi + col)/QUANT_K + a_offset;
 
             int32_t q_sum = 0;
-#if QUANT_R == 2
+#if QUANT_R == 2 || QUANT_R == 4
+#if defined(DATA_A_TQ2_0)
+            // For TQ2_0 (QUANT_K=256), repack needs the within-block K base to select
+            // the correct half (k<128 or k>=128) and 32-wide quarter. Pass k_base+b_qs_idx.
+            const uint k_base = (cur_ibi + col) % QUANT_K;
+            const i32vec2 data_a_qs = repack(a_block_idx, k_base + b_qs_idx);
+#else
             const i32vec2 data_a_qs = repack(a_block_idx, b_qs_idx);
+#endif
             q_sum += dotPacked4x8EXT(data_a_qs.x,
                                      cache_b_qs[0]);
             q_sum += dotPacked4x8EXT(data_a_qs.y,
@@ -59,11 +66,14 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
                                      cache_b_qs[1]);
 #endif
 
-#if QUANT_AUXF == 1
+#if QUANT_AUXF == 1 && QUANT_R <= 2
+            temp[j][n] += mul_q8_1(q_sum,  get_d(a_block_idx), cache_b_ds, 4);
+#elif QUANT_AUXF == 1 && QUANT_R == 4
             temp[j][n] += mul_q8_1(q_sum,  get_d(a_block_idx), cache_b_ds, 4);
 #else
             temp[j][n] += mul_q8_1(q_sum, get_dm(a_block_idx), cache_b_ds, 4);
 #endif
+            ibi += p.ncols;
         }
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp
@@ -86,7 +86,82 @@ ACC_TYPE mul_q8_1(const int32_t q_sum, const float da, const vec2 dsb, const int
 }
 #endif
 
-#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
+#if defined(DATA_A_TQ2_0)
+i32vec2 repack(uint ib, uint iqs) { // Packs eight 2-bit fields of data_a[ib].qs into two 32-bit words, one field per byte; iqs is used as the position of the first field.
+    const uint k00 = iqs + 0u; // positions iqs .. iqs+3 feed the first word
+    const uint ip00 = ((k00 >> 7) & 1u) * 32u; // adds 32 to the byte index when bit 7 of the position is set
+    const uint b00 = (k00 & 31u) + ip00; // byte index: low 5 bits of the position plus that offset
+    const uint s00 = ((k00 >> 5) & 3u) * 2u; // bits 5..6 of the position choose the 2-bit field (shift 0, 2, 4 or 6)
+
+    const uint k01 = iqs + 1u;
+    const uint ip01 = ((k01 >> 7) & 1u) * 32u;
+    const uint b01 = (k01 & 31u) + ip01;
+    const uint s01 = ((k01 >> 5) & 3u) * 2u;
+
+    const uint k02 = iqs + 2u;
+    const uint ip02 = ((k02 >> 7) & 1u) * 32u;
+    const uint b02 = (k02 & 31u) + ip02;
+    const uint s02 = ((k02 >> 5) & 3u) * 2u;
+
+    const uint k03 = iqs + 3u;
+    const uint ip03 = ((k03 >> 7) & 1u) * 32u;
+    const uint b03 = (k03 & 31u) + ip03;
+    const uint s03 = ((k03 >> 5) & 3u) * 2u;
+
+    const int q00 = int(data_a[ib].qs[b00]); // source bytes for the four positions
+    const int q01 = int(data_a[ib].qs[b01]);
+    const int q02 = int(data_a[ib].qs[b02]);
+    const int q03 = int(data_a[ib].qs[b03]);
+
+    const int t00 = (q00 >> int(s00)) & 3; // raw field value 0..3; no offset is subtracted here
+    const int t01 = (q01 >> int(s01)) & 3;
+    const int t02 = (q02 >> int(s02)) & 3;
+    const int t03 = (q03 >> int(s03)) & 3;
+
+    const int v0 = (t00 & 0xFF) | ((t01 & 0xFF) << 8) | ((t02 & 0xFF) << 16) | ((t03 & 0xFF) << 24); // byte 0 holds position iqs, byte 3 holds iqs+3
+
+
+    const uint k10 = iqs + 16u + 0u; // second word: positions iqs+16 .. iqs+19, decoded the same way
+    const uint ip10 = ((k10 >> 7) & 1u) * 32u;
+    const uint b10 = (k10 & 31u) + ip10;
+    const uint s10 = ((k10 >> 5) & 3u) * 2u;
+
+    const uint k11 = iqs + 16u + 1u;
+    const uint ip11 = ((k11 >> 7) & 1u) * 32u;
+    const uint b11 = (k11 & 31u) + ip11;
+    const uint s11 = ((k11 >> 5) & 3u) * 2u;
+
+    const uint k12 = iqs + 16u + 2u;
+    const uint ip12 = ((k12 >> 7) & 1u) * 32u;
+    const uint b12 = (k12 & 31u) + ip12;
+    const uint s12 = ((k12 >> 5) & 3u) * 2u;
+
+    const uint k13 = iqs + 16u + 3u;
+    const uint ip13 = ((k13 >> 7) & 1u) * 32u;
+    const uint b13 = (k13 & 31u) + ip13;
+    const uint s13 = ((k13 >> 5) & 3u) * 2u;
+
+    const int q10 = int(data_a[ib].qs[b10]);
+    const int q11 = int(data_a[ib].qs[b11]);
+    const int q12 = int(data_a[ib].qs[b12]);
+    const int q13 = int(data_a[ib].qs[b13]);
+
+    const int u10 = (q10 >> int(s10)) & 3;
+    const int u11 = (q11 >> int(s11)) & 3;
+    const int u12 = (q12 >> int(s12)) & 3;
+    const int u13 = (q13 >> int(s13)) & 3;
+
+    const int v1 = (u10 & 0xFF) | ((u11 & 0xFF) << 8) | ((u12 & 0xFF) << 16) | ((u13 & 0xFF) << 24); // byte 0 holds position iqs+16, byte 3 holds iqs+19
+
+    return i32vec2(v0, v1); // x = positions iqs..iqs+3, y = positions iqs+16..iqs+19
+}
+
+ACC_TYPE mul_q8_1(const int32_t q_sum, const float da, const vec2 dsb, const int32_t sum_divisor) { // Returns da * (q_sum * dsb.x - dsb.y / sum_divisor) converted to ACC_TYPE.
+    return ACC_TYPE(da * (float(q_sum) * dsb.x - float(1.0f / sum_divisor) * dsb.y)); // the dsb.y term is subtracted, scaled by 1 / sum_divisor
+}
+#endif
+
+#if defined(DATA_A_TQ2_0) || defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
 FLOAT_TYPE get_d(uint ib) {
     return FLOAT_TYPE(data_a[ib].d);
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
@@ -1368,6 +1368,7 @@ struct block_tq2_0
 #if defined(DATA_A_TQ2_0)
 #define QUANT_K QUANT_K_TQ2_0
 #define QUANT_R QUANT_R_TQ2_0
+#define QUANT_AUXF 1
 #define A_TYPE block_tq2_0
 #endif
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -532,6 +532,10 @@ void process_shaders() {
             string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}}));
             string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
             string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
+        } else if (tname == "tq2_0") {
+            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32", "mul_mat_vec_tq2_0_q.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}}));
+            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup", "mul_mat_vec_tq2_0_q.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
+            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vec_tq2_0_q.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
         }
 #endif
 
@@ -992,7 +996,7 @@ void write_output_files() {
 
     for (const std::string& btype : btypes) {
     for (const auto& tname : type_names) {
-        if (btype == "q8_1" && !is_legacy_quant(tname)) {
+        if (btype == "q8_1" && !is_legacy_quant(tname) && tname != "tq2_0") {
             continue;
         }
         fprintf(hdr, "extern unsigned char *arr_dmmv_%s_%s_f32_data[3];\n", tname.c_str(), btype.c_str());
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -6181,13 +6181,23 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_out_prod(GGML_TYPE_F32, GGML_TYPE_F32, 1024, 256, 4096*40, { 1,  1}, {1, 1}));
 #endif
 
+    //test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 1, 16, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 1, 256, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 2, 256, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 4, 256, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 8, 256, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 32, 32, 256, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 1, 1024, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 2, 1024, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 4, 1024, {1, 1}, {1, 1}));
+
+#if 0
     for (ggml_type type_a : all_types) {
         for (int i = 1; i < 10; ++i) {
             test_cases.emplace_back(new test_mul_mat(type_a,    GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
         }
     }
 
-#if 1
     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
             std::vector<int> ks = { 256 };