(wip) Vulkan: Adreno Q6_K fix

Italo Nicola · Italo Nicola · commit c5b71626f10c · 2025-09-30T14:02:50.000-03:00
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -240,14 +240,14 @@ enum FaHeadSizes {
 };
 
 // XXX: Use value queried from the driver
-#if 0
+#if 1
 const uint64_t MAX_ADDRESS_SPACE_SIZE = 1 << 27;
 const uint64_t MAX_ADDRESS_SPACE_SIZE_MUL_MAT = 1 << 27;
 const uint64_t MAX_ADDRESS_SPACE_SIZE_OUT_PROD = 1 << 27;
 #else
-const uint64_t MAX_ADDRESS_SPACE_SIZE = 1 << 27;
-const uint64_t MAX_ADDRESS_SPACE_SIZE_MUL_MAT = 1 << 27;
-const uint64_t MAX_ADDRESS_SPACE_SIZE_OUT_PROD = 1 << 27;
+const uint64_t MAX_ADDRESS_SPACE_SIZE = 1 << 26;
+const uint64_t MAX_ADDRESS_SPACE_SIZE_MUL_MAT = 1 << 26;
+const uint64_t MAX_ADDRESS_SPACE_SIZE_OUT_PROD = 1 << 26;
 #endif
 
 static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
@@ -4415,11 +4415,13 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
-    VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
+#if 0
+    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
     for (auto& buffer : descriptor_buffer_infos) {
         std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
     }
-    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
+    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
+#endif
     GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
     GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
 
@@ -5439,6 +5441,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
          d_sz * ne12 * ne13 >= tiling_threshold);
 #endif
 
+    if (tiling_debug) { // debug trace: prints do_tiling and the summed x/y/d sizes next to tiling_threshold
+        fprintf(stderr, "tiling enabled ? %d (%llu > %llu ?)\n", do_tiling, static_cast<unsigned long long>(x_sz * ne02 * ne03 + y_sz * ne12 * ne13 + d_sz * ne12 * ne13), static_cast<unsigned long long>(tiling_threshold)); // %llu + casts: %lu is only 32-bit on LLP64/32-bit targets
+    }
+
     // XXX
     bool do_splitting = false;
 #if 0
@@ -5980,7 +5986,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
 
     if (ne01 > max_groups_x) {
         groups_z = 64;
+        //groups_z = 96;
         groups_x = CEIL_DIV(groups_x, groups_z);
+        GGML_ASSERT(groups_x <= max_groups_x); // after the z-split the x count must fit the limit; equality is allowed, consistent with the `ne01 > max_groups_x` guard above (assumes max_groups_x is an inclusive bound — confirm)
     }
 
     // compute
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
@@ -1,4 +1,5 @@
 #extension GL_EXT_control_flow_attributes : enable
+//#extension GL_EXT_integer_dot_product : require
 #extension GL_EXT_shader_16bit_storage : require
 #extension GL_EXT_shader_8bit_storage : require
 
@@ -10,7 +11,7 @@
 
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-#if !defined(DATA_A_Q8_0) && !defined(DATA_A_Q4_0) && !defined(DATA_A_Q4_1)
+#if !defined(DATA_A_Q8_0) && !defined(DATA_A_Q4_0) && !defined(DATA_A_Q4_1) && !defined(DATA_A_Q6_K)
 layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];};
 layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
 #endif
@@ -94,15 +95,15 @@ shared FLOAT_TYPE tmpsh[NUM_COLS][NUM_ROWS][BLOCK_SIZE];
 
 void reduce_result(const in FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
     // sum up partial sums and write back result
-    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
-        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+    for (uint j = 0; j < NUM_COLS; ++j) {
+        for (uint n = 0; n < num_rows; ++n) {
             tmpsh[j][n][tid] = temp[j][n];
         }
     }
     barrier();
-    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
+    for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
         if (tid < s) {
-            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+            for (uint j = 0; j < NUM_COLS; ++j) {
                 [[unroll]] for (uint n = 0; n < num_rows; ++n) {
                     tmpsh[j][n][tid] += tmpsh[j][n][tid + s];
                 }
@@ -111,8 +112,8 @@ void reduce_result(const in FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32
         barrier();
     }
     if (tid == 0) {
-        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
-            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        for (uint j = 0; j < NUM_COLS; ++j) {
+            for (uint n = 0; n < num_rows; ++n) {
                 data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(tmpsh[j][n][0]);
             }
         }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
@@ -14,7 +14,7 @@ uint csel = 0;
 void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
     const uint y_idx = i * QUANT_K + y_offset;
 
-    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+    for (uint n = 0; n < num_rows; ++n) {
         const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
         csel ^= 1;
 
@@ -27,15 +27,39 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
                 continue;
         }
 
-        const uint32_t ql0_u32 =  uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
-        const uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
+#if 0
+        const uint32_t ql0_u32 = 
+            uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) |
+            (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
+        const uint32_t ql32_u32 =
+            uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) |
+            (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
+        const uint32_t qh_u32 =
+            uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) |
+            (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
+#else
+        const uint32_t ql0_u32 = 
+            uint32_t(data_a[ib0 + i].ql[ql_offset]) |
+            (uint32_t(data_a[ib0 + i].ql[ql_offset + 1]) << 8) |
+            (uint32_t(data_a[ib0 + i].ql[ql_offset + 2]) << 16) |
+            (uint32_t(data_a[ib0 + i].ql[ql_offset + 3]) << 24);
+        const uint32_t ql32_u32 =
+            uint32_t(data_a[ib0 + i].ql[ql_offset  + 32]) |
+            (uint32_t(data_a[ib0 + i].ql[ql_offset + 33]) << 8) |
+            (uint32_t(data_a[ib0 + i].ql[ql_offset + 34]) << 16) |
+            (uint32_t(data_a[ib0 + i].ql[ql_offset + 35]) << 24);
+        const uint32_t qh_u32 =
+            uint32_t(data_a[ib0 + i].qh[qh_offset + 0]) |
+            (uint32_t(data_a[ib0 + i].qh[qh_offset + 1]) << 8) |
+            (uint32_t(data_a[ib0 + i].qh[qh_offset + 2]) << 16) |
+            (uint32_t(data_a[ib0 + i].qh[qh_offset + 3]) << 24);
+#endif
 
         const uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
         const uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
         const uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
         const uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
 
-        const uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
         const uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
         const uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
         const uint32_t qh4_u32 = (qh_u32 & 0x30303030);
@@ -46,10 +70,17 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
         const uint32_t q2_u32 = ql0_u32_hi4  | qh4_u32;
         const uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
 
+#if 0
         const vec4 q0 = vec4(unpack8(q0_u32)) - 32;
         const vec4 q1 = vec4(unpack8(q1_u32)) - 32;
         const vec4 q2 = vec4(unpack8(q2_u32)) - 32;
         const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
+#else
+        const vec4 q0 = vec4(float(q0_u32 & 0xFF), float((q0_u32 >> 8) & 0xFF), float((q0_u32 >> 16) & 0xFF), float(q0_u32 >> 24)) - 32;
+        const vec4 q1 = vec4(float(q1_u32 & 0xFF), float((q1_u32 >> 8) & 0xFF), float((q1_u32 >> 16) & 0xFF), float(q1_u32 >> 24)) - 32;
+        const vec4 q2 = vec4(float(q2_u32 & 0xFF), float((q2_u32 >> 8) & 0xFF), float((q2_u32 >> 16) & 0xFF), float(q2_u32 >> 24)) - 32;
+        const vec4 q3 = vec4(float(q3_u32 & 0xFF), float((q3_u32 >> 8) & 0xFF), float((q3_u32 >> 16) & 0xFF), float(q3_u32 >> 24)) - 32;
+#endif
 
         if (all_threads) {
             sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
@@ -58,14 +89,38 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
 
         const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
 
-        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        for (uint j = 0; j < NUM_COLS; ++j) {
+
+#if 0
             vec4 by0  = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4     ]);
             vec4 by32 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 +  8]);
             vec4 by64 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16]);
             vec4 by96 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24]);
+#else
+            vec4 by0  =
+                vec4(data_b[(j*p.batch_stride_b + b_offset + y_idx) + 0],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 1],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 2],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 3]);
+            vec4 by32  =
+                vec4(data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 8],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 8 + 1],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 8 + 2],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 8 + 3]);
+            vec4 by64  =
+                vec4(data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 16],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 16 + 1],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 16 + 2],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 16 + 3]);
+            vec4 by96  =
+                vec4(data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 24],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 24 + 1],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 24 + 2],
+                     data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 24 + 3]);
+#endif
 
             FLOAT_TYPE sum[4] = {0, 0, 0, 0};
-            [[unroll]] for (uint l = 0; l < 4; ++l) {
+            for (uint l = 0; l < 4; ++l) {
                 sum[0] = fma(FLOAT_TYPE(by0[l]), q0[l], sum[0]);
                 sum[1] = fma(FLOAT_TYPE(by32[l]), q1[l], sum[1]);
                 sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]);
@@ -99,16 +154,16 @@ void compute_outputs(const uint first_row, const uint num_rows) {
     const uint s_offset  =  8*v_im + is;
     const uint y_offset = 128*v_im + l0;
 
-    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
-        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+    for (uint j = 0; j < NUM_COLS; ++j) {
+        for (uint i = 0; i < NUM_ROWS; ++i) {
             temp[j][i] = FLOAT_TYPE(0);
         }
     }
 
     const uint nbr_par_th = num_blocks_per_row%it_size;
     const uint nbr_all_th = num_blocks_per_row - nbr_par_th;
     uint i0 = 0;
-    [[unroll]] for (; i0 < nbr_all_th; i0 += it_size)
+    for (; i0 < nbr_all_th; i0 += it_size)
         calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, true);
     calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, false);
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
@@ -347,7 +347,7 @@ struct block_q6_K_packed16
 #if defined(DATA_A_Q6_K)
 #define QUANT_K QUANT_K_Q6_K
 #define A_TYPE block_q6_K
-#define A_TYPE_PACKED16 block_q6_K_packed16
+//#define A_TYPE_PACKED16 block_q6_K_packed16
 #endif
 
 // IQuants
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -4927,7 +4927,7 @@ static const ggml_type all_types[] = {
     GGML_TYPE_Q8_0,
 //    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
 //    GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
-//    GGML_TYPE_Q6_K,
+    GGML_TYPE_Q6_K,
     // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
 //    GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
 //   GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
@@ -4950,7 +4950,7 @@ static const ggml_type other_types[] = {
     GGML_TYPE_Q8_0,
 //    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
 //    GGML_TYPE_Q5_K,
-//    GGML_TYPE_Q6_K,
+    GGML_TYPE_Q6_K,
     // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
 //    GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
 //    GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
@@ -5303,6 +5303,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_out_prod(GGML_TYPE_F32, GGML_TYPE_F32, 1024, 256, 4096*40, {1,  1}, {1, 1}));
     test_cases.emplace_back(new test_out_prod(GGML_TYPE_Q8_0, GGML_TYPE_F32, 1024, 256, 4096*40, {1,  1}, {1, 1}));
 
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q6_K, GGML_TYPE_F32, 151936, 1, 1024, {1,  1}, {1, 1}));
+
 #if 0
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 2, 2, 32, {1,  1}, {1, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 4, 4, 32, {1,  1}, {1, 1}));