fuse q8_1 quantization in q6_k_tiled_gemv

AD2605 · AD2605 · commit 472c495426bb · 2025-07-09T17:14:15.000+01:00
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3371,7 +3371,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::DMMV);
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
     } else if (use_mul_mat_vec_q) {
-        constexpr bool convert_src1_to_q8_1 = true;
+        bool convert_src1_to_q8_1 = ctx.opt_feature.can_use_intel_builtins ? false : true;
         opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MMVQ);
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
     } else if (use_mul_mat_q) {
diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
@@ -14,9 +14,9 @@
 #include "quants.hpp"
 #include "vecdotq.hpp"
 
-static void q6_k_tiled_gemv(const int8_t * q6_k_low, const int8_t * q6_k_high, const int8_t * q8_1_input,
-                            const int8_t * q6_scales, const sycl::half * q6_k_superblock_scales,
-                            const sycl::half2 * q8_scales, float * output, std::size_t m, std::size_t k,
+static void q6_k_tiled_gemv(const int8_t * q6_k_low, const int8_t * q6_k_high, const float * src1_f32,
+                            const int8_t * q6_scales, const sycl::half * q6_k_superblock_scales, 
+                            float * output, std::size_t m, std::size_t k,
                             dpct::queue_ptr stream) {
     constexpr int     SubgroupSize           = 16;
     constexpr int     tile_height            = 16;
@@ -28,8 +28,8 @@ static void q6_k_tiled_gemv(const int8_t * q6_k_low, const int8_t * q6_k_high, c
     sycl_launch(stream, [&](sycl::handler & cgh) {
         sycl_parallel_for(cgh, sycl::nd_range<1>({ global_range }, { local_range }),
                           [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(SubgroupSize)]] {
-                              [[clang::always_inline]] sycl::q6k_tiled_gemv(q6_k_low, q6_k_high, q8_1_input, output,
-                                                                            q6_scales, q8_scales,
+                              [[clang::always_inline]] sycl::q6k_tiled_gemv(q6_k_low, q6_k_high, src1_f32, output,
+                                                                            q6_scales,
                                                                             q6_k_superblock_scales, m, k, it);
                           });
     });
@@ -1005,6 +1005,7 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
         const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
         const char * src1_ddq_i_bs     = src1_ddq_i + src1_ddq_i_offset;
         float *      dst_dd_i_bs       = dst_dd_i + i * dst->ne[0];
+        const float*       src1_ddfi_row     = src1_ddf_i + i * src1_padded_col_size;
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
                 if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
@@ -1060,9 +1061,8 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
                         auto q6_h_ptr          = q6_l_ptr + (QK_K / 2) * num_q6_blocks;
                         auto scales_u8_q6_k    = q6_h_ptr + (QK_K / 4) * num_q6_blocks;
                         auto scales_q6_k_superblock      = (sycl::half*)(scales_u8_q6_k  + num_q6_blocks * (QK_K / 16));
-                        auto q8_1_input        = (int8_t *) src1_ddq_i_bs;
-                        auto q8_1_input_scales = (sycl::half2 *) (q8_1_input + k);
-                        q6_k_tiled_gemv(q6_l_ptr, q6_h_ptr, q8_1_input, scales_u8_q6_k, scales_q6_k_superblock, q8_1_input_scales, dst_dd_i_bs, m, k,
+                        auto src_1_f32        = src1_ddfi_row;
+                        q6_k_tiled_gemv(q6_l_ptr, q6_h_ptr, src_1_f32, scales_u8_q6_k, scales_q6_k_superblock, dst_dd_i_bs, m, k,
                                         stream);
                     } else {
                         GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q6_k_q8_1_sycl\n");
@@ -1107,6 +1107,5 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
     }
     GGML_UNUSED(src1);
     GGML_UNUSED(dst);
-    GGML_UNUSED(src1_ddf_i);
     GGML_UNUSED(ctx);
 }
diff --git a/ggml/src/ggml-sycl/q6_k_tiled_gemv.hpp b/ggml/src/ggml-sycl/q6_k_tiled_gemv.hpp
@@ -4,20 +4,47 @@
 #include <sys/types.h>
 
 #include <cstdint>
-#include <sycl/aliases.hpp>
-#include <sycl/ext/oneapi/experimental/root_group.hpp>
-#include <sycl/functional.hpp>
-#include <sycl/group_algorithm.hpp>
-#include <sycl/nd_item.hpp>
+#include <tuple>
+
 #include <sycl/sycl.hpp>
-#include <sycl/vector.hpp>
 
 #include "builtins.hpp"
 #include "cacheopts.hpp"
 #include "ggml-quants.h"
 #include "ggml-sycl/dpct/helper.hpp"
 
-#define sycl_print sycl::ext::oneapi::experimental::printf
+// Quantizes the 4 fp32 values held by this work-item to int8 and packs them, byte i = value i,
+// into one 32-bit word. Lanes 0-7 and lanes 8-15 each share one scale (abs-max / 127 over the
+// half sub-group). Returns { packed quants, scale }; the scale is 0 when the half is all zeros.
+// Assumes a 16-wide sub-group; every lane must call this, as it performs group reductions.
+__attribute__((always_inline)) inline std::tuple<int, float> quantize_and_pack_input(
+    const sycl::vec<float, 4> & loaded_fp32_vals, int wi_id_in_sg, sycl::sub_group & sg) {
+    float amax = 0;
+#pragma unroll(4)
+    for (int i = 0; i < 4; i++) {
+        amax = sycl::fmax(amax, sycl::fabs(loaded_fp32_vals[i]));
+    }
+    // Both reductions span the whole sub-group, so a lane contributes 0 to the half it is not in.
+    // Split at lane 8: comparing against 7 here would leak lane 7's maximum into the 8-15 scale.
+    const bool  in_upper_half = wi_id_in_sg > 7;
+    const float abs_max_0_7   = sycl::reduce_over_group(sg, in_upper_half ? 0.0f : amax, sycl::maximum<float>());
+    const float abs_max_8_15  = sycl::reduce_over_group(sg, in_upper_half ? amax : 0.0f, sycl::maximum<float>());
+    const float amax_value    = in_upper_half ? abs_max_8_15 : abs_max_0_7;
+
+    // Divide by 1 for an all-zero half so the quantization below never divides by zero.
+    const float divisor = amax_value == 0 ? 1 : amax_value / 127;
+
+    int packed_quants = 0;
+#pragma unroll(4)
+    for (int i = 0; i < 4; i++) {
+        const int8_t quantized_value = static_cast<int8_t>(sycl::round(loaded_fp32_vals[i] / divisor));
+        // Go through uint8_t so a negative quant does not sign-extend over its neighbours.
+        packed_quants |= static_cast<uint32_t>(static_cast<uint8_t>(quantized_value)) << (8 * i);
+    }
+    // Report a zero scale for an all-zero half.
+    return { packed_quants, amax_value == 0 ? 0.0f : divisor };
+}
+
 //
 /**
  * @brief This function packs 4 q6_k quants in a 32 bit value.
@@ -32,21 +59,23 @@ __attribute__((always_inline)) inline int pack_q6_k(const short & low_bits, cons
 // TODO: Reduce the number of brackets by checking the precedence order :)
 #pragma unroll(4)
     for (uint8_t i = 0; i < 4; i++) {
-        uint16_t   mask_low_bits  = (0x000F) << (4 * i);
-        uint8_t  mask_high_bits = (0x3) << (2 * i);
-        uint8_t desired_low_bits = (low_bits & mask_low_bits) >> (4 * i);
-        uint8_t desired_high_bits = ((high_bits & mask_high_bits) >> (2 * i)) << 4;
-        int8_t full_value = static_cast<int8_t>(desired_high_bits | desired_low_bits);
-        full_value = sycl::sub_sat(full_value, (int8_t)32);
+        uint16_t mask_low_bits     = (0x000F) << (4 * i);
+        uint8_t  mask_high_bits    = (0x3) << (2 * i);
+        uint8_t  desired_low_bits  = (low_bits & mask_low_bits) >> (4 * i);
+        uint8_t  desired_high_bits = ((high_bits & mask_high_bits) >> (2 * i)) << 4;
+        int8_t   full_value        = static_cast<int8_t>(desired_high_bits | desired_low_bits);
+        full_value                 = sycl::sub_sat(full_value, (int8_t) 32);
         packed_q6_k |= (static_cast<uint32_t>(static_cast<uint8_t>(full_value)) << (8 * i));
     }
     return packed_q6_k;
 }
 
 namespace sycl {
-__attribute__((always_inline)) inline void q6k_tiled_gemv(
-    const int8_t * q6_k_l, const int8_t * q6_k_h, const int8_t * q8_1, float * result, const int8_t * q6_u8_bit_scales,
-    const sycl::half2 * q8_dm_scales, const sycl::half * q6_k_superblock_scale, int m, int k, const nd_item<1> & it) {
+__attribute__((always_inline)) inline void q6k_tiled_gemv(const int8_t * q6_k_l, const int8_t * q6_k_h,
+                                                          const float * q8_1, float * result,
+                                                          const int8_t *     q6_u8_bit_scales,
+                                                          const sycl::half * q6_k_superblock_scale, int m, int k,
+                                                          const nd_item<1> & it) {
     // Performs a (m x k ) X (k x 1) GEMM
     // Each subgroup is responsible for 16 output elements.
 
@@ -69,18 +98,17 @@ __attribute__((always_inline)) inline void q6k_tiled_gemv(
     auto sg_id             = it.get_group(0) * num_sgs_in_wg + sg.get_group_id();
     auto wi_id_in_sg       = sg.get_local_linear_id();
 
-    auto       q6_k_l_width       = ((k / 2 - 1) * sizeof(int8_t));  // as we have 2 4 bit values packed in an int8_t;
-    auto       q6_k_h_width       = ((k / 4 - 1) * sizeof(int8_t));  // as we have 4 2 bit values packed in an int8_t;
-    auto       q8_1_width         = (k - 1) * sizeof(int8_t);
-    auto       result_width       = (m - 1) * sizeof(float);
-    auto       q6_u8_scale_width  = ((k / QK_K) * 16 - 1) * sizeof(int8_t);
+    auto       q6_k_l_width      = ((k / 2 - 1) * sizeof(int8_t));  // as we have 2 4 bit values packed in an int8_t;
+    auto       q6_k_h_width      = ((k / 4 - 1) * sizeof(int8_t));  // as we have 4 2 bit values packed in an int8_t;
+    auto       result_width      = (m - 1) * sizeof(float);
+    auto       q6_u8_scale_width = ((k / QK_K) * 16 - 1) * sizeof(int8_t);
     auto       super_block_scale_width = (m - 1) * sizeof(sycl::half);
     const auto num_blocks_per_row      = k / QK_K;
 
-    const int              tiles_required = m / tile_height;
-    sycl::vec<float, 16>   accumulator;
-    vector_types::char16   q6_u8_scales_vals;
-    sycl::half             super_block_scale;
+    const int            tiles_required = m / tile_height;
+    sycl::vec<float, 16> accumulator;
+    vector_types::char16 q6_u8_scales_vals;
+    sycl::half           super_block_scale;
 
     for (; sg_id < tiles_required; sg_id += num_sgs_in_kernel) {
         auto h_coord = sg_id * tile_height;
@@ -99,11 +127,11 @@ __attribute__((always_inline)) inline void q6k_tiled_gemv(
             auto super_block_scale_loaded = __builtin_IB_subgroup_block_read_flat_u16_m1k16v1(
                 (intptr_t) q6_k_superblock_scale, super_block_scale_width, num_blocks_per_row - 1,
                 super_block_scale_width, vector_types::uint2{ (uint) (h_coord), (uint) i });
-            super_block_scale = *reinterpret_cast<sycl::half*>(&super_block_scale_loaded);
-            
+            super_block_scale = *reinterpret_cast<sycl::half *>(&super_block_scale_loaded);
+
             auto element_width_offset = i * QK_K;
-            auto q6_l_w_coord_start = i * (QK_K / 2);
-            auto q6_h_w_coord_start = i * (QK_K / 4);
+            auto q6_l_w_coord_start   = i * (QK_K / 2);
+            auto q6_h_w_coord_start   = i * (QK_K / 4);
 
 #    pragma unroll(4)
             for (int j = 0; j < QK_K; j += tile_width) {
@@ -115,21 +143,22 @@ __attribute__((always_inline)) inline void q6k_tiled_gemv(
                     (intptr_t) (q6_k_h), q6_k_h_width, m - 1, q6_k_h_width,
                     vector_types::uint2{ (uint) (q6_h_w_coord_start + j / 4), (uint) h_coord });
 
-                int packed_q8_1_vals = __builtin_IB_subgroup_block_read_flat_u8_m1k64v1(
-                    (intptr_t) (q8_1), q8_1_width, 0, q8_1_width,
-                    vector_types::uint2{ (uint) (element_width_offset + j), (uint) 0 });
+                auto loaded_fp32_vals = *reinterpret_cast<const sycl::vec<float, 4> *>(q8_1 + element_width_offset + j);
+                // int packed_q8_1_vals = __builtin_IB_subgroup_block_read_flat_u8_m1k64v1(
+                //     (intptr_t) (q8_1), q8_1_width, 0, q8_1_width,
+                //     vector_types::uint2{ (uint) (element_width_offset + j), (uint) 0 });
 
-                sycl::half2 q8_dm_val =
-                    q8_dm_scales[element_width_offset / QK8_1 + j / QK8_1 + (wi_id_in_sg * 4) / QK8_1];
+                auto [packed_q8_1_vals, q8_scale_fp32] = quantize_and_pack_input(loaded_fp32_vals, wi_id_in_sg, sg);
 
 #    pragma unroll(16)
                 for (uint8_t l = 0; l < 16; l++) {
                     int packed_q6_k_vals = pack_q6_k(q6_low_bits[l], q6_high_bits[l]);
                     int dp4a_val = __builtin_IB_dp4a_ss(0, packed_q6_k_vals, packed_q8_1_vals, dp4a_with_saturation);
                     sycl::half q6_super_block_value = sycl::select_from_group(sg, super_block_scale, l);
-                    int8_t     q6_block_scale_val   = sycl::select_from_group(sg, q6_u8_scales_vals[l], j / 16 + (wi_id_in_sg ) / 4);
+                    int8_t     q6_block_scale_val =
+                        sycl::select_from_group(sg, q6_u8_scales_vals[l], j / 16 + (wi_id_in_sg) / 4);
                     accumulator[l] += dp4a_val * static_cast<float>(q6_super_block_value) *
-                                      static_cast<float>(q6_block_scale_val) * static_cast<float>(q8_dm_val[0]);
+                                      static_cast<float>(q6_block_scale_val) * q8_scale_fp32;
                 }
             }
         }