working q8 reorder commit

AD2605 · AD2605 · commit acd80eca6359 · 2025-05-27T10:29:41.000+01:00
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -1418,19 +1418,8 @@ template <int ElementsPerWI>
 static __dpct_inline__ void quantize_and_reorder_q8_1(const float * __restrict__ x, void * reordered_q8_tensor,
                                                       const int kx, const int kx_padded, const sycl::nd_item<1> & it) {
     /*
-        quantize and reorders the resultant q8 tensor in a per row fashion
-        Each sub-group calculates one quant block
-        work_group_size = sub_group_size;
-
-        |------------------------------ Matrix Pitch -------------------------|
-        |------- Matrix Width --------|
-        q_00 q_01 q_02 ..... q_0n-1 q_n ds00 ds01 ... ds0n/32 ... padding ... |
-        .                             .                                       |
-        .                             .                                       |
-        .                             .                                   Matrix Height
-        .                             .                                       |
-        .                             .                                       |
-        q_n0 q_n1 q_n2 ..... q_nn-1 q_n dsn0 dsn1 ... dsnn/32 ... padding ... | 
+        Quantizes and reorders the resultant q8 tensor in a per-row fashion.
+        Each sub-group calculates one quant block, i.e. QK8_1 quant values and the d and sum values.
     */
 
     auto subgroup_id = it.get_group(0);
diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
@@ -40,7 +40,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
         // Y block index that aligns with ibx
         const int iby = i * block_type::block_to_q8_1_ratio();
         const int8_t* q8_1_quant_ptr = (const int8_t*)vy + iby * QK8_1;
-        sycl::half2 q8_1_ds_ptr = *(const sycl::half2*)((const char*)vy + ncols + iby * sizeof(sycl::half2)); 
+        const sycl::half2* q8_1_ds_ptr = (const sycl::half2*)((const char*)vy + ncols + iby * sizeof(sycl::half2)); 
 
 #pragma unroll
         for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -285,7 +285,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
     }
 
     __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
-                     const int8_t* q8_1_quant_ptr, const sycl::half2& q8_1_ds, const int & iqs, int /* nblocks */) {
+                     const int8_t* q8_1_quant_ptr, const sycl::half2* q8_1_ds, const int & iqs, int /* nblocks */) {
         const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
         const ggml_half d     = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
         int             v[q4_0_traits::vdr_mmvq];
@@ -299,7 +299,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
             u[2 * i + 1] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i + q4_0_traits::qi);
         }
 
-        return vec_dot_q4_0_q8_1_impl(v, u, d, q8_1_ds);
+        return vec_dot_q4_0_q8_1_impl(v, u, d, *q8_1_ds);
     };
 };
 
@@ -347,7 +347,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
     using q4_k_traits = typename q4_k_block::traits;
 
     float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
-                     const int8_t* q8_1_quant_ptr, const sycl::half2& q8_1_ds, const int & iqs, int nblocks) {
+                     const int8_t* q8_1_quant_ptr, const sycl::half2* q8_1_ds, const int & iqs, int nblocks) {
         const int ib = ibx_offset / (QK_K / 2);
 
         const uint8_t *    base           = static_cast<const uint8_t *>(vbq);
@@ -360,7 +360,38 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
         const int *      q4         = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
         const uint16_t * scales     = (const uint16_t *) scs;
 
-        return vec_dot_q4_K_q8_1_common(q4, scales, *dms, bq8_1, iqs);
+        int   v[2];
+        int   u[2 * QR4_K];
+        float d8[QR4_K];
+
+        v[0] = q4[0];
+        v[1] = q4[4];
+
+        uint16_t  aux[2];
+        const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
+        if (j < 2) {
+            aux[0] = scales[j + 0] & 0x3f3f;
+            aux[1] = scales[j + 2] & 0x3f3f;
+        } else {
+            aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
+            aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
+        }
+
+        const uint8_t * sc = (const uint8_t *) aux;
+        const uint8_t * m  = sc + 2;
+
+        for (int i = 0; i < QR4_K; ++i) {
+            const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
+            sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);
+
+            d8[i]                   = ds_values[0];
+
+            const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);
+            u[2 * i + 0]   = q8[0];
+            u[2 * i + 1]   = q8[4];
+        }
+
+        return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, *dms, d8);
     }
 };