@@ -402,7 +402,7 @@ struct ggml_backend_opencl_context {
402402    cl_program program_conv_2d_f16_f32;
403403    cl_program program_tsembd;
404404    cl_program program_mul_mv_id_q4_0_f32_8x_flat;
405-     cl_program program_mul_mv_id_q8_0_f32;
405+     cl_program program_mul_mv_id_q8_0_f32, program_mul_mv_id_q8_0_f32_flat ;
406406    cl_program program_mul_mv_id_mxfp4_f32;
407407    cl_program program_mul_mm_f32_f32_l4_lm;
408408    cl_program program_mul_mm_f16_f32_l4_lm;
@@ -472,7 +472,7 @@ struct ggml_backend_opencl_context {
472472    cl_kernel kernel_conv_2d_f16_f32;
473473    cl_kernel kernel_timestep_embedding;
474474    cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
475-     cl_kernel kernel_mul_mv_id_q8_0_f32;
475+     cl_kernel kernel_mul_mv_id_q8_0_f32, kernel_mul_mv_id_q8_0_f32_flat ;
476476    cl_kernel kernel_mul_mv_id_mxfp4_f32;
477477    cl_kernel kernel_mul_mm_f32_f32_l4_lm;
478478    cl_kernel kernel_mul_mm_f16_f32_l4_lm;
@@ -1766,6 +1766,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
17661766        GGML_LOG_CONT (" ." 
17671767    }
17681768
1769+     //  mul_mv_id_q8_0_f32_flat
1770+     {
1771+ #ifdef  GGML_OPENCL_EMBED_KERNELS
1772+         const  std::string kernel_src {
1773+             #include  " mul_mv_id_q8_0_f32_flat.cl.h" 
1774+         };
1775+ #else 
1776+         const  std::string kernel_src = read_file (" mul_mv_id_q8_0_f32_flat.cl" 
1777+ #endif 
1778+         backend_ctx->program_mul_mv_id_q8_0_f32_flat  =
1779+             build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
1780+ 
1781+         CL_CHECK ((backend_ctx->kernel_mul_mv_id_q8_0_f32_flat  = clCreateKernel (backend_ctx->program_mul_mv_id_q8_0_f32_flat , " kernel_mul_mv_id_q8_0_f32_flat" 
1782+         GGML_LOG_CONT (" ." 
1783+     }
1784+ 
17691785    //  mul_mv_id_mxfp4_f32
17701786    {
17711787#ifdef  GGML_OPENCL_EMBED_KERNELS
@@ -7133,6 +7149,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
71337149
71347150#ifdef  GGML_OPENCL_SOA_Q
71357151    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra ;
7152+     ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra ;
71367153#endif 
71377154
71387155    const  int  ne00 = src0->ne [0 ];
@@ -7221,6 +7238,43 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
72217238            break ;
72227239        }
72237240        case  GGML_TYPE_Q8_0: {
7241+ #ifdef  GGML_OPENCL_SOA_Q
7242+             kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32_flat ;
7243+ 
7244+             if  (backend_ctx->gpu_family  == INTEL) {
7245+                 sgs  = 16 ;
7246+                 nsg  = 2 ;
7247+                 ndst = 4 ;
7248+             } else  if  (backend_ctx->gpu_family  == ADRENO) {
7249+                 sgs  = 64 ;
7250+                 nsg  = 2 ;
7251+                 ndst = 4 ;
7252+             } else  {
7253+                 GGML_ASSERT (false  && " TODO: Unknown GPU" 
7254+             }
7255+ 
7256+             CL_CHECK (clSetKernelArg (kernel,  0 , sizeof (cl_mem),   &extra0_q8_0->q ));
7257+             CL_CHECK (clSetKernelArg (kernel,  1 , sizeof (cl_mem),   &extra0_q8_0->d ));
7258+             CL_CHECK (clSetKernelArg (kernel,  2 , sizeof (cl_mem),   &extra1->data_device ));
7259+             CL_CHECK (clSetKernelArg (kernel,  3 , sizeof (cl_ulong), &offset1));
7260+             CL_CHECK (clSetKernelArg (kernel,  4 , sizeof (cl_mem),   &extra2->data_device ));
7261+             CL_CHECK (clSetKernelArg (kernel,  5 , sizeof (cl_ulong), &offset2));
7262+             CL_CHECK (clSetKernelArg (kernel,  6 , sizeof (cl_mem),   &extrad->data_device ));
7263+             CL_CHECK (clSetKernelArg (kernel,  7 , sizeof (cl_ulong), &offsetd));
7264+             CL_CHECK (clSetKernelArg (kernel,  8 , sizeof (int ),      &ne00));
7265+             CL_CHECK (clSetKernelArg (kernel,  9 , sizeof (int ),      &ne01));
7266+             CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (cl_ulong), &nb01));
7267+             CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (cl_ulong), &nb02));
7268+             CL_CHECK (clSetKernelArg (kernel, 12 , sizeof (int ),      &ne11));
7269+             CL_CHECK (clSetKernelArg (kernel, 13 , sizeof (int ),      &ne12));
7270+             CL_CHECK (clSetKernelArg (kernel, 14 , sizeof (cl_ulong), &nb11));
7271+             CL_CHECK (clSetKernelArg (kernel, 15 , sizeof (cl_ulong), &nb12));
7272+             CL_CHECK (clSetKernelArg (kernel, 16 , sizeof (int ),      &ne20));
7273+             CL_CHECK (clSetKernelArg (kernel, 17 , sizeof (int ),      &ne21));
7274+             CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (cl_ulong), &nb21));
7275+             CL_CHECK (clSetKernelArg (kernel, 19 , sizeof (int ),      &ne0));
7276+             CL_CHECK (clSetKernelArg (kernel, 20 , sizeof (int ),      &ne1));
7277+ #else 
72247278            kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32 ;
72257279
72267280            if  (backend_ctx->gpu_family  == INTEL) {
@@ -7230,7 +7284,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
72307284            } else  if  (backend_ctx->gpu_family  == ADRENO) {
72317285                sgs  = 64 ;
72327286                nsg  = 2 ;
7233-                 ndst = 8 ;
72347287                ndst = 4 ;
72357288            } else  {
72367289                GGML_ASSERT (false  && " TODO: Unknown GPU" 
@@ -7257,7 +7310,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
72577310            CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (cl_ulong), &nb21));
72587311            CL_CHECK (clSetKernelArg (kernel, 19 , sizeof (int ),      &ne0));
72597312            CL_CHECK (clSetKernelArg (kernel, 20 , sizeof (int ),      &ne1));
7260- 
7313+ # endif   //  GGML_OPENCL_SOA_Q 
72617314            break ;
72627315        }
72637316        case  GGML_TYPE_MXFP4: {
0 commit comments