@@ -402,7 +402,7 @@ struct ggml_backend_opencl_context {
402402 cl_program program_conv_2d_f16_f32;
403403 cl_program program_tsembd;
404404 cl_program program_mul_mv_id_q4_0_f32_8x_flat;
405- cl_program program_mul_mv_id_q8_0_f32;
405+ cl_program program_mul_mv_id_q8_0_f32, program_mul_mv_id_q8_0_f32_flat;
406406 cl_program program_mul_mv_id_mxfp4_f32;
407407 cl_program program_mul_mm_f32_f32_l4_lm;
408408 cl_program program_mul_mm_f16_f32_l4_lm;
@@ -472,7 +472,7 @@ struct ggml_backend_opencl_context {
472472 cl_kernel kernel_conv_2d_f16_f32;
473473 cl_kernel kernel_timestep_embedding;
474474 cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
475- cl_kernel kernel_mul_mv_id_q8_0_f32;
475+ cl_kernel kernel_mul_mv_id_q8_0_f32, kernel_mul_mv_id_q8_0_f32_flat;
476476 cl_kernel kernel_mul_mv_id_mxfp4_f32;
477477 cl_kernel kernel_mul_mm_f32_f32_l4_lm;
478478 cl_kernel kernel_mul_mm_f16_f32_l4_lm;
@@ -1766,6 +1766,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
17661766 GGML_LOG_CONT (" ." );
17671767 }
17681768
1769+    // mul_mv_id_q8_0_f32_flat
1770+    {
1771+#ifdef GGML_OPENCL_EMBED_KERNELS
1772+        const std::string kernel_src {
1773+#include "mul_mv_id_q8_0_f32_flat.cl.h"
1774+        };
1775+#else
1776+        const std::string kernel_src = read_file("mul_mv_id_q8_0_f32_flat.cl");
1777+#endif
1778+        backend_ctx->program_mul_mv_id_q8_0_f32_flat =
1779+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1780+
1781+        CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32_flat, "kernel_mul_mv_id_q8_0_f32_flat", &err), err));
1782+        GGML_LOG_CONT(".");
1783+    }
1784+
17691785 // mul_mv_id_mxfp4_f32
17701786 {
17711787#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -7136,6 +7152,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
71367152
71377153#ifdef GGML_OPENCL_SOA_Q
71387154 ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra ;
7155+    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
71397156#endif
71407157
71417158 const int ne00 = src0->ne [0 ];
@@ -7224,6 +7241,43 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
72247241 break ;
72257242 }
72267243 case GGML_TYPE_Q8_0: {
7244+#ifdef GGML_OPENCL_SOA_Q
7245+            kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32_flat;
7246+
7247+            if (backend_ctx->gpu_family == INTEL) {
7248+                sgs  = 16;
7249+                nsg  = 2;
7250+                ndst = 4;
7251+            } else if (backend_ctx->gpu_family == ADRENO) {
7252+                sgs  = 64;
7253+                nsg  = 2;
7254+                ndst = 4;
7255+            } else {
7256+                GGML_ASSERT(false && "TODO: Unknown GPU");
7257+            }
7258+
7259+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
7260+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
7261+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
7262+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
7263+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
7264+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
7265+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
7266+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
7267+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
7268+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
7269+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
7270+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
7271+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne11));
7272+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne12));
7273+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
7274+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
7275+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne20));
7276+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne21));
7277+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
7278+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne0));
7279+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne1));
7280+#else
72277281 kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32 ;
72287282
72297283 if (backend_ctx->gpu_family == INTEL) {
@@ -7233,7 +7287,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
72337287 } else if (backend_ctx->gpu_family == ADRENO) {
72347288 sgs = 64 ;
72357289 nsg = 2 ;
7236- ndst = 8 ;
72377290 ndst = 4 ;
72387291 } else {
72397292 GGML_ASSERT (false && " TODO: Unknown GPU" );
@@ -7260,7 +7313,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
72607313 CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (cl_ulong), &nb21));
72617314 CL_CHECK (clSetKernelArg (kernel, 19 , sizeof (int ), &ne0));
72627315 CL_CHECK (clSetKernelArg (kernel, 20 , sizeof (int ), &ne1));
7263-
7316+#endif // GGML_OPENCL_SOA_Q
72647317 break ;
72657318 }
72667319 case GGML_TYPE_MXFP4: {
0 commit comments