@@ -403,7 +403,7 @@ struct ggml_backend_opencl_context {
403403 cl_program program_conv_2d_f16_f32;
404404 cl_program program_tsembd;
405405 cl_program program_mul_mv_id_q4_0_f32_8x_flat;
406- cl_program program_mul_mv_id_q8_0_f32;
406+ cl_program program_mul_mv_id_q8_0_f32, program_mul_mv_id_q8_0_f32_flat;
407407 cl_program program_mul_mv_id_mxfp4_f32;
408408 cl_program program_mul_mv_id_mxfp4_f32_flat;
409409 cl_program program_mul_mm_f32_f32_l4_lm;
@@ -475,7 +475,7 @@ struct ggml_backend_opencl_context {
475475 cl_kernel kernel_conv_2d_f16_f32;
476476 cl_kernel kernel_timestep_embedding;
477477 cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
478- cl_kernel kernel_mul_mv_id_q8_0_f32;
478+ cl_kernel kernel_mul_mv_id_q8_0_f32, kernel_mul_mv_id_q8_0_f32_flat;
479479 cl_kernel kernel_mul_mv_id_mxfp4_f32;
480480 cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
481481 cl_kernel kernel_mul_mm_f32_f32_l4_lm;
@@ -1788,6 +1788,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
17881788 GGML_LOG_CONT (" ." );
17891789 }
17901790
1791+     // mul_mv_id_q8_0_f32_flat
1792+     {
1793+ #ifdef GGML_OPENCL_EMBED_KERNELS
1794+         const std::string kernel_src {
1795+ #include "mul_mv_id_q8_0_f32_flat.cl.h"
1796+         };
1797+ #else
1798+         const std::string kernel_src = read_file("mul_mv_id_q8_0_f32_flat.cl");
1799+ #endif
1800+         backend_ctx->program_mul_mv_id_q8_0_f32_flat =
1801+             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1802+
1803+         CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32_flat, "kernel_mul_mv_id_q8_0_f32_flat", &err), err));
1804+         GGML_LOG_CONT(".");
1805+     }
1806+
17911807 // mul_mv_id_mxfp4_f32
17921808 {
17931809#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -7397,6 +7413,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
73977413#ifdef GGML_OPENCL_SOA_Q
73987414 ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra ;
73997415 ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra ;
7416+     ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
74007417#endif
74017418
74027419 const int ne00 = src0->ne [0 ];
@@ -7485,6 +7502,43 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
74857502 break ;
74867503 }
74877504        case GGML_TYPE_Q8_0: {
7505+ #ifdef GGML_OPENCL_SOA_Q
7506+             kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32_flat;
7507+
7508+             if (backend_ctx->gpu_family == INTEL) {
7509+                 sgs  = 16;
7510+                 nsg  = 2;
7511+                 ndst = 4;
7512+             } else if (backend_ctx->gpu_family == ADRENO) {
7513+                 sgs  = 64;
7514+                 nsg  = 2;
7515+                 ndst = 4;
7516+             } else {
7517+                 GGML_ASSERT(false && "TODO: Unknown GPU");
7518+             }
7519+
7520+             CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
7521+             CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
7522+             CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
7523+             CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
7524+             CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
7525+             CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
7526+             CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
7527+             CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
7528+             CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
7529+             CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
7530+             CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
7531+             CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
7532+             CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne11));
7533+             CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne12));
7534+             CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
7535+             CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
7536+             CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne20));
7537+             CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne21));
7538+             CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
7539+             CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne0));
7540+             CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne1));
7541+ #else
74887542 kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32 ;
74897543
74907544 if (backend_ctx->gpu_family == INTEL) {
@@ -7494,7 +7548,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
74947548 } else if (backend_ctx->gpu_family == ADRENO) {
74957549 sgs = 64 ;
74967550 nsg = 2 ;
7497- ndst = 8 ;
74987551 ndst = 4 ;
74997552 } else {
75007553 GGML_ASSERT (false && " TODO: Unknown GPU" );
@@ -7521,7 +7574,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
75217574 CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (cl_ulong), &nb21));
75227575 CL_CHECK (clSetKernelArg (kernel, 19 , sizeof (int ), &ne0));
75237576 CL_CHECK (clSetKernelArg (kernel, 20 , sizeof (int ), &ne1));
7524-
7577+ #endif // GGML_OPENCL_SOA_Q
75257578 break ;
75267579 }
75277580 case GGML_TYPE_MXFP4: {
0 commit comments