@@ -403,6 +403,7 @@ struct ggml_backend_opencl_context {
403403 cl_program program_conv_2d_f16_f32;
404404 cl_program program_tsembd;
405405 cl_program program_mul_mv_id_q4_0_f32_8x_flat;
406+ cl_program program_mul_mv_id_q8_0_f32;
406407 cl_program program_mul_mv_id_mxfp4_f32;
407408 cl_program program_mul_mv_id_mxfp4_f32_flat;
408409 cl_program program_mul_mm_f32_f32_l4_lm;
@@ -473,6 +474,7 @@ struct ggml_backend_opencl_context {
473474 cl_kernel kernel_conv_2d_f16_f32;
474475 cl_kernel kernel_timestep_embedding;
475476 cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
477+ cl_kernel kernel_mul_mv_id_q8_0_f32;
476478 cl_kernel kernel_mul_mv_id_mxfp4_f32;
477479 cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
478480 cl_kernel kernel_mul_mm_f32_f32_l4_lm;
@@ -1751,6 +1753,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
17511753 GGML_LOG_CONT (" ." );
17521754 }
17531755
1756+ // mul_mv_id_q8_0_f32
1757+ {
1758+ #ifdef GGML_OPENCL_EMBED_KERNELS
1759+ const std::string kernel_src {
1760+ #include " mul_mv_id_q8_0_f32.cl.h"
1761+ };
1762+ #else
1763+ const std::string kernel_src = read_file (" mul_mv_id_q8_0_f32.cl" );
1764+ #endif
1765+ backend_ctx->program_mul_mv_id_q8_0_f32 =
1766+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
1767+
1768+ CL_CHECK ((backend_ctx->kernel_mul_mv_id_q8_0_f32 = clCreateKernel (backend_ctx->program_mul_mv_id_q8_0_f32 , " kernel_mul_mv_id_q8_0_f32" , &err), err));
1769+ GGML_LOG_CONT (" ." );
1770+ }
1771+
17541772 // mul_mv_id_mxfp4_f32
17551773 {
17561774#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2831,6 +2849,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
28312849 return false ;
28322850 case GGML_OP_MUL_MAT_ID:
28332851 if (op->src [0 ]->type == GGML_TYPE_Q4_0 ||
2852+ op->src [0 ]->type == GGML_TYPE_Q8_0 ||
28342853 op->src [0 ]->type == GGML_TYPE_MXFP4) {
28352854 if (op->src [1 ]->type == GGML_TYPE_F32) {
28362855 return ggml_is_contiguous (op->src [0 ]) && ggml_is_contiguous (op->src [1 ]);
@@ -7260,6 +7279,46 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
72607279
72617280 break ;
72627281 }
7282+ case GGML_TYPE_Q8_0: {
7283+ kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32 ;
7284+
7285+ if (backend_ctx->gpu_family == INTEL) {
7286+ sgs = 16 ;
7287+ nsg = 2 ;
7288+ ndst = 4 ;
7289+ } else if (backend_ctx->gpu_family == ADRENO) {
7290+ sgs = 64 ;
7291+ nsg = 2 ;
7292+ ndst = 8 ;
7293+ ndst = 4 ;
7294+ } else {
7295+ GGML_ASSERT (false && " TODO: Unknown GPU" );
7296+ }
7297+
7298+ CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
7299+ CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
7300+ CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extra1->data_device ));
7301+ CL_CHECK (clSetKernelArg (kernel, 3 , sizeof (cl_ulong), &offset1));
7302+ CL_CHECK (clSetKernelArg (kernel, 4 , sizeof (cl_mem), &extra2->data_device ));
7303+ CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offset2));
7304+ CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (cl_mem), &extrad->data_device ));
7305+ CL_CHECK (clSetKernelArg (kernel, 7 , sizeof (cl_ulong), &offsetd));
7306+ CL_CHECK (clSetKernelArg (kernel, 8 , sizeof (int ), &ne00));
7307+ CL_CHECK (clSetKernelArg (kernel, 9 , sizeof (int ), &ne01));
7308+ CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (cl_ulong), &nb01));
7309+ CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (cl_ulong), &nb02));
7310+ CL_CHECK (clSetKernelArg (kernel, 12 , sizeof (int ), &ne11));
7311+ CL_CHECK (clSetKernelArg (kernel, 13 , sizeof (int ), &ne12));
7312+ CL_CHECK (clSetKernelArg (kernel, 14 , sizeof (cl_ulong), &nb11));
7313+ CL_CHECK (clSetKernelArg (kernel, 15 , sizeof (cl_ulong), &nb12));
7314+ CL_CHECK (clSetKernelArg (kernel, 16 , sizeof (int ), &ne20));
7315+ CL_CHECK (clSetKernelArg (kernel, 17 , sizeof (int ), &ne21));
7316+ CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (cl_ulong), &nb21));
7317+ CL_CHECK (clSetKernelArg (kernel, 19 , sizeof (int ), &ne0));
7318+ CL_CHECK (clSetKernelArg (kernel, 20 , sizeof (int ), &ne1));
7319+
7320+ break ;
7321+ }
72637322 case GGML_TYPE_MXFP4: {
72647323#ifdef GGML_OPENCL_SOA_Q
72657324 kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat ;
0 commit comments