@@ -367,6 +367,7 @@ struct ggml_backend_opencl_context {
367367 cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
368368 cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
369369 cl_program program_mul_mv_q6_K;
370+ cl_program program_mul_mv_q8_0_f32;
370371 cl_program program_mul_mv_mxfp4_f32;
371372 cl_program program_mul_mv_f16_f16;
372373 cl_program program_mul_mv_f16_f32_1row;
@@ -451,6 +452,7 @@ struct ggml_backend_opencl_context {
451452 cl_kernel kernel_convert_block_q4_0_noshuffle;
452453 cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
453454 cl_kernel kernel_mul_mv_q6_K_f32;
455+ cl_kernel kernel_mul_mv_q8_0_f32;
454456 cl_kernel kernel_mul_mv_mxfp4_f32;
455457 cl_kernel kernel_im2col_f32, kernel_im2col_f16;
456458 cl_kernel kernel_argsort_f32_i32;
@@ -986,6 +988,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
986988 GGML_LOG_CONT (" ." );
987989 }
988990
991+ // mul_mv_q8_0_f32
992+ {
993+ #ifdef GGML_OPENCL_EMBED_KERNELS
994+ const std::string kernel_src {
995+ #include " mul_mv_q8_0_f32.cl.h"
996+ };
997+ #else
998+ const std::string kernel_src = read_file (" mul_mv_q8_0_f32.cl" );
999+ #endif
1000+ backend_ctx->program_mul_mv_q8_0_f32 =
1001+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
1002+
1003+ CL_CHECK ((backend_ctx->kernel_mul_mv_q8_0_f32 = clCreateKernel (backend_ctx->program_mul_mv_q8_0_f32 , " kernel_mul_mv_q8_0_f32" , &err), err));
1004+ GGML_LOG_CONT (" ." );
1005+ }
1006+
9891007 // mul_mv_mxfp4_f32
9901008 {
9911009#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2722,6 +2740,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
27222740 } else if (op->src [0 ]->type == GGML_TYPE_Q4_0 || op->src [0 ]->type == GGML_TYPE_MXFP4 ||
27232741 op->src [0 ]->type == GGML_TYPE_Q6_K) {
27242742 return op->src [1 ]->type == GGML_TYPE_F32 && ggml_is_contiguous (op->src [0 ]) && ggml_is_contiguous (op->src [1 ]);
2743+ } else if (op->src [0 ]->type == GGML_TYPE_Q8_0) {
2744+ return op->src [1 ]->type == GGML_TYPE_F32;
27252745 }
27262746 return false ;
27272747 case GGML_OP_MUL_MAT_ID:
@@ -6714,7 +6734,45 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
67146734#endif // GGML_OPENCL_SOA_Q
67156735 break ;
67166736 case GGML_TYPE_Q4_1:
6717- case GGML_TYPE_Q8_0:
6737+ case GGML_TYPE_Q8_0: {
6738+ kernel = backend_ctx->kernel_mul_mv_q8_0_f32 ;
6739+
6740+ // nth0 - subgroup size
6741+ // nth1 - number of subgroups per workgroup
6742+ // ndst - number of output values per workgroup = output per subgroup * number of subgroups
6743+ if (backend_ctx->gpu_family == INTEL) {
6744+ nth0 = 16 ;
6745+ nth1 = 2 ;
6746+ ndst = nth1*4 ;
6747+ } else if (backend_ctx->gpu_family == ADRENO) {
6748+ nth0 = 64 ;
6749+ nth1 = 2 ;
6750+ ndst = nth1*4 ;
6751+ } else {
6752+ GGML_ASSERT (false && " TODO: Unknown GPU" );
6753+ }
6754+
6755+ CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
6756+ CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
6757+ CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extra1->data_device ));
6758+ CL_CHECK (clSetKernelArg (kernel, 3 , sizeof (cl_ulong), &offset1));
6759+ CL_CHECK (clSetKernelArg (kernel, 4 , sizeof (cl_mem), &extrad->data_device ));
6760+ CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offsetd));
6761+ CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (int ), &ne00));
6762+ CL_CHECK (clSetKernelArg (kernel, 7 , sizeof (int ), &ne01));
6763+ CL_CHECK (clSetKernelArg (kernel, 8 , sizeof (cl_ulong), &nb01));
6764+ CL_CHECK (clSetKernelArg (kernel, 9 , sizeof (cl_ulong), &nb02));
6765+ CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (cl_ulong), &nb03));
6766+ CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (int ), &ne12));
6767+ CL_CHECK (clSetKernelArg (kernel, 12 , sizeof (cl_ulong), &nb11));
6768+ CL_CHECK (clSetKernelArg (kernel, 13 , sizeof (cl_ulong), &nb12));
6769+ CL_CHECK (clSetKernelArg (kernel, 14 , sizeof (cl_ulong), &nb13));
6770+ CL_CHECK (clSetKernelArg (kernel, 15 , sizeof (int ), &ne0));
6771+ CL_CHECK (clSetKernelArg (kernel, 16 , sizeof (int ), &ne1));
6772+ CL_CHECK (clSetKernelArg (kernel, 17 , sizeof (int ), &r2));
6773+ CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (int ), &r3));
6774+ break ;
6775+ }
67186776 case GGML_TYPE_Q2_K:
67196777 case GGML_TYPE_Q3_K:
67206778 case GGML_TYPE_Q4_K:
0 commit comments