@@ -367,6 +367,7 @@ struct ggml_backend_opencl_context {
367367 cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
368368 cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
369369 cl_program program_mul_mv_q6_K;
370+ cl_program program_mul_mv_q8_0_f32;
370371 cl_program program_mul_mv_mxfp4_f32;
371372 cl_program program_mul_mv_f16_f16;
372373 cl_program program_mul_mv_f16_f32_1row;
@@ -451,6 +452,7 @@ struct ggml_backend_opencl_context {
451452 cl_kernel kernel_convert_block_q4_0_noshuffle;
452453 cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
453454 cl_kernel kernel_mul_mv_q6_K_f32;
455+ cl_kernel kernel_mul_mv_q8_0_f32;
454456 cl_kernel kernel_mul_mv_mxfp4_f32;
455457 cl_kernel kernel_im2col_f32, kernel_im2col_f16;
456458 cl_kernel kernel_argsort_f32_i32;
@@ -986,6 +988,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
986988 GGML_LOG_CONT (" ." );
987989 }
988990
991+ // mul_mv_q8_0_f32: build the program and create the kernel that the GGML_TYPE_Q8_0 case of ggml_cl_mul_mat launches
992+ {
993+ #ifdef GGML_OPENCL_EMBED_KERNELS
994+ const std::string kernel_src { // embedded build: the source text is pulled in from the generated header
995+ #include "mul_mv_q8_0_f32.cl.h"
996+ };
997+ #else
998+ const std::string kernel_src = read_file ("mul_mv_q8_0_f32.cl" ); // non-embedded build: source is obtained through read_file
999+ #endif
1000+ backend_ctx->program_mul_mv_q8_0_f32 =
1001+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
1002+ // the name given to clCreateKernel has to match the kernel function name exactly, so it must not carry padding
1003+ CL_CHECK ((backend_ctx->kernel_mul_mv_q8_0_f32 = clCreateKernel (backend_ctx->program_mul_mv_q8_0_f32 , "kernel_mul_mv_q8_0_f32" , &err), err));
1004+ GGML_LOG_CONT (" ." );
1005+ }
1006+
9891007 // mul_mv_mxfp4_f32
9901008 {
9911009#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2724,6 +2742,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
27242742 } else if (op->src [0 ]->type == GGML_TYPE_Q4_0 || op->src [0 ]->type == GGML_TYPE_MXFP4 ||
27252743 op->src [0 ]->type == GGML_TYPE_Q6_K) {
27262744 return op->src [1 ]->type == GGML_TYPE_F32 && ggml_is_contiguous (op->src [0 ]) && ggml_is_contiguous (op->src [1 ]);
2745+ } else if (op->src [0 ]->type == GGML_TYPE_Q8_0) { // Q8_0 src0: accepted whenever src1 is F32
2746+ return op->src [1 ]->type == GGML_TYPE_F32; // NOTE(review): unlike the Q4_0/MXFP4/Q6_K branch above, there are no ggml_is_contiguous checks here; presumably the nb* stride arguments passed to the Q8_0 kernel cover non-contiguous inputs - confirm
27272747 }
27282748 return false ;
27292749 case GGML_OP_MUL_MAT_ID:
@@ -6717,7 +6737,45 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
67176737#endif // GGML_OPENCL_SOA_Q
67186738 break ;
67196739 case GGML_TYPE_Q4_1: // NOTE(review): no statements before the next label, so Q4_1 falls through into the Q8_0 body below - confirm Q4_1 can never reach this switch
6720- case GGML_TYPE_Q8_0:
6740+ case GGML_TYPE_Q8_0: {
6741+ kernel = backend_ctx->kernel_mul_mv_q8_0_f32 ; // kernel object created in load_cl_kernels
6742+ // Launch parameters are chosen per GPU family; the three values are described below.
6743+ // nth0 - subgroup size
6744+ // nth1 - number of subgroups per workgroup
6745+ // ndst - number of output values per workgroup = output per subgroup * number of subgroups
6746+ if (backend_ctx->gpu_family == INTEL) {
6747+ nth0 = 16 ;
6748+ nth1 = 2 ;
6749+ ndst = nth1*4 ; // 4 outputs per subgroup * 2 subgroups = 8
6750+ } else if (backend_ctx->gpu_family == ADRENO) {
6751+ nth0 = 64 ;
6752+ nth1 = 2 ;
6753+ ndst = nth1*4 ; // 4 outputs per subgroup * 2 subgroups = 8
6754+ } else {
6755+ GGML_ASSERT (false && " TODO: Unknown GPU" ); // any other gpu_family value trips this assertion; no parameters are set for it
6756+ }
6757+ // Kernel arguments, in index order: 0-5 are (buffer, offset) pairs taken from extra0, extra1 and extrad; 6-18 are the ne*/nb* values plus r2 and r3.
6758+ CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
6759+ CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
6760+ CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extra1->data_device ));
6761+ CL_CHECK (clSetKernelArg (kernel, 3 , sizeof (cl_ulong), &offset1));
6762+ CL_CHECK (clSetKernelArg (kernel, 4 , sizeof (cl_mem), &extrad->data_device ));
6763+ CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offsetd));
6764+ CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (int ), &ne00));
6765+ CL_CHECK (clSetKernelArg (kernel, 7 , sizeof (int ), &ne01));
6766+ CL_CHECK (clSetKernelArg (kernel, 8 , sizeof (cl_ulong), &nb01));
6767+ CL_CHECK (clSetKernelArg (kernel, 9 , sizeof (cl_ulong), &nb02));
6768+ CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (cl_ulong), &nb03));
6769+ CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (int ), &ne12));
6770+ CL_CHECK (clSetKernelArg (kernel, 12 , sizeof (cl_ulong), &nb11));
6771+ CL_CHECK (clSetKernelArg (kernel, 13 , sizeof (cl_ulong), &nb12));
6772+ CL_CHECK (clSetKernelArg (kernel, 14 , sizeof (cl_ulong), &nb13));
6773+ CL_CHECK (clSetKernelArg (kernel, 15 , sizeof (int ), &ne0));
6774+ CL_CHECK (clSetKernelArg (kernel, 16 , sizeof (int ), &ne1));
6775+ CL_CHECK (clSetKernelArg (kernel, 17 , sizeof (int ), &r2));
6776+ CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (int ), &r3));
6777+ break ; // only kernel selection and argument binding happen in this case; nothing is enqueued here
6778+ }
67216779 case GGML_TYPE_Q2_K:
67226780 case GGML_TYPE_Q3_K:
67236781 case GGML_TYPE_Q4_K:
0 commit comments