@@ -367,6 +367,7 @@ struct ggml_backend_opencl_context {
367367 cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
368368 cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
369369 cl_program program_mul_mv_q6_K;
370+ cl_program program_mul_mv_q8_0_f32;
370371 cl_program program_mul_mv_mxfp4_f32;
371372 cl_program program_mul_mv_mxfp4_f32_flat;
372373 cl_program program_mul_mv_f16_f16;
@@ -455,6 +456,7 @@ struct ggml_backend_opencl_context {
455456 cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
456457 cl_kernel kernel_mul_mv_q6_K_f32;
457458 cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
459+ cl_kernel kernel_mul_mv_q8_0_f32;
458460 cl_kernel kernel_im2col_f32, kernel_im2col_f16;
459461 cl_kernel kernel_argsort_f32_i32;
460462 cl_kernel kernel_sum_rows_f32;
@@ -992,6 +994,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
992994 GGML_LOG_CONT (" ." );
993995 }
994996
997+ // mul_mv_q8_0_f32
// Builds the OpenCL program for the mul_mv_q8_0_f32 kernel and creates the
// kernel object, storing both handles on backend_ctx.
998+ {
// The kernel source text comes from one of two places, chosen at compile time:
// with GGML_OPENCL_EMBED_KERNELS defined it is taken from the included .cl.h
// header; otherwise read_file is called with the .cl file name.
// NOTE(review): the file-name and kernel-name literals in this block begin
// with a space as shown here - presumably an artifact of how this diff was
// extracted; confirm against the real file.
999+ #ifdef GGML_OPENCL_EMBED_KERNELS
1000+ const std::string kernel_src {
1001+ #include " mul_mv_q8_0_f32.cl.h"
1002+ };
1003+ #else
1004+ const std::string kernel_src = read_file (" mul_mv_q8_0_f32.cl" );
1005+ #endif
// Build the program from the source text for this context and device, passing
// compile_opts (defined outside this block) through to the build helper.
1006+ backend_ctx->program_mul_mv_q8_0_f32 =
1007+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
1008+
// Create the kernel object by name from the program just built. The status is
// written to err (declared outside this block) and handed to CL_CHECK.
1009+ CL_CHECK ((backend_ctx->kernel_mul_mv_q8_0_f32 = clCreateKernel (backend_ctx->program_mul_mv_q8_0_f32 , " kernel_mul_mv_q8_0_f32" , &err), err));
// Emit one progress dot on the log line for this kernel.
1010+ GGML_LOG_CONT (" ." );
1011+ }
1012+
9951013 // mul_mv_mxfp4_f32
9961014 {
9971015#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2807,6 +2825,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
28072825 } else if (op->src [0 ]->type == GGML_TYPE_Q4_0 || op->src [0 ]->type == GGML_TYPE_MXFP4 ||
28082826 op->src [0 ]->type == GGML_TYPE_Q6_K) {
28092827 return op->src [1 ]->type == GGML_TYPE_F32 && ggml_is_contiguous (op->src [0 ]) && ggml_is_contiguous (op->src [1 ]);
2828+ } else if (op->src [0 ]->type == GGML_TYPE_Q8_0) {
2829+ return op->src [1 ]->type == GGML_TYPE_F32;
28102830 }
28112831 return false ;
28122832 case GGML_OP_MUL_MAT_ID:
@@ -6937,7 +6957,45 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
69376957#endif // GGML_OPENCL_SOA_Q
69386958 break ;
69396959 case GGML_TYPE_Q4_1:
6940- case GGML_TYPE_Q8_0:
6960+ case GGML_TYPE_Q8_0: {
// Q8_0 path: select the mul_mv_q8_0_f32 kernel, choose work sizing values per
// GPU family, and bind the 19 kernel arguments. The kernel launch itself is
// not part of this block.
// NOTE(review): the case label directly above this one has no body of its
// own, so that type also enters this block - confirm this is intended or is
// guarded elsewhere.
6961+ kernel = backend_ctx->kernel_mul_mv_q8_0_f32 ;
6962+
6963+ // nth0 - subgroup size
6964+ // nth1 - number of subgroups per workgroup
6965+ // ndst - number of output values per workgroup = output per subgroup * number of subgroups
// Both known families use 2 subgroups and ndst = nth1*4 = 8; they differ only
// in nth0 (16 for INTEL, 64 for ADRENO).
6966+ if (backend_ctx->gpu_family == INTEL) {
6967+ nth0 = 16 ;
6968+ nth1 = 2 ;
6969+ ndst = nth1*4 ;
6970+ } else if (backend_ctx->gpu_family == ADRENO) {
6971+ nth0 = 64 ;
6972+ nth1 = 2 ;
6973+ ndst = nth1*4 ;
6974+ } else {
// Any other GPU family is not handled here and trips the assertion.
6975+ GGML_ASSERT (false && " TODO: Unknown GPU" );
6976+ }
6977+
// Args 0-5: three (cl_mem, cl_ulong offset) pairs taken from extra0, extra1
// and extrad - presumably the src0, src1 and dst buffers; confirm against the
// kernel signature, which is not visible here.
6978+ CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
6979+ CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
6980+ CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extra1->data_device ));
6981+ CL_CHECK (clSetKernelArg (kernel, 3 , sizeof (cl_ulong), &offset1));
6982+ CL_CHECK (clSetKernelArg (kernel, 4 , sizeof (cl_mem), &extrad->data_device ));
6983+ CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offsetd));
// Args 6-10: ne00, ne01 passed as int and nb01, nb02, nb03 passed as cl_ulong.
// The argument sizes assume the host variables have exactly those widths;
// their declarations are outside this block - TODO confirm.
6984+ CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (int ), &ne00));
6985+ CL_CHECK (clSetKernelArg (kernel, 7 , sizeof (int ), &ne01));
6986+ CL_CHECK (clSetKernelArg (kernel, 8 , sizeof (cl_ulong), &nb01));
6987+ CL_CHECK (clSetKernelArg (kernel, 9 , sizeof (cl_ulong), &nb02));
6988+ CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (cl_ulong), &nb03));
// Args 11-14: ne12 as int, then nb11, nb12, nb13 as cl_ulong.
6989+ CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (int ), &ne12));
6990+ CL_CHECK (clSetKernelArg (kernel, 12 , sizeof (cl_ulong), &nb11));
6991+ CL_CHECK (clSetKernelArg (kernel, 13 , sizeof (cl_ulong), &nb12));
6992+ CL_CHECK (clSetKernelArg (kernel, 14 , sizeof (cl_ulong), &nb13));
// Args 15-18: ne0, ne1, r2 and r3, all as int. r2 and r3 are computed outside
// this block.
6993+ CL_CHECK (clSetKernelArg (kernel, 15 , sizeof (int ), &ne0));
6994+ CL_CHECK (clSetKernelArg (kernel, 16 , sizeof (int ), &ne1));
6995+ CL_CHECK (clSetKernelArg (kernel, 17 , sizeof (int ), &r2));
6996+ CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (int ), &r3));
6997+ break ;
6998+ }
69416999 case GGML_TYPE_Q2_K:
69427000 case GGML_TYPE_Q3_K:
69437001 case GGML_TYPE_Q4_K:
0 commit comments