@@ -367,6 +367,7 @@ struct ggml_backend_opencl_context {
367
367
cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
368
368
cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
369
369
cl_program program_mul_mv_q6_K;
370
+ cl_program program_mul_mv_q8_0_f32;
370
371
cl_program program_mul_mv_mxfp4_f32;
371
372
cl_program program_mul_mv_f16_f16;
372
373
cl_program program_mul_mv_f16_f32_1row;
@@ -451,6 +452,7 @@ struct ggml_backend_opencl_context {
451
452
cl_kernel kernel_convert_block_q4_0_noshuffle;
452
453
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
453
454
cl_kernel kernel_mul_mv_q6_K_f32;
455
+ cl_kernel kernel_mul_mv_q8_0_f32;
454
456
cl_kernel kernel_mul_mv_mxfp4_f32;
455
457
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
456
458
cl_kernel kernel_argsort_f32_i32;
@@ -986,6 +988,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
986
988
GGML_LOG_CONT (" ." );
987
989
}
988
990
991
+ // mul_mv_q8_0_f32
992
+ {
993
+ #ifdef GGML_OPENCL_EMBED_KERNELS
994
+ const std::string kernel_src {
995
+ #include " mul_mv_q8_0_f32.cl.h"
996
+ };
997
+ #else
998
+ const std::string kernel_src = read_file (" mul_mv_q8_0_f32.cl" );
999
+ #endif
1000
+ backend_ctx->program_mul_mv_q8_0_f32 =
1001
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
1002
+
1003
+ CL_CHECK ((backend_ctx->kernel_mul_mv_q8_0_f32 = clCreateKernel (backend_ctx->program_mul_mv_q8_0_f32 , " kernel_mul_mv_q8_0_f32" , &err), err));
1004
+ GGML_LOG_CONT (" ." );
1005
+ }
1006
+
989
1007
// mul_mv_mxfp4_f32
990
1008
{
991
1009
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2724,6 +2742,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2724
2742
} else if (op->src [0 ]->type == GGML_TYPE_Q4_0 || op->src [0 ]->type == GGML_TYPE_MXFP4 ||
2725
2743
op->src [0 ]->type == GGML_TYPE_Q6_K) {
2726
2744
return op->src [1 ]->type == GGML_TYPE_F32 && ggml_is_contiguous (op->src [0 ]) && ggml_is_contiguous (op->src [1 ]);
2745
+ } else if (op->src [0 ]->type == GGML_TYPE_Q8_0) {
2746
+ return op->src [1 ]->type == GGML_TYPE_F32;
2727
2747
}
2728
2748
return false ;
2729
2749
case GGML_OP_MUL_MAT_ID:
@@ -6716,7 +6736,45 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
6716
6736
#endif // GGML_OPENCL_SOA_Q
6717
6737
break ;
6718
6738
case GGML_TYPE_Q8_0: {
    // Dispatch setup for the Q8_0 x F32 mul_mv kernel: select the kernel,
    // choose the launch geometry for the detected GPU family, and bind the
    // kernel arguments.
    kernel = backend_ctx->kernel_mul_mv_q8_0_f32;

    // nth0 - subgroup size
    // nth1 - number of subgroups per workgroup
    // ndst - number of output values per workgroup = output per subgroup * number of subgroups
    if (backend_ctx->gpu_family == INTEL) {
        nth0 = 16;
        nth1 = 2;
        ndst = nth1*4;
    } else if (backend_ctx->gpu_family == ADRENO) {
        nth0 = 64;
        nth1 = 2;
        ndst = nth1*4;
    } else {
        // No launch configuration is defined for other GPU families.
        GGML_ASSERT(false && "TODO: Unknown GPU");
    }

    // Args 0-5: device buffers and offsets for extra0 / extra1 / extrad
    // (presumably src0, src1 and dst - confirm against the setup earlier
    // in this function, which is not visible here).
    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
    // Args 6-18: the ne* / nb* / r2 / r3 values computed earlier in this
    // function. The nb* strides are passed explicitly as cl_ulong.
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne0));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne1));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
    break;
}
// Q4_1 is placed after the Q8_0 block so it does not fall through into the
// Q8_0 kernel dispatch; it stays grouped with the case labels that follow,
// as it was before the Q8_0 case gained a body.
case GGML_TYPE_Q4_1:
6720
6778
case GGML_TYPE_Q2_K:
6721
6779
case GGML_TYPE_Q3_K:
6722
6780
case GGML_TYPE_Q4_K:
0 commit comments