@@ -367,6 +367,7 @@ struct ggml_backend_opencl_context {
367
367
cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
368
368
cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
369
369
cl_program program_mul_mv_q6_K;
370
+ cl_program program_mul_mv_q8_0_f32;
370
371
cl_program program_mul_mv_mxfp4_f32;
371
372
cl_program program_mul_mv_f16_f16;
372
373
cl_program program_mul_mv_f16_f32_1row;
@@ -451,6 +452,7 @@ struct ggml_backend_opencl_context {
451
452
cl_kernel kernel_convert_block_q4_0_noshuffle;
452
453
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
453
454
cl_kernel kernel_mul_mv_q6_K_f32;
455
+ cl_kernel kernel_mul_mv_q8_0_f32;
454
456
cl_kernel kernel_mul_mv_mxfp4_f32;
455
457
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
456
458
cl_kernel kernel_argsort_f32_i32;
@@ -986,6 +988,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
986
988
GGML_LOG_CONT (" ." );
987
989
}
988
990
991
+ // mul_mv_q8_0_f32
// Loads the OpenCL source for the Q8_0 x F32 mat-vec kernel, builds it into
// backend_ctx->program_mul_mv_q8_0_f32 and creates the kernel object stored in
// backend_ctx->kernel_mul_mv_q8_0_f32.
992
+ {
993
+ #ifdef GGML_OPENCL_EMBED_KERNELS
// Embedded build: the generated header is included as the initializer of
// kernel_src (presumably it expands to a string literal holding the source;
// confirm against the header generator).
994
+ const std::string kernel_src {
// The include path must not contain a leading space, otherwise the header lookup fails.
995
+ #include "mul_mv_q8_0_f32.cl.h"
996
+ };
997
+ #else
// Non-embedded build: the source text is obtained through read_file() using
// the bare file name (no leading space, so the lookup matches the file).
998
+ const std::string kernel_src = read_file ("mul_mv_q8_0_f32.cl" );
999
+ #endif
// Build the program for this context/device with the shared compile_opts.
1000
+ backend_ctx->program_mul_mv_q8_0_f32 =
1001
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
1002
+
// The name passed to clCreateKernel must match the kernel function name in
// the program exactly; a stray leading space makes the call fail with
// CL_INVALID_KERNEL_NAME. The error code is written to err and checked by CL_CHECK.
1003
+ CL_CHECK ((backend_ctx->kernel_mul_mv_q8_0_f32 = clCreateKernel (backend_ctx->program_mul_mv_q8_0_f32 , "kernel_mul_mv_q8_0_f32" , &err), err));
// Emit one progress marker for this kernel.
1004
+ GGML_LOG_CONT (" ." );
1005
+ }
1006
+
989
1007
// mul_mv_mxfp4_f32
990
1008
{
991
1009
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2724,6 +2742,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2724
2742
} else if (op->src [0 ]->type == GGML_TYPE_Q4_0 || op->src [0 ]->type == GGML_TYPE_MXFP4 ||
2725
2743
op->src [0 ]->type == GGML_TYPE_Q6_K) {
2726
2744
return op->src [1 ]->type == GGML_TYPE_F32 && ggml_is_contiguous (op->src [0 ]) && ggml_is_contiguous (op->src [1 ]);
2745
+ } else if (op->src [0 ]->type == GGML_TYPE_Q8_0) {
2746
+ return op->src [1 ]->type == GGML_TYPE_F32;
2727
2747
}
2728
2748
return false ;
2729
2749
case GGML_OP_MUL_MAT_ID:
@@ -6717,7 +6737,45 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
6717
6737
#endif // GGML_OPENCL_SOA_Q
6718
6738
break ;
6719
6739
case GGML_TYPE_Q4_1:
6720
- case GGML_TYPE_Q8_0:
6740
// Q8_0 weights: select the q8_0 x f32 mat-vec kernel, choose the work sizes for
// the detected GPU family and bind the kernel arguments. No enqueue call appears
// in this block; it ends with `break`.
// NOTE(review): the `case GGML_TYPE_Q4_1:` label directly above has no body and
// falls through into this block, so a Q4_1 src0 would also use the Q8_0 kernel.
// Presumably the supports_op check keeps Q4_1 from reaching here - confirm.
+ case GGML_TYPE_Q8_0: {
6741
+ kernel = backend_ctx->kernel_mul_mv_q8_0_f32 ;
6742
+
6743
+ // nth0 - subgroup size
6744
+ // nth1 - number of subgroups per workgroup
6745
+ // ndst - number of output values per workgroup = output per subgroup * number of subgroups
6746
// Intel: nth0 = 16, nth1 = 2, ndst = 2*4 = 8.
+ if (backend_ctx->gpu_family == INTEL) {
6747
+ nth0 = 16 ;
6748
+ nth1 = 2 ;
6749
+ ndst = nth1*4 ;
6750
// Adreno: nth0 = 64, nth1 = 2, ndst = 2*4 = 8.
+ } else if (backend_ctx->gpu_family == ADRENO) {
6751
+ nth0 = 64 ;
6752
+ nth1 = 2 ;
6753
+ ndst = nth1*4 ;
6754
// Any other GPU family is not handled yet and trips the assertion below.
+ } else {
6755
+ GGML_ASSERT (false && " TODO: Unknown GPU" );
6756
+ }
6757
+
6758
// Args 0-5: device buffer and byte offset for src0, src1 and dst, in that order.
+ CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
6759
+ CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
6760
+ CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extra1->data_device ));
6761
+ CL_CHECK (clSetKernelArg (kernel, 3 , sizeof (cl_ulong), &offset1));
6762
+ CL_CHECK (clSetKernelArg (kernel, 4 , sizeof (cl_mem), &extrad->data_device ));
6763
+ CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offsetd));
6764
// Args 6-10: ne00, ne01 passed as int; nb01, nb02, nb03 passed as cl_ulong
// (presumably src0 extents and byte strides - confirm against the kernel signature).
+ CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (int ), &ne00));
6765
+ CL_CHECK (clSetKernelArg (kernel, 7 , sizeof (int ), &ne01));
6766
+ CL_CHECK (clSetKernelArg (kernel, 8 , sizeof (cl_ulong), &nb01));
6767
+ CL_CHECK (clSetKernelArg (kernel, 9 , sizeof (cl_ulong), &nb02));
6768
+ CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (cl_ulong), &nb03));
6769
// Args 11-14: ne12 passed as int; nb11, nb12, nb13 passed as cl_ulong
// (presumably src1 extent and byte strides - confirm).
+ CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (int ), &ne12));
6770
+ CL_CHECK (clSetKernelArg (kernel, 12 , sizeof (cl_ulong), &nb11));
6771
+ CL_CHECK (clSetKernelArg (kernel, 13 , sizeof (cl_ulong), &nb12));
6772
+ CL_CHECK (clSetKernelArg (kernel, 14 , sizeof (cl_ulong), &nb13));
6773
// Args 15-18: ne0, ne1, r2, r3, all passed as int. r2/r3 are defined outside
// this block; their meaning is not visible here (TODO confirm).
+ CL_CHECK (clSetKernelArg (kernel, 15 , sizeof (int ), &ne0));
6774
+ CL_CHECK (clSetKernelArg (kernel, 16 , sizeof (int ), &ne1));
6775
+ CL_CHECK (clSetKernelArg (kernel, 17 , sizeof (int ), &r2));
6776
+ CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (int ), &r3));
6777
+ break ;
6778
+ }
6721
6779
case GGML_TYPE_Q2_K:
6722
6780
case GGML_TYPE_Q3_K:
6723
6781
case GGML_TYPE_Q4_K:
0 commit comments