@@ -402,7 +402,7 @@ struct ggml_backend_opencl_context {
     cl_program program_conv_2d_f16_f32;
     cl_program program_tsembd;
     cl_program program_mul_mv_id_q4_0_f32_8x_flat;
-    cl_program program_mul_mv_id_q8_0_f32;
+    cl_program program_mul_mv_id_q8_0_f32, program_mul_mv_id_q8_0_f32_flat;
     cl_program program_mul_mv_id_mxfp4_f32;
     cl_program program_mul_mm_f32_f32_l4_lm;
     cl_program program_mul_mm_f16_f32_l4_lm;
@@ -472,7 +472,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_conv_2d_f16_f32;
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
-    cl_kernel kernel_mul_mv_id_q8_0_f32;
+    cl_kernel kernel_mul_mv_id_q8_0_f32, kernel_mul_mv_id_q8_0_f32_flat;
     cl_kernel kernel_mul_mv_id_mxfp4_f32;
     cl_kernel kernel_mul_mm_f32_f32_l4_lm;
     cl_kernel kernel_mul_mm_f16_f32_l4_lm;
@@ -1766,6 +1766,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // mul_mv_id_q8_0_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_id_q8_0_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_id_q8_0_f32_flat.cl");
+#endif
+        backend_ctx->program_mul_mv_id_q8_0_f32_flat =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32_flat, "kernel_mul_mv_id_q8_0_f32_flat", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_id_mxfp4_f32
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
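A note on the `GGML_OPENCL_EMBED_KERNELS` pattern above: when embedding is enabled, the build generates a header for each `.cl` file that expands to a single string literal, which is why the `#include` can sit inside a brace-initializer for `kernel_src`. A minimal sketch of what the generated `mul_mv_id_q8_0_f32_flat.cl.h` could look like; the exact quoting and the kernel body shown here are assumptions, not the actual generated file:

```cpp
// Hypothetical contents of mul_mv_id_q8_0_f32_flat.cl.h: the whole .cl
// source wrapped in one raw string literal, so that
//     const std::string kernel_src { #include "mul_mv_id_q8_0_f32_flat.cl.h" };
// compiles to a std::string holding the OpenCL source.
R"(
kernel void kernel_mul_mv_id_q8_0_f32_flat(/* ... buffers and sizes ... */) {
    // ... kernel body elided ...
}
)"
```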
@@ -7133,6 +7149,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,

 #ifdef GGML_OPENCL_SOA_Q
     ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
 #endif

     const int ne00 = src0->ne[0];
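The new `extra0_q8_0` pointer mirrors the existing Q4_0 handling: under `GGML_OPENCL_SOA_Q`, a quantized tensor carries an extra struct whose `q` and `d` members are separate device buffers for the quantized values and the per-block scales, i.e. the "flat" structure-of-arrays layout the new kernel consumes. A sketch of the shape such a struct would need for the `extra0_q8_0->q` / `->d` accesses in the hunk below; everything beyond the two `cl_mem` members is an assumption:

```cpp
// Hypothetical sketch, inferred from the extra0_q8_0->q / ->d usages below.
struct ggml_tensor_extra_cl_q8_0 {
    cl_mem q = nullptr; // all int8 quantized values, packed contiguously
    cl_mem d = nullptr; // all per-block scales (fp16 in Q8_0), in their own buffer
};
```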
@@ -7221,6 +7238,43 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
             break;
         }
         case GGML_TYPE_Q8_0: {
+#ifdef GGML_OPENCL_SOA_Q
+            kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32_flat;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                sgs  = 16;
+                nsg  = 2;
+                ndst = 4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                sgs  = 64;
+                nsg  = 2;
+                ndst = 4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne20));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne21));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne1));
+#else
             kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32;

             if (backend_ctx->gpu_family == INTEL) {
@@ -7230,7 +7284,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
             } else if (backend_ctx->gpu_family == ADRENO) {
                 sgs = 64;
                 nsg = 2;
-                ndst = 8;
                 ndst = 4;
             } else {
                 GGML_ASSERT(false && "TODO: Unknown GPU");
@@ -7257,7 +7310,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
             CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
             CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne0));
             CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne1));
-
+#endif // GGML_OPENCL_SOA_Q
             break;
         }
         case GGML_TYPE_MXFP4: {
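As in the other `mul_mv_id` cases of this function, `sgs` (subgroup size), `nsg` (subgroups per workgroup), and `ndst` (output values produced per subgroup) are consumed by the work-size computation after the `switch`. A hedged sketch of that dispatch pattern; the exact expressions and dimension mapping used in this file are assumptions:

```cpp
// Sketch of a typical mul_mv-style dispatch: each workgroup holds nsg
// subgroups of sgs work-items, and each subgroup produces ndst outputs,
// so covering ne01 rows takes ceil(ne01 / (ndst * nsg)) workgroups along
// dimension 0. Dimension mapping for the remaining axes is assumed.
size_t global_work_size[] = {
    (size_t)((ne01 + ndst*nsg - 1) / (ndst*nsg)) * sgs * nsg,
    (size_t)ne21, // one slice per selected expert row
    1
};
size_t local_work_size[] = { (size_t)(sgs * nsg), 1, 1 };
```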