@@ -403,7 +403,7 @@ struct ggml_backend_opencl_context {
     cl_program program_conv_2d_f16_f32;
     cl_program program_tsembd;
     cl_program program_mul_mv_id_q4_0_f32_8x_flat;
-    cl_program program_mul_mv_id_q8_0_f32;
+    cl_program program_mul_mv_id_q8_0_f32, program_mul_mv_id_q8_0_f32_flat;
     cl_program program_mul_mv_id_mxfp4_f32;
     cl_program program_mul_mv_id_mxfp4_f32_flat;
     cl_program program_mul_mm_f32_f32_l4_lm;
@@ -475,7 +475,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_conv_2d_f16_f32;
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
-    cl_kernel kernel_mul_mv_id_q8_0_f32;
+    cl_kernel kernel_mul_mv_id_q8_0_f32, kernel_mul_mv_id_q8_0_f32_flat;
    cl_kernel kernel_mul_mv_id_mxfp4_f32;
     cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
     cl_kernel kernel_mul_mm_f32_f32_l4_lm;
@@ -1788,6 +1788,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mv_id_q8_0_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_id_q8_0_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_id_q8_0_f32_flat.cl");
+#endif
+        backend_ctx->program_mul_mv_id_q8_0_f32_flat =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32_flat, "kernel_mul_mv_id_q8_0_f32_flat", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_id_mxfp4_f32
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
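Aside on the GGML_OPENCL_EMBED_KERNELS branch above: the build generates mul_mv_id_q8_0_f32_flat.cl.h containing the kernel source wrapped in a string literal, so the #include inside the braced initializer expands directly into the std::string. A self-contained illustration of the mechanism, with hypothetical file and kernel names (not taken from this patch):

    // embedded_kernel.h -- stands in for a generated .cl.h; it contains
    // nothing but a string literal holding the OpenCL source, e.g.:
    //   R"(kernel void scale2(global float * x) { x[get_global_id(0)] *= 2.0f; })"
    //
    // consumer.cpp
    #include <iostream>
    #include <string>

    int main() {
        const std::string kernel_src {
            #include "embedded_kernel.h"   // expands to the string literal
        };
        std::cout << kernel_src << '\n';   // prints the embedded OpenCL source
        return 0;
    }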
@@ -7397,6 +7413,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
 #ifdef GGML_OPENCL_SOA_Q
     ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
     ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
+    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
 #endif
 
     const int ne00 = src0->ne[0];
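For reference, the flattened Q8_0 extra consumed above stores quantized weights and per-block scales in separate buffers. A minimal sketch of the struct, assuming it mirrors the existing ggml_tensor_extra_cl_q4_0 (field set illustrative, not taken from this patch):

    // Sketch only: structure-of-arrays view of a Q8_0 tensor under
    // GGML_OPENCL_SOA_Q. Quants and scales live in separate cl_mem
    // buffers so the kernel can load each with coalesced accesses.
    struct ggml_tensor_extra_cl_q8_0 {
        cl_mem q = nullptr;   // int8 quants, QK8_0 (= 32) values per block
        cl_mem d = nullptr;   // one half-precision scale per block
        size_t size_q = 0;    // size of q in bytes
        size_t size_d = 0;    // size of d in bytes
    };

This is why the flat path below passes two buffers (args 0 and 1) where the non-flat path passes a single interleaved q8_0 buffer.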
@@ -7485,6 +7502,43 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
             break;
         }
         case GGML_TYPE_Q8_0: {
+#ifdef GGML_OPENCL_SOA_Q
+            kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32_flat;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                sgs = 16;
+                nsg = 2;
+                ndst = 4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                sgs = 64;
+                nsg = 2;
+                ndst = 4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne20));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne21));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1));
+#else
             kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32;
 
             if (backend_ctx->gpu_family == INTEL) {
@@ -7494,7 +7548,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
             } else if (backend_ctx->gpu_family == ADRENO) {
                 sgs = 64;
                 nsg = 2;
-                ndst = 8;
                 ndst = 4;
             } else {
                 GGML_ASSERT(false && "TODO: Unknown GPU");
@@ -7521,7 +7574,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
             CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
             CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0));
             CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1));
-
+#endif // GGML_OPENCL_SOA_Q
             break;
         }
         case GGML_TYPE_MXFP4: {
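Not visible in these hunks: the sgs/nsg/ndst values chosen per GPU family feed the NDRange computed further down in ggml_cl_mul_mat_id. A hypothetical sketch of how such knobs conventionally map onto the enqueue for these subgroup kernels; the global dimensions used here for the token/expert axes are an assumption, not taken from this patch:

    // sgs  = subgroup size; nsg = subgroups per workgroup; ndst = output
    // rows produced per subgroup. Each workgroup therefore covers
    // nsg * ndst rows of src0, and the row axis is rounded up accordingly.
    size_t local_rows = (size_t)sgs * nsg;
    size_t num_groups = ((size_t)ne01 + (size_t)(ndst * nsg) - 1) / (size_t)(ndst * nsg);
    size_t global_work_size[] = { num_groups * local_rows,
                                  (size_t)ne20, (size_t)ne21 };  // dims 1-2: assumed token/expert axes
    size_t local_work_size[]  = { local_rows, 1, 1 };
    CL_CHECK(clEnqueueNDRangeKernel(backend_ctx->queue, kernel, 3,
                                    NULL, global_work_size, local_work_size,
                                    0, NULL, NULL));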