@@ -367,7 +367,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
     cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
     cl_program program_mul_mv_q6_K;
-    cl_program program_mul_mv_q8_0_f32;
+    cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
     cl_program program_mul_mv_mxfp4_f32;
     cl_program program_mul_mv_mxfp4_f32_flat;
     cl_program program_mul_mv_f16_f16;
@@ -452,12 +452,13 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
     cl_kernel kernel_convert_block_mxfp4, kernel_restore_block_mxfp4;
+    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
     cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
-    cl_kernel kernel_mul_mv_q8_0_f32;
+    cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
     cl_kernel kernel_sum_rows_f32;
@@ -773,8 +774,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
-        CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
-        CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
         GGML_LOG_CONT(".");
     }
 
@@ -1012,6 +1015,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mv_q8_0_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+#include "mul_mv_q8_0_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
+#endif
+        backend_ctx->program_mul_mv_q8_0_f32_flat =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_mxfp4_f32
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2499,17 +2518,50 @@ struct ggml_tensor_extra_cl_mxfp4 {
             CL_CHECK(clReleaseMemObject(q_img));
             q = nullptr;
         }
-        // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
-        // enabled. They point to the images in ggml_backend_opencl_buffer_context.
-        // So, there is no need to release them here.
-        // TODO: initialize them for non SMALL_PATH path, or remove them.
+        // Currently, q_img and d_img are not used. They can be image1d_buffer_t
+        // that wraps around q and d to utilize image access path.
         q_img = nullptr;
         e_img = nullptr;
         size_q = 0;
         size_e = 0;
     }
 };
 
+struct ggml_tensor_extra_cl_q8_0 {
+    cl_mem q = nullptr;
+    cl_mem q_img = nullptr;
+
+    cl_mem d = nullptr;
+    cl_mem d_img = nullptr;
+
+    size_t size_q = 0;
+    size_t size_d = 0;
+
+    ~ggml_tensor_extra_cl_q8_0() {
+        reset();
+    }
+
+    void reset() {
+        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+        // They must be properly released so that the original buffer can be
+        // properly released to avoid memory leak.
+        if (q != nullptr) {
+            CL_CHECK(clReleaseMemObject(q));
+            q = nullptr;
+        }
+        if (d != nullptr) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+        // Currently, q_img and d_img are not used. They can be image1d_buffer_t
+        // that wraps around q and d to utilize image access path.
+        q_img = nullptr;
+        d_img = nullptr;
+        size_q = 0;
+        size_d = 0;
+    }
+};
+
 // ------------------------------------------------------------------------------
 // Backend API
 // ------------------------------------------------------------------------------
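For reference, GGML_TYPE_Q8_0 data is stored as an array of blocks, each holding one fp16 scale (`d`) followed by 32 int8 quants (`qs`). The new extra keeps the same data in the flat, struct-of-arrays form used by the other `*_flat` paths: all scales in `d`, all quants in `q`, as two sub-buffers of the original allocation. The host-side sketch below illustrates the transformation that `kernel_convert_block_q8_0` / `kernel_restore_block_q8_0` perform on the device; the struct mirrors ggml's `block_q8_0`, but the helper names and the `main()` round-trip are illustrative only, not the kernels' actual source.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Mirrors ggml's block_q8_0 layout: one fp16 scale ("d") followed by 32 int8 quants.
typedef uint16_t ggml_fp16_t;            // raw fp16 bits, as ggml stores them
static constexpr int QK8_0 = 32;
struct block_q8_0 {
    ggml_fp16_t d;
    int8_t      qs[QK8_0];
};

// AoS -> SoA: the split the convert kernel performs, one block per work-item.
static void convert_q8_0_flat(const block_q8_0 * src, size_t nblocks,
                              int8_t * q, ggml_fp16_t * d) {
    for (size_t i = 0; i < nblocks; ++i) {
        d[i] = src[i].d;
        std::memcpy(q + i * QK8_0, src[i].qs, QK8_0);
    }
}

// SoA -> AoS: the inverse, as the restore kernel does on get_tensor.
static void restore_q8_0_flat(const int8_t * q, const ggml_fp16_t * d,
                              size_t nblocks, block_q8_0 * dst) {
    for (size_t i = 0; i < nblocks; ++i) {
        dst[i].d = d[i];
        std::memcpy(dst[i].qs, q + i * QK8_0, QK8_0);
    }
}

int main() {
    block_q8_0 in{0x3C00, {}}, out{};    // 0x3C00 == 1.0f in fp16
    for (int i = 0; i < QK8_0; ++i) in.qs[i] = (int8_t)(i - 16);

    int8_t      q[QK8_0];
    ggml_fp16_t d[1];
    convert_q8_0_flat(&in, 1, q, d);
    restore_q8_0_flat(q, d, 1, &out);
    assert(std::memcmp(&in, &out, sizeof in) == 0);   // lossless round trip
    return 0;
}
```

Splitting scales from quants lets the flat mul_mv kernel read long contiguous runs of int8 data and fetch the fp16 scales separately, instead of striding over interleaved 34-byte blocks.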
@@ -3022,6 +3074,12 @@ struct ggml_backend_opencl_buffer_context {
         for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
             delete e;
         }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
+            delete e;
+        }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            delete e;
+        }
     }
 
     ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -3069,6 +3127,21 @@ struct ggml_backend_opencl_buffer_context {
         return extra;
     }
 
+    ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
+        ggml_tensor_extra_cl_q8_0 * extra;
+        if (temp_tensor_extras_q8_0.empty()) {
+            extra = new ggml_tensor_extra_cl_q8_0();
+        } else {
+            extra = temp_tensor_extras_q8_0.back();
+            temp_tensor_extras_q8_0.pop_back();
+        }
+
+        temp_tensor_extras_q8_0_in_use.push_back(extra);
+
+        extra->reset();
+        return extra;
+    }
+
     void reset() {
         for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
             temp_tensor_extras.push_back(e);
@@ -3084,6 +3157,11 @@ struct ggml_backend_opencl_buffer_context {
             temp_tensor_extras_mxfp4.push_back(e);
         }
         temp_tensor_extras_mxfp4_in_use.clear();
+
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            temp_tensor_extras_q8_0.push_back(e);
+        }
+        temp_tensor_extras_q8_0_in_use.clear();
     }
 
     // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -3097,6 +3175,8 @@ struct ggml_backend_opencl_buffer_context {
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
     std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
     std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
 
     // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
     // before any tensor is initialized (at the beginning of alloc_tensor_range).
@@ -3509,6 +3589,65 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
 
         tensor->extra = extra;
 
+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
+
+        // Allocate the new extra and create aliases from the original.
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
+
+        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
+        GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+        CL_CHECK(clEnqueueWriteBuffer(
+            queue, data_device, CL_TRUE, 0,
+            ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, then quants.
+        cl_buffer_region region;
+
+        // Create subbuffer for scales.
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+        region.size = size_d;
+        extra->d = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        auto previous_origin = region.origin;
+
+        // Create subbuffer for quants.
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
+        region.size = size_q;
+        extra->q = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseMemObject(data_device));
+
+        tensor->extra = extra;
+
         return;
     }
 #endif // GGML_OPENCL_SOA_Q
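A quick check on the size arithmetic above: for Q8_0, `ggml_blck_size()` is 32 and each block stores one fp16 scale plus 32 int8 quants, so a tensor of N elements has N/32 blocks, `size_d = N/32 * 2` bytes and `size_q = N` bytes, which together match `ggml_nbytes()` (34 bytes per block). The two sub-buffer origins are then rounded up because `clCreateSubBuffer` requires origins aligned to the device's base alignment. The standalone sketch below reproduces that arithmetic; the 128-byte alignment and the local `align_to` helper are illustrative stand-ins for the value and helper the backend actually uses.

```cpp
#include <cstddef>
#include <cstdio>

// Round x up to the next multiple of `alignment` (assumed to be a power of two).
static size_t align_to(size_t x, size_t alignment) {
    return (x + alignment - 1) & ~(alignment - 1);
}

int main() {
    const size_t QK8_0       = 32;           // elements per Q8_0 block
    const size_t block_bytes = 2 + QK8_0;    // fp16 scale + 32 int8 quants = 34 bytes
    const size_t nelements   = 4096 * 4096;  // hypothetical weight matrix
    const size_t nblocks     = nelements / QK8_0;

    const size_t size_d = nblocks * 2;       // all scales
    const size_t size_q = nblocks * QK8_0;   // all quants
    std::printf("nbytes=%zu size_d=%zu size_q=%zu\n",
                nblocks * block_bytes, size_d, size_q);

    // Sub-buffer origins, assuming a 128-byte base alignment (illustrative value;
    // the real one comes from the device) and a base offset of 0.
    const size_t alignment = 128;
    const size_t base      = 0;   // extra_orig->offset + view_offs + offset in the real code
    const size_t d_origin  = align_to(base, alignment);
    const size_t q_origin  = align_to(d_origin + size_d, alignment);
    std::printf("d_origin=%zu q_origin=%zu\n", d_origin, q_origin);
    return 0;
}
```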
@@ -3582,6 +3721,32 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};
 
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+            global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clEnqueueReadBuffer(
+            queue, data_device, CL_TRUE, offset,
+            size, data, 0, NULL, NULL));
+        CL_CHECK(clReleaseMemObject(data_device));
+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {1, 1, 1};
+
         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
             global_work_size, local_work_size, 0, NULL, &evt));
@@ -6307,6 +6472,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 #ifdef GGML_OPENCL_SOA_Q
     ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
     ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
+    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
 #endif
 
     const int ne00 = src0 ? src0->ne[0] : 0;
@@ -6977,6 +7143,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             break;
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q8_0: {
+#ifdef GGML_OPENCL_SOA_Q
+            kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
+
+            // nth0 - subgroup size
+            // nth1 - number of subgroups per workgroup
+            // ndst - number of output values per workgroup = output per subgroup * number of subgroups
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
+#else
             kernel = backend_ctx->kernel_mul_mv_q8_0_f32;
 
             // nth0 - subgroup size
@@ -7013,6 +7217,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
             CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
             CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+#endif // GGML_OPENCL_SOA_Q
             break;
         }
         case GGML_TYPE_Q2_K:
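To put numbers on the nth0/nth1/ndst comments in the flat path: a workgroup launches nth0*nth1 work-items (nth1 subgroups of nth0 lanes each), and each subgroup produces four output values, so ndst = nth1*4 rows of the result come out of each workgroup. The sketch below spells out that arithmetic for the ADRENO settings, assuming the launch covers ne01 rows with ceil(ne01/ndst) workgroups; the actual global/local work-size computation happens later in ggml_cl_mul_mat and is not part of this hunk.

```cpp
#include <cstdio>

int main() {
    // Values from the ADRENO branch above.
    const int nth0 = 64;        // subgroup size
    const int nth1 = 2;         // subgroups per workgroup
    const int ndst = nth1 * 4;  // output rows per workgroup (4 per subgroup)

    const int ne01 = 4096;      // hypothetical number of rows in src0

    const int workgroup_size = nth0 * nth1;               // 128 work-items
    const int num_workgroups = (ne01 + ndst - 1) / ndst;  // ceil(ne01 / ndst)

    std::printf("workgroup_size=%d rows_per_workgroup=%d workgroups=%d\n",
                workgroup_size, ndst, num_workgroups);
    return 0;
}
```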