@@ -367,7 +367,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
     cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
     cl_program program_mul_mv_q6_K;
-    cl_program program_mul_mv_q8_0_f32;
+    cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
     cl_program program_mul_mv_mxfp4_f32;
     cl_program program_mul_mv_f16_f16;
     cl_program program_mul_mv_f16_f32_1row;
@@ -449,11 +449,12 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_f16_f32_tiled;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
+    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
-    cl_kernel kernel_mul_mv_q8_0_f32;
+    cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
     cl_kernel kernel_mul_mv_mxfp4_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
@@ -769,6 +770,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
         GGML_LOG_CONT(".");
     }

@@ -1006,6 +1009,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // mul_mv_q8_0_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q8_0_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
+#endif
+        backend_ctx->program_mul_mv_q8_0_f32_flat =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_mxfp4_f32
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2427,6 +2446,41 @@ struct ggml_tensor_extra_cl_q4_0 {
     }
 };

+struct ggml_tensor_extra_cl_q8_0 {
+    cl_mem q = nullptr;
+    cl_mem q_img = nullptr;
+
+    cl_mem d = nullptr;
+    cl_mem d_img = nullptr;
+
+    size_t size_q = 0;
+    size_t size_d = 0;
+
+    ~ggml_tensor_extra_cl_q8_0() {
+        reset();
+    }
+
+    void reset() {
+        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+        // They must be properly released so that the original buffer can be
+        // properly released to avoid memory leak.
+        if (q != nullptr) {
+            CL_CHECK(clReleaseMemObject(q));
+            q = nullptr;
+        }
+        if (d != nullptr) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+        // Currently, q_img and d_img are not used. They can be image1d_buffer_t
+        // that wraps around q and d to utilize image access path.
+        q_img = nullptr;
+        d_img = nullptr;
+        size_q = 0;
+        size_d = 0;
+    }
+};
+
 // ------------------------------------------------------------------------------
 // Backend API
 // ------------------------------------------------------------------------------
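For reference, ggml stores Q8_0 data as an array of blocks, each holding one fp16 scale followed by 32 int8 quants; the new ggml_tensor_extra_cl_q8_0 above simply keeps the scales (d) and the quants (q) in two separate sub-buffers. A minimal host-side sketch of that layout and of the split, for illustration only (flatten_q8_0 is a hypothetical helper; in the backend the split is done on the device by kernel_convert_block_q8_0):

#include <stdint.h>

#define QK8_0 32

typedef uint16_t ggml_fp16_t;          // fp16 bit pattern, as in ggml

typedef struct {
    ggml_fp16_t d;                     // per-block scale
    int8_t      qs[QK8_0];             // 32 quantized values
} block_q8_0;

// Split interleaved blocks into a contiguous scale array (what extra->d holds)
// and a contiguous quant array (what extra->q holds).
static void flatten_q8_0(const block_q8_0 * blocks, int nblocks,
                         ggml_fp16_t * d, int8_t * q) {
    for (int i = 0; i < nblocks; ++i) {
        d[i] = blocks[i].d;
        for (int j = 0; j < QK8_0; ++j) {
            q[i*QK8_0 + j] = blocks[i].qs[j];
        }
    }
}

The point of this struct-of-arrays ("flat") layout is that a kernel can then read scales and quants with contiguous, type-uniform loads instead of striding over 34-byte interleaved blocks.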
@@ -2933,6 +2987,12 @@ struct ggml_backend_opencl_buffer_context {
         for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
             delete e;
         }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
+            delete e;
+        }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            delete e;
+        }
     }

     ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -2965,6 +3025,21 @@ struct ggml_backend_opencl_buffer_context {
         return extra;
     }

+    ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
+        ggml_tensor_extra_cl_q8_0 * extra;
+        if (temp_tensor_extras_q8_0.empty()) {
+            extra = new ggml_tensor_extra_cl_q8_0();
+        } else {
+            extra = temp_tensor_extras_q8_0.back();
+            temp_tensor_extras_q8_0.pop_back();
+        }
+
+        temp_tensor_extras_q8_0_in_use.push_back(extra);
+
+        extra->reset();
+        return extra;
+    }
+
     void reset() {
         for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
             temp_tensor_extras.push_back(e);
@@ -2975,6 +3050,11 @@ struct ggml_backend_opencl_buffer_context {
             temp_tensor_extras_q4_0.push_back(e);
         }
         temp_tensor_extras_q4_0_in_use.clear();
+
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            temp_tensor_extras_q8_0.push_back(e);
+        }
+        temp_tensor_extras_q8_0_in_use.clear();
     }

     // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -2986,6 +3066,8 @@ struct ggml_backend_opencl_buffer_context {
     std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;

     // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
     // before any tensor is initialized (at the beginning of alloc_tensor_range).
@@ -3328,6 +3410,65 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         }
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS

+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
+
+        // Allocate the new extra and create aliases from the original.
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
+
+        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
+        GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+        CL_CHECK(clEnqueueWriteBuffer(
+            queue, data_device, CL_TRUE, 0,
+            ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, then quants.
+        cl_buffer_region region;
+
+        // Create subbuffer for scales.
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+        region.size = size_d;
+        extra->d = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        auto previous_origin = region.origin;
+
+        // Create subbuffer for quants.
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
+        region.size = size_q;
+        extra->q = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseMemObject(data_device));
+
+        tensor->extra = extra;
+
         return;
     }
 #endif // GGML_OPENCL_SOA_Q
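Each Q8_0 block is 34 bytes (a 2-byte fp16 scale plus 32 one-byte quants), so size_d + size_q adds up to ggml_nbytes(tensor), which is exactly what the assertion in this hunk checks. The convert and restore kernels themselves live in the backend's cvt.cl source and are not shown in this diff; a rough sketch of what a per-block convert kernel of this shape could look like, purely as an illustration and assuming one work-item per block (matching the one-work-item-per-block global size used above):

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define QK8_0 32

struct block_q8_0 {
    half d;            // per-block scale
    char qs[QK8_0];    // 32 quantized values
};

kernel void kernel_convert_block_q8_0(
    global struct block_q8_0 * src0,
    global char * dst_q,
    global half * dst_d
) {
    // One work-item handles one block: copy the scale into dst_d and the
    // 32 quants into dst_q, i.e. into the d/q sub-buffers created above.
    global struct block_q8_0 * b = src0 + get_global_id(0);
    global char              * q = dst_q + QK8_0*get_global_id(0);

    dst_d[get_global_id(0)] = b->d;
    for (int i = 0; i < QK8_0; ++i) {
        q[i] = b->qs[i];
    }
}

kernel_restore_block_q8_0 would do the inverse, re-interleaving d and q into blocks so get_tensor can hand back standard Q8_0 data.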
@@ -3376,6 +3517,32 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+            global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clEnqueueReadBuffer(
+            queue, data_device, CL_TRUE, offset,
+            size, data, 0, NULL, NULL));
+        CL_CHECK(clReleaseMemObject(data_device));
+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {1, 1, 1};
+
         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
             global_work_size, local_work_size, 0, NULL, &evt));
@@ -6087,6 +6254,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

 #ifdef GGML_OPENCL_SOA_Q
     ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
 #endif

     const int ne00 = src0 ? src0->ne[0] : 0;
@@ -6757,6 +6925,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             break;
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q8_0: {
+#ifdef GGML_OPENCL_SOA_Q
+            kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
+
+            // nth0 - subgroup size
+            // nth1 - number of subgroups per workgroup
+            // ndst - number of output values per workgroup = output per subgroup * number of subgroups
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
+#else
             kernel = backend_ctx->kernel_mul_mv_q8_0_f32;

             // nth0 - subgroup size
@@ -6793,6 +6999,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne1));
             CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
             CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
+#endif // GGML_OPENCL_SOA_Q
             break;
         }
         case GGML_TYPE_Q2_K:
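For a sense of the launch geometry implied by the nth0/nth1/ndst tuning above, here is an illustrative helper; the {nth0, nth1, 1} local size is an assumption based on how the other mul_mv paths in this backend are enqueued, not something shown in this hunk:

#include <stddef.h>

// On Adreno: 64*2 = 128 work-items per workgroup, each workgroup producing
// ndst = 2*4 = 8 output rows, so ceil(ne01/8) workgroups cover the rows.
static void q8_0_flat_launch_shape(int nth0, int nth1, int ne01,
                                   size_t local_size[3], size_t * n_groups) {
    const int ndst = nth1*4;                            // output rows per workgroup
    local_size[0] = (size_t) nth0;                      // subgroup size
    local_size[1] = (size_t) nth1;                      // subgroups per workgroup
    local_size[2] = 1;
    *n_groups     = (size_t) ((ne01 + ndst - 1)/ndst);  // workgroups over output rows
}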