@@ -367,7 +367,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
     cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
     cl_program program_mul_mv_q6_K;
-    cl_program program_mul_mv_q8_0_f32;
+    cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
     cl_program program_mul_mv_mxfp4_f32;
     cl_program program_mul_mv_f16_f16;
     cl_program program_mul_mv_f16_f32_1row;
@@ -449,11 +449,12 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_f16_f32_tiled;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
+    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
-    cl_kernel kernel_mul_mv_q8_0_f32;
+    cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
     cl_kernel kernel_mul_mv_mxfp4_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
@@ -769,6 +770,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
         GGML_LOG_CONT(".");
     }
 
@@ -1006,6 +1009,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mv_q8_0_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q8_0_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
+#endif
+        backend_ctx->program_mul_mv_q8_0_f32_flat =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_mxfp4_f32
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2427,6 +2446,41 @@ struct ggml_tensor_extra_cl_q4_0 {
     }
 };
 
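+// Additional tensor extra for flattened Q8_0: the interleaved block_q8_0 data
+// is split into a quants buffer (q) and a scales buffer (d), mirroring
+// ggml_tensor_extra_cl_q4_0 above.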
+struct ggml_tensor_extra_cl_q8_0 {
+    cl_mem q = nullptr;
+    cl_mem q_img = nullptr;
+
+    cl_mem d = nullptr;
+    cl_mem d_img = nullptr;
+
+    size_t size_q = 0;
+    size_t size_d = 0;
+
+    ~ggml_tensor_extra_cl_q8_0() {
+        reset();
+    }
+
+    void reset() {
+        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+        // They must be properly released so that the original buffer can be
+        // properly released to avoid memory leak.
+        if (q != nullptr) {
+            CL_CHECK(clReleaseMemObject(q));
+            q = nullptr;
+        }
+        if (d != nullptr) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+        // Currently, q_img and d_img are not used. They can be image1d_buffer_t
+        // that wrap around q and d to utilize the image access path.
+        q_img = nullptr;
+        d_img = nullptr;
+        size_q = 0;
+        size_d = 0;
+    }
+};
+
 // ------------------------------------------------------------------------------
 // Backend API
 // ------------------------------------------------------------------------------
@@ -2932,6 +2986,12 @@ struct ggml_backend_opencl_buffer_context {
         for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
             delete e;
         }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
+            delete e;
+        }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            delete e;
+        }
     }
 
     ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -2964,6 +3024,21 @@ struct ggml_backend_opencl_buffer_context {
         return extra;
     }
 
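+    // Pool allocator for Q8_0 extras: reuse a free extra from
+    // temp_tensor_extras_q8_0 when available, otherwise allocate a new one;
+    // either way the extra is tracked in temp_tensor_extras_q8_0_in_use until
+    // reset() reclaims it.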
+    ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
+        ggml_tensor_extra_cl_q8_0 * extra;
+        if (temp_tensor_extras_q8_0.empty()) {
+            extra = new ggml_tensor_extra_cl_q8_0();
+        } else {
+            extra = temp_tensor_extras_q8_0.back();
+            temp_tensor_extras_q8_0.pop_back();
+        }
+
+        temp_tensor_extras_q8_0_in_use.push_back(extra);
+
+        extra->reset();
+        return extra;
+    }
+
     void reset() {
         for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
             temp_tensor_extras.push_back(e);
@@ -2974,6 +3049,11 @@ struct ggml_backend_opencl_buffer_context {
             temp_tensor_extras_q4_0.push_back(e);
         }
         temp_tensor_extras_q4_0_in_use.clear();
+
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            temp_tensor_extras_q8_0.push_back(e);
+        }
+        temp_tensor_extras_q8_0_in_use.clear();
     }
 
     // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -2985,6 +3065,8 @@ struct ggml_backend_opencl_buffer_context {
     std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
 
     // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
     // before any tensor is initialized (at the beginning of alloc_tensor_range).
@@ -3327,6 +3409,65 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         }
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
+
+        // Allocate the new extra and create aliases from the original.
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
+
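+        // Per block: one fp16 scale (accumulated into size_d) and
+        // ggml_blck_size() int8 quants (accumulated into size_q).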
+        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
+        GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+        CL_CHECK(clEnqueueWriteBuffer(
+            queue, data_device, CL_TRUE, 0,
+            ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, then quants.
+        cl_buffer_region region;
+
+        // Create subbuffer for scales.
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+        region.size = size_d;
+        extra->d = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        auto previous_origin = region.origin;
+
+        // Create subbuffer for quants.
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
+        region.size = size_q;
+        extra->q = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
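+        // Convert the interleaved block_q8_0 data into the flat q/d layout on
+        // the device, one work item per block.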
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseMemObject(data_device));
+
+        tensor->extra = extra;
+
         return;
     }
 #endif // GGML_OPENCL_SOA_Q
@@ -3375,6 +3516,32 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};
 
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+            global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clEnqueueReadBuffer(
+            queue, data_device, CL_TRUE, offset,
+            size, data, 0, NULL, NULL));
+        CL_CHECK(clReleaseMemObject(data_device));
+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+
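+        // Restore the flat q/d buffers into the original block_q8_0 layout on
+        // the device before reading the result back to the host.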
+        cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {1, 1, 1};
+
         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
             global_work_size, local_work_size, 0, NULL, &evt));
@@ -6086,6 +6253,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
 #ifdef GGML_OPENCL_SOA_Q
     ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
 #endif
 
     const int ne00 = src0 ? src0->ne[0] : 0;
@@ -6756,6 +6924,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             break;
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q8_0: {
+#ifdef GGML_OPENCL_SOA_Q
+            kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
+
+            // nth0 - subgroup size
+            // nth1 - number of subgroups per workgroup
+            // ndst - number of output values per workgroup = output per subgroup * number of subgroups
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
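+            // Flat path: quants (q) and scales (d) are passed as two separate
+            // buffers instead of a single interleaved block_q8_0 buffer.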
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem), &extra0_q8_0->q));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem), &extra0_q8_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+#else
             kernel = backend_ctx->kernel_mul_mv_q8_0_f32;
 
             // nth0 - subgroup size
@@ -6792,6 +6998,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
             CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
             CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+#endif // GGML_OPENCL_SOA_Q
             break;
         }
         case GGML_TYPE_Q2_K: