@@ -367,7 +367,7 @@ struct ggml_backend_opencl_context {
    cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
    cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
    cl_program program_mul_mv_q6_K;
-   cl_program program_mul_mv_q8_0_f32;
+   cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
    cl_program program_mul_mv_mxfp4_f32;
    cl_program program_mul_mv_f16_f16;
    cl_program program_mul_mv_f16_f32_1row;
@@ -449,11 +449,12 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_mul_mat_f16_f32_tiled;
    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
+   cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
    cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
    cl_kernel kernel_convert_block_q4_0_noshuffle;
    cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
    cl_kernel kernel_mul_mv_q6_K_f32;
-   cl_kernel kernel_mul_mv_q8_0_f32;
+   cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
    cl_kernel kernel_mul_mv_mxfp4_f32;
    cl_kernel kernel_im2col_f32, kernel_im2col_f16;
    cl_kernel kernel_argsort_f32_i32;
@@ -769,6 +770,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
+       CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
+       CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
        GGML_LOG_CONT(".");
    }
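The q8_0 convert/restore kernels registered above live in the cvt program, whose OpenCL source is not part of this excerpt. As rough orientation, a minimal sketch of such a pair — modeled on the existing q4_0 convert/restore kernels and assuming ggml's standard block_q8_0 layout (one fp16 scale followed by QK8_0 = 32 int8 quants per block) — could look like the following; the shipped kernels in cvt.cl may differ:

// Hypothetical sketch, not the shipped kernels: de-interleave block_q8_0 into
// separate quant (q) and scale (d) buffers, one work-item per block, and back.
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define QK8_0 32

typedef struct {
    half d;              // per-block scale
    char qs[QK8_0];      // quantized weights
} block_q8_0;

kernel void kernel_convert_block_q8_0(
        global const block_q8_0 * src0,
        global char             * dst_q,
        global half             * dst_d
) {
    const size_t ib = get_global_id(0);              // block index
    global const block_q8_0 * b = src0 + ib;
    global char             * q = dst_q + ib*QK8_0;

    dst_d[ib] = b->d;
    for (int i = 0; i < QK8_0; ++i) {
        q[i] = b->qs[i];
    }
}

kernel void kernel_restore_block_q8_0(
        global const char * src_q,
        global const half * src_d,
        global block_q8_0 * dst
) {
    const size_t ib = get_global_id(0);
    global const char * q = src_q + ib*QK8_0;
    global block_q8_0 * b = dst + ib;

    b->d = src_d[ib];
    for (int i = 0; i < QK8_0; ++i) {
        b->qs[i] = q[i];
    }
}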
@@ -1006,6 +1009,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

+   // mul_mv_q8_0_f32_flat
+   {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+       const std::string kernel_src {
+ #include "mul_mv_q8_0_f32_flat.cl.h"
+       };
+ #else
+       const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
+ #endif
+       backend_ctx->program_mul_mv_q8_0_f32_flat =
+           build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+       CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
+       GGML_LOG_CONT(".");
+   }
+
    // mul_mv_mxfp4_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2427,6 +2446,41 @@ struct ggml_tensor_extra_cl_q4_0 {
    }
};

+ struct ggml_tensor_extra_cl_q8_0 {
+     cl_mem q = nullptr;
+     cl_mem q_img = nullptr;
+
+     cl_mem d = nullptr;
+     cl_mem d_img = nullptr;
+
+     size_t size_q = 0;
+     size_t size_d = 0;
+
+     ~ggml_tensor_extra_cl_q8_0() {
+         reset();
+     }
+
+     void reset() {
+         // q and d are sub-buffers into the bigger buffer allocated in ggml_backend_buffer.
+         // They must be released so that the original buffer can be released
+         // without leaking memory.
+         if (q != nullptr) {
+             CL_CHECK(clReleaseMemObject(q));
+             q = nullptr;
+         }
+         if (d != nullptr) {
+             CL_CHECK(clReleaseMemObject(d));
+             d = nullptr;
+         }
+         // Currently, q_img and d_img are unused. They could be image1d_buffer_t
+         // objects that wrap q and d to use the image access path.
+         q_img = nullptr;
+         d_img = nullptr;
+         size_q = 0;
+         size_d = 0;
+     }
+ };
+
// ------------------------------------------------------------------------------
// Backend API
// ------------------------------------------------------------------------------
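For reference, the extra above de-interleaves ggml's block_q8_0 blocks (QK8_0 = 32: one fp16 scale followed by 32 int8 quants, 34 bytes per block) into a contiguous scale buffer d and a contiguous quant buffer q. A small self-contained sketch of the block layout and the resulting sizes — the struct mirrors ggml-common.h, while the helper functions are illustrative only:

// Illustrative, stand-alone; mirrors ggml's block_q8_0 definition.
#include <cstddef>
#include <cstdint>

constexpr int QK8_0 = 32;

struct block_q8_0 {
    uint16_t d;          // fp16 scale (raw bits of ggml_half)
    int8_t   qs[QK8_0];  // 32 quantized weights
};

static_assert(sizeof(block_q8_0) == sizeof(uint16_t) + QK8_0, "block_q8_0 must stay tightly packed");

// For a tensor of n elements (n % QK8_0 == 0), the flat extra holds:
//   size_d = n/QK8_0 * sizeof(uint16_t)  -- all scales, contiguous
//   size_q = n/QK8_0 * QK8_0             -- all quants, contiguous
// so size_d + size_q == n/QK8_0 * sizeof(block_q8_0) == ggml_nbytes(tensor).
constexpr size_t flat_size_d(size_t n) { return n/QK8_0 * sizeof(uint16_t); }
constexpr size_t flat_size_q(size_t n) { return n/QK8_0 * QK8_0; }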
@@ -2930,6 +2984,12 @@ struct ggml_backend_opencl_buffer_context {
        for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
            delete e;
        }
+       for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
+           delete e;
+       }
+       for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+           delete e;
+       }
    }

    ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -2962,6 +3022,21 @@ struct ggml_backend_opencl_buffer_context {
        return extra;
    }

+   ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
+       ggml_tensor_extra_cl_q8_0 * extra;
+       if (temp_tensor_extras_q8_0.empty()) {
+           extra = new ggml_tensor_extra_cl_q8_0();
+       } else {
+           extra = temp_tensor_extras_q8_0.back();
+           temp_tensor_extras_q8_0.pop_back();
+       }
+
+       temp_tensor_extras_q8_0_in_use.push_back(extra);
+
+       extra->reset();
+       return extra;
+   }
+
    void reset() {
        for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
            temp_tensor_extras.push_back(e);
@@ -2972,6 +3047,11 @@ struct ggml_backend_opencl_buffer_context {
            temp_tensor_extras_q4_0.push_back(e);
        }
        temp_tensor_extras_q4_0_in_use.clear();
+
+       for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+           temp_tensor_extras_q8_0.push_back(e);
+       }
+       temp_tensor_extras_q8_0_in_use.clear();
    }

    // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -2983,6 +3063,8 @@ struct ggml_backend_opencl_buffer_context {
    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
+   std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
+   std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;

    // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
    // before any tensor is initialized (at the beginning of alloc_tensor_range).
@@ -3325,6 +3407,65 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
    }
#endif // GGML_OPENCL_USE_ADRENO_KERNELS

+       return;
+   }
+   if (tensor->type == GGML_TYPE_Q8_0) {
+       ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+       GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
+
+       // Allocate the new extra and create aliases from the original.
+       ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+       ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
+
+       size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+       size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
+       GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+       cl_int err;
+       cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+           ggml_nbytes(tensor), NULL, &err);
+       CL_CHECK(err);
+       CL_CHECK(clEnqueueWriteBuffer(
+           queue, data_device, CL_TRUE, 0,
+           ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+       // The original tensor memory is divided into scales and quants, i.e.,
+       // we first store scales, then quants.
+       cl_buffer_region region;
+
+       // Create subbuffer for scales.
+       region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+       region.size = size_d;
+       extra->d = clCreateSubBuffer(
+           extra_orig->data_device, CL_MEM_READ_WRITE,
+           CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+       CL_CHECK(err);
+       auto previous_origin = region.origin;
+
+       // Create subbuffer for quants.
+       region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
+       region.size = size_q;
+       extra->q = clCreateSubBuffer(
+           extra_orig->data_device, CL_MEM_READ_WRITE,
+           CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+       CL_CHECK(err);
+
+       cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
+
+       CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+       CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+       CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
+       size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+       size_t local_work_size[] = {64, 1, 1};
+
+       cl_event evt;
+       CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+       CL_CHECK(clWaitForEvents(1, &evt));
+       CL_CHECK(clReleaseMemObject(data_device));
+
+       tensor->extra = extra;
+
        return;
    }
#endif // GGML_OPENCL_SOA_Q
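To make the size split and the assert above concrete, here is the arithmetic for a hypothetical 4096 x 4096 Q8_0 weight matrix (the shape is chosen only for illustration):

// Worked example for the size_d/size_q split; the tensor shape is hypothetical.
#include <cassert>
#include <cstdint>

int main() {
    const int64_t nelements = 4096LL * 4096LL;  // 16,777,216 weights
    const int64_t nblocks   = nelements / 32;   // QK8_0 = 32  ->  524,288 blocks

    const int64_t size_d = nblocks * 2;         // one fp16 scale per block ->  1,048,576 bytes
    const int64_t size_q = nblocks * 32;        // 32 int8 quants per block -> 16,777,216 bytes

    // 34 bytes per block_q8_0, i.e. exactly ggml_nbytes() of the interleaved tensor.
    assert(size_d + size_q == nblocks * 34);
    return 0;
}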
@@ -3373,6 +3514,32 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
        size_t local_work_size[] = {1, 1, 1};

+       cl_event evt;
+       CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+           global_work_size, local_work_size, 0, NULL, &evt));
+       CL_CHECK(clWaitForEvents(1, &evt));
+       CL_CHECK(clEnqueueReadBuffer(
+           queue, data_device, CL_TRUE, offset,
+           size, data, 0, NULL, NULL));
+       CL_CHECK(clReleaseMemObject(data_device));
+       return;
+   }
+   if (tensor->type == GGML_TYPE_Q8_0) {
+       ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
+
+       cl_int err;
+       cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+           ggml_nbytes(tensor), NULL, &err);
+       CL_CHECK(err);
+
+       cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
+       CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+       CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+       CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+       size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+       size_t local_work_size[] = {1, 1, 1};
+
        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
            global_work_size, local_work_size, 0, NULL, &evt));
@@ -6084,6 +6251,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

#ifdef GGML_OPENCL_SOA_Q
    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+   ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
#endif

    const int ne00 = src0 ? src0->ne[0] : 0;
@@ -6754,6 +6922,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            break;
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q8_0: {
+ #ifdef GGML_OPENCL_SOA_Q
+           kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
+
+           // nth0 - subgroup size
+           // nth1 - number of subgroups per workgroup
+           // ndst - number of output values per workgroup = output per subgroup * number of subgroups
+           if (backend_ctx->gpu_family == INTEL) {
+               nth0 = 16;
+               nth1 = 2;
+               ndst = nth1*4;
+           } else if (backend_ctx->gpu_family == ADRENO) {
+               nth0 = 64;
+               nth1 = 2;
+               ndst = nth1*4;
+           } else {
+               GGML_ASSERT(false && "TODO: Unknown GPU");
+           }
+
+           CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
+           CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
+           CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+           CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+           CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+           CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+           CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+           CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+           CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+           CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+           CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+           CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
+           CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+           CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+           CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+           CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
+           CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
+           CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
+           CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+ #else
            kernel = backend_ctx->kernel_mul_mv_q8_0_f32;

            // nth0 - subgroup size
@@ -6790,6 +6996,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+ #endif // GGML_OPENCL_SOA_Q
            break;
        }
        case GGML_TYPE_Q2_K:
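For orientation, the flat kernel consumes the de-interleaved q/d buffers directly instead of block_q8_0 structs; with the values above, an Adreno workgroup runs nth0*nth1 = 128 work-items and produces ndst = 8 output rows (4 per subgroup). The shipped mul_mv_q8_0_f32_flat.cl is subgroup-parallel and not shown in this excerpt; the following is only a naive scalar sketch of the same reduction, assuming a plain 2-D tensor (no batching, so the nb*/r2/r3 arguments are ignored):

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define QK8_0 32

// Naive sketch, not the shipped kernel: one work-item per output row.
kernel void mul_mv_q8_0_f32_flat_naive(
        global const char  * q,     // ne01 rows x ne00 int8 quants (flattened)
        global const half  * d,     // ne01 rows x ne00/QK8_0 scales
        global const float * src1,  // ne00 activations
        global       float * dst,   // ne01 outputs
        int ne00,                   // columns, multiple of QK8_0
        int ne01                    // rows
) {
    const int row = get_global_id(0);
    if (row >= ne01) {
        return;
    }

    global const char * qr = q + (ulong)row*ne00;
    global const half * dr = d + (ulong)row*(ne00/QK8_0);

    float sum = 0.0f;
    for (int ib = 0; ib < ne00/QK8_0; ++ib) {
        float acc = 0.0f;
        for (int i = 0; i < QK8_0; ++i) {
            acc += qr[ib*QK8_0 + i] * src1[ib*QK8_0 + i];
        }
        sum += acc * (float) dr[ib];  // apply the per-block scale
    }
    dst[row] = sum;
}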