@@ -367,7 +367,7 @@ struct ggml_backend_opencl_context {
    cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
    cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
    cl_program program_mul_mv_q6_K;
-   cl_program program_mul_mv_q8_0_f32;
+   cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
    cl_program program_mul_mv_mxfp4_f32;
    cl_program program_mul_mv_f16_f16;
    cl_program program_mul_mv_f16_f32_1row;
@@ -449,11 +449,12 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_mul_mat_f16_f32_tiled;
    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
+   cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
    cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
    cl_kernel kernel_convert_block_q4_0_noshuffle;
    cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
    cl_kernel kernel_mul_mv_q6_K_f32;
-   cl_kernel kernel_mul_mv_q8_0_f32;
+   cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
    cl_kernel kernel_mul_mv_mxfp4_f32;
    cl_kernel kernel_im2col_f32, kernel_im2col_f16;
    cl_kernel kernel_argsort_f32_i32;
@@ -769,6 +770,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_0           = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0           = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
+       CL_CHECK((backend_ctx->kernel_convert_block_q8_0           = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
+       CL_CHECK((backend_ctx->kernel_restore_block_q8_0           = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
        GGML_LOG_CONT(".");
    }

@@ -1006,6 +1009,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

+   // mul_mv_q8_0_f32_flat
+   {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+       const std::string kernel_src {
+           #include "mul_mv_q8_0_f32_flat.cl.h"
+       };
+#else
+       const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
+#endif
+       backend_ctx->program_mul_mv_q8_0_f32_flat =
+           build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+       CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
+       GGML_LOG_CONT(".");
+   }
+
    // mul_mv_mxfp4_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2427,6 +2446,41 @@ struct ggml_tensor_extra_cl_q4_0 {
    }
};

+struct ggml_tensor_extra_cl_q8_0 {
+   cl_mem q = nullptr;
+   cl_mem q_img = nullptr;
+
+   cl_mem d = nullptr;
+   cl_mem d_img = nullptr;
+
+   size_t size_q = 0;
+   size_t size_d = 0;
+
+   ~ggml_tensor_extra_cl_q8_0() {
+       reset();
+   }
+
+   void reset() {
+       // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+       // They must be properly released so that the original buffer can be
+       // properly released, to avoid a memory leak.
+       if (q != nullptr) {
+           CL_CHECK(clReleaseMemObject(q));
+           q = nullptr;
+       }
+       if (d != nullptr) {
+           CL_CHECK(clReleaseMemObject(d));
+           d = nullptr;
+       }
+       // Currently, q_img and d_img are not used. They can be image1d_buffer_t
+       // that wraps around q and d to utilize the image access path.
+       q_img = nullptr;
+       d_img = nullptr;
+       size_q = 0;
+       size_d = 0;
+   }
+};
+
// ------------------------------------------------------------------------------
// Backend API
// ------------------------------------------------------------------------------
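Note (not part of the diff): the q/d split held by this extra mirrors ggml's Q8_0 block format, where each block packs one fp16 scale with 32 signed 8-bit quants. A minimal C sketch of the assumed layout, restated here for illustration only:

#include <stdint.h>

#define QK8_0 32
typedef uint16_t ggml_half;   // fp16 storage, as in ggml-common.h

typedef struct {
    ggml_half d;              // per-block fp16 scale
    int8_t    qs[QK8_0];      // 32 signed 8-bit quants
} block_q8_0;

// Flat (struct-of-arrays) split produced by kernel_convert_block_q8_0:
//   extra->d holds nblocks * sizeof(ggml_half)       == size_d
//   extra->q holds nblocks * QK8_0 * sizeof(int8_t)  == size_q
// which is why set_tensor below asserts size_d + size_q == ggml_nbytes(tensor).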
@@ -2933,6 +2987,12 @@ struct ggml_backend_opencl_buffer_context {
        for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
            delete e;
        }
+       for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
+           delete e;
+       }
+       for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+           delete e;
+       }
    }

    ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -2965,6 +3025,21 @@ struct ggml_backend_opencl_buffer_context {
        return extra;
    }

+   ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
+       ggml_tensor_extra_cl_q8_0 * extra;
+       if (temp_tensor_extras_q8_0.empty()) {
+           extra = new ggml_tensor_extra_cl_q8_0();
+       } else {
+           extra = temp_tensor_extras_q8_0.back();
+           temp_tensor_extras_q8_0.pop_back();
+       }
+
+       temp_tensor_extras_q8_0_in_use.push_back(extra);
+
+       extra->reset();
+       return extra;
+   }
+
    void reset() {
        for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
            temp_tensor_extras.push_back(e);
@@ -2975,6 +3050,11 @@ struct ggml_backend_opencl_buffer_context {
            temp_tensor_extras_q4_0.push_back(e);
        }
        temp_tensor_extras_q4_0_in_use.clear();
+
+       for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+           temp_tensor_extras_q8_0.push_back(e);
+       }
+       temp_tensor_extras_q8_0_in_use.clear();
    }

    // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -2986,6 +3066,8 @@ struct ggml_backend_opencl_buffer_context {
    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
+   std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
+   std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;

    // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
    // before any tensor is initialized (at the beginning of alloc_tensor_range).
@@ -3328,6 +3410,65 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
        }
#endif // GGML_OPENCL_USE_ADRENO_KERNELS

+       return;
+   }
+   if (tensor->type == GGML_TYPE_Q8_0) {
+       ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+       GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
+
+       // Allocate the new extra and create aliases from the original.
+       ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+       ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
+
+       size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+       size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
+       GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+       cl_int err;
+       cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+           ggml_nbytes(tensor), NULL, &err);
+       CL_CHECK(err);
+       CL_CHECK(clEnqueueWriteBuffer(
+           queue, data_device, CL_TRUE, 0,
+           ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+       // The original tensor memory is divided into scales and quants, i.e.,
+       // we first store scales, then quants.
+       cl_buffer_region region;
+
+       // Create subbuffer for scales.
+       region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+       region.size = size_d;
+       extra->d = clCreateSubBuffer(
+           extra_orig->data_device, CL_MEM_READ_WRITE,
+           CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+       CL_CHECK(err);
+       auto previous_origin = region.origin;
+
+       // Create subbuffer for quants.
+       region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
+       region.size = size_q;
+       extra->q = clCreateSubBuffer(
+           extra_orig->data_device, CL_MEM_READ_WRITE,
+           CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+       CL_CHECK(err);
+
+       cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
+
+       CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+       CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+       CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
+       size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+       size_t local_work_size[] = {64, 1, 1};
+
+       cl_event evt;
+       CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+       CL_CHECK(clWaitForEvents(1, &evt));
+       CL_CHECK(clReleaseMemObject(data_device));
+
+       tensor->extra = extra;
+
        return;
    }
#endif // GGML_OPENCL_SOA_Q
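kernel_convert_block_q8_0 is created from program_cvt above, but its OpenCL source is not part of this excerpt. A hypothetical sketch of such a conversion kernel, consistent with the argument order set in this hunk (interleaved blocks in, flat q and d out) and using one work-item per block:

// Illustrative sketch only -- not the actual cvt.cl kernel.
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define QK8_0 32

typedef struct {
    half d;            // per-block scale
    char qs[QK8_0];    // per-block quants
} block_q8_0;

kernel void kernel_convert_block_q8_0(
        global const block_q8_0 * src,    // arg 0: data_device (interleaved blocks)
        global char             * dst_q,  // arg 1: extra->q (flat quants)
        global half             * dst_d   // arg 2: extra->d (flat scales)
) {
    const uint ib = get_global_id(0);     // one work-item per block

    dst_d[ib] = src[ib].d;
    for (int j = 0; j < QK8_0; ++j) {
        dst_q[ib*QK8_0 + j] = src[ib].qs[j];
    }
}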
@@ -3376,6 +3517,32 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
        size_t local_work_size[] = {1, 1, 1};

+       cl_event evt;
+       CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+           global_work_size, local_work_size, 0, NULL, &evt));
+       CL_CHECK(clWaitForEvents(1, &evt));
+       CL_CHECK(clEnqueueReadBuffer(
+           queue, data_device, CL_TRUE, offset,
+           size, data, 0, NULL, NULL));
+       CL_CHECK(clReleaseMemObject(data_device));
+       return;
+   }
+   if (tensor->type == GGML_TYPE_Q8_0) {
+       ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
+
+       cl_int err;
+       cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+           ggml_nbytes(tensor), NULL, &err);
+       CL_CHECK(err);
+
+       cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
+       CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+       CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+       CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+       size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+       size_t local_work_size[] = {1, 1, 1};
+
        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
            global_work_size, local_work_size, 0, NULL, &evt));
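kernel_restore_block_q8_0 performs the inverse transform; its source is likewise not shown in this diff. A plausible sketch consistent with the argument order above (flat q and d in, interleaved blocks out):

// Illustrative sketch only -- the real kernel lives alongside the other cvt kernels.
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define QK8_0 32

typedef struct {
    half d;
    char qs[QK8_0];
} block_q8_0;

kernel void kernel_restore_block_q8_0(
        global const char * src_q,   // arg 0: extra->q (flat quants)
        global const half * src_d,   // arg 1: extra->d (flat scales)
        global block_q8_0 * dst      // arg 2: data_device (interleaved blocks)
) {
    const uint ib = get_global_id(0);

    dst[ib].d = src_d[ib];
    for (int j = 0; j < QK8_0; ++j) {
        dst[ib].qs[j] = src_q[ib*QK8_0 + j];
    }
}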
@@ -6087,6 +6254,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

#ifdef GGML_OPENCL_SOA_Q
    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+   ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
#endif

    const int ne00 = src0 ? src0->ne[0] : 0;
@@ -6757,6 +6925,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            break;
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q8_0: {
+#ifdef GGML_OPENCL_SOA_Q
+           kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
+
+           // nth0 - subgroup size
+           // nth1 - number of subgroups per workgroup
+           // ndst - number of output values per workgroup = output per subgroup * number of subgroups
+           if (backend_ctx->gpu_family == INTEL) {
+               nth0 = 16;
+               nth1 = 2;
+               ndst = nth1*4;
+           } else if (backend_ctx->gpu_family == ADRENO) {
+               nth0 = 64;
+               nth1 = 2;
+               ndst = nth1*4;
+           } else {
+               GGML_ASSERT(false && "TODO: Unknown GPU");
+           }
+
+           CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
+           CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
+           CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+           CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+           CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+           CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+           CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+           CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+           CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+           CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+           CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+           CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
+           CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+           CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+           CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+           CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne0));
+           CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne1));
+           CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
+           CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
+#else
            kernel = backend_ctx->kernel_mul_mv_q8_0_f32;

            // nth0 - subgroup size
@@ -6793,6 +6999,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+#endif // GGML_OPENCL_SOA_Q
            break;
        }
        case GGML_TYPE_Q2_K:
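For context, the launch geometry implied by the flat path's nth0/nth1/ndst values works out as follows; the actual NDRange computation happens later in ggml_cl_mul_mat and is outside this hunk, so the workgroup-count formula below is an assumption:

// Adreno case from the flat q8_0 path above:
//   nth0 = 64          -> subgroup size
//   nth1 = 2           -> subgroups per workgroup
//   ndst = nth1*4 = 8  -> output values (rows of dst) per workgroup
//
// One workgroup therefore runs nth0*nth1 = 128 work-items and covers 8 rows.
// Assuming the usual mul_mv launch shape, the row dimension needs roughly
// (ne01 + ndst - 1) / ndst workgroups; e.g. ne01 = 4096 -> 512 workgroups.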