@@ -367,7 +367,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
     cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
     cl_program program_mul_mv_q6_K;
-    cl_program program_mul_mv_q8_0_f32;
+    cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
     cl_program program_mul_mv_mxfp4_f32;
     cl_program program_mul_mv_mxfp4_f32_flat;
     cl_program program_mul_mv_f16_f16;
@@ -452,12 +452,13 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
     cl_kernel kernel_convert_block_mxfp4, kernel_restore_block_mxfp4;
+    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
     cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
-    cl_kernel kernel_mul_mv_q8_0_f32;
+    cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
     cl_kernel kernel_sum_rows_f32;
@@ -773,8 +774,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
-        CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
-        CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
         GGML_LOG_CONT(".");
     }

@@ -1012,6 +1015,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

+    // mul_mv_q8_0_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q8_0_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
+#endif
+        backend_ctx->program_mul_mv_q8_0_f32_flat =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
    // mul_mv_mxfp4_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
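Note on the embed pattern used in this hunk: the `#include "mul_mv_q8_0_f32_flat.cl.h"` line only works because the generated header expands to a single string literal that brace-initializes the `std::string`; otherwise the `.cl` source is read from disk at runtime. The exact contents of the generated header are produced by the build system and are not part of this diff; the snippet below is a self-contained, hedged illustration of the trick only, with a dummy raw string standing in for the real header.

    // Illustration of the embed-a-kernel-source pattern (not llama.cpp code).
    #include <iostream>
    #include <string>

    int main() {
        const std::string kernel_src {
            // In the real build, `#include "mul_mv_q8_0_f32_flat.cl.h"` sits here; the generated
            // header is assumed to contain one (raw) string literal holding the .cl source.
            R"(__kernel void dummy() { })"
        };
        std::cout << kernel_src << "\n";   // the OpenCL C source is now available at runtime
        return 0;
    }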
@@ -2499,17 +2518,50 @@ struct ggml_tensor_extra_cl_mxfp4 {
            CL_CHECK(clReleaseMemObject(q_img));
            q = nullptr;
        }
-        // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
-        // enabled. They point to the images in ggml_backend_opencl_buffer_context.
-        // So, there is no need to release them here.
-        // TODO: initialize them for non SMALL_PATH path, or remove them.
+        // Currently, q_img and d_img are not used. They can be image1d_buffer_t
+        // that wraps around q and d to utilize image access path.
        q_img = nullptr;
        e_img = nullptr;
        size_q = 0;
        size_e = 0;
    }
};

+struct ggml_tensor_extra_cl_q8_0 {
+    cl_mem q = nullptr;
+    cl_mem q_img = nullptr;
+
+    cl_mem d = nullptr;
+    cl_mem d_img = nullptr;
+
+    size_t size_q = 0;
+    size_t size_d = 0;
+
+    ~ggml_tensor_extra_cl_q8_0() {
+        reset();
+    }
+
+    void reset() {
+        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+        // They must be properly released so that the original buffer can be
+        // properly released to avoid memory leak.
+        if (q != nullptr) {
+            CL_CHECK(clReleaseMemObject(q));
+            q = nullptr;
+        }
+        if (d != nullptr) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+        // Currently, q_img and d_img are not used. They can be image1d_buffer_t
+        // that wraps around q and d to utilize image access path.
+        q_img = nullptr;
+        d_img = nullptr;
+        size_q = 0;
+        size_d = 0;
+    }
+};
+
 // ------------------------------------------------------------------------------
 // Backend API
 // ------------------------------------------------------------------------------
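For reference, the "flat" layout that `ggml_tensor_extra_cl_q8_0` describes splits each interleaved Q8_0 block (one fp16 scale plus 32 int8 quants in standard ggml) into two separate buffers: `d` for the scales and `q` for the quants. The sketch below is a minimal host-side reference of that AoS-to-SoA split and its inverse, assuming the standard `block_q8_0` definition; the real work is done on the GPU by `kernel_convert_block_q8_0` and `kernel_restore_block_q8_0`, whose OpenCL source is not part of this hunk.

    // CPU-side sketch of the Q8_0 flat (SoA) conversion; illustrative only.
    #include <cstdint>
    #include <cstddef>

    #define QK8_0 32
    typedef uint16_t ggml_half;          // fp16 storage type

    struct block_q8_0 {
        ggml_half d;                     // per-block scale
        int8_t    qs[QK8_0];             // 32 quantized values
    };

    // Split n_blocks interleaved blocks into separate scale and quant arrays.
    static void convert_block_q8_0(const block_q8_0 * src, int8_t * q, ggml_half * d, size_t n_blocks) {
        for (size_t ib = 0; ib < n_blocks; ++ib) {
            d[ib] = src[ib].d;
            for (int i = 0; i < QK8_0; ++i) {
                q[ib*QK8_0 + i] = src[ib].qs[i];
            }
        }
    }

    // Inverse: rebuild interleaved blocks from the flat arrays (what the restore path does).
    static void restore_block_q8_0(const int8_t * q, const ggml_half * d, block_q8_0 * dst, size_t n_blocks) {
        for (size_t ib = 0; ib < n_blocks; ++ib) {
            dst[ib].d = d[ib];
            for (int i = 0; i < QK8_0; ++i) {
                dst[ib].qs[i] = q[ib*QK8_0 + i];
            }
        }
    }

    int main() {
        block_q8_0 blocks[2] = {};
        int8_t     q[2*QK8_0];
        ggml_half  d[2];
        convert_block_q8_0(blocks, q, d, 2);
        restore_block_q8_0(q, d, blocks, 2);
        return 0;
    }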
@@ -3022,6 +3074,12 @@ struct ggml_backend_opencl_buffer_context {
        for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
            delete e;
        }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
+            delete e;
+        }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            delete e;
+        }
    }

    ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -3069,6 +3127,21 @@ struct ggml_backend_opencl_buffer_context {
        return extra;
    }

+    ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
+        ggml_tensor_extra_cl_q8_0 * extra;
+        if (temp_tensor_extras_q8_0.empty()) {
+            extra = new ggml_tensor_extra_cl_q8_0();
+        } else {
+            extra = temp_tensor_extras_q8_0.back();
+            temp_tensor_extras_q8_0.pop_back();
+        }
+
+        temp_tensor_extras_q8_0_in_use.push_back(extra);
+
+        extra->reset();
+        return extra;
+    }
+
    void reset() {
        for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
            temp_tensor_extras.push_back(e);
@@ -3084,6 +3157,11 @@ struct ggml_backend_opencl_buffer_context {
            temp_tensor_extras_mxfp4.push_back(e);
        }
        temp_tensor_extras_mxfp4_in_use.clear();
+
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            temp_tensor_extras_q8_0.push_back(e);
+        }
+        temp_tensor_extras_q8_0_in_use.clear();
    }

    // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -3097,6 +3175,8 @@ struct ggml_backend_opencl_buffer_context {
    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
    std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
    std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;

    // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
    // before any tensor is initialized (at the beginning of alloc_tensor_range).
@@ -3509,6 +3589,65 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

        tensor->extra = extra;

+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
+
+        // Allocate the new extra and create aliases from the original.
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
+
+        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
+        GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+        CL_CHECK(clEnqueueWriteBuffer(
+            queue, data_device, CL_TRUE, 0,
+            ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, then quants.
+        cl_buffer_region region;
+
+        // Create subbuffer for scales.
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+        region.size = size_d;
+        extra->d = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        auto previous_origin = region.origin;
+
+        // Create subbuffer for quants.
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
+        region.size = size_q;
+        extra->q = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseMemObject(data_device));
+
+        tensor->extra = extra;
+
        return;
    }
#endif // GGML_OPENCL_SOA_Q
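Worked example of the size arithmetic asserted in the hunk above, for a hypothetical 4096x4096 Q8_0 tensor: the block size is 32, so there are 524288 blocks, `size_d` is 524288 * 2 bytes of scales and `size_q` is 524288 * 32 bytes of quants, which together equal `ggml_nbytes` (524288 * 34, since an interleaved Q8_0 block is a 2-byte scale plus 32 bytes of quants). A minimal self-contained check of that arithmetic:

    // Sanity check of the scale/quant size split for a hypothetical 4096x4096 Q8_0 tensor.
    #include <cassert>
    #include <cstddef>

    int main() {
        const size_t n_elements = 4096u * 4096u;
        const size_t blck_size  = 32;                       // ggml_blck_size(GGML_TYPE_Q8_0)
        const size_t n_blocks   = n_elements / blck_size;   // 524288 blocks
        const size_t size_d     = n_blocks * 2;             // sizeof(ggml_fp16_t) == 2
        const size_t size_q     = n_blocks * blck_size;     // one int8 per element
        const size_t nbytes     = n_blocks * (2 + 32);      // sizeof(block_q8_0) == 34
        assert(size_d + size_q == nbytes);                  // mirrors the GGML_ASSERT above
        return 0;
    }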
@@ -3582,6 +3721,32 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
        size_t local_work_size[] = {1, 1, 1};

+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+            global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clEnqueueReadBuffer(
+            queue, data_device, CL_TRUE, offset,
+            size, data, 0, NULL, NULL));
+        CL_CHECK(clReleaseMemObject(data_device));
+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {1, 1, 1};
+
        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
            global_work_size, local_work_size, 0, NULL, &evt));
@@ -6307,6 +6472,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
#ifdef GGML_OPENCL_SOA_Q
    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
    ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
+    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
#endif

    const int ne00 = src0 ? src0->ne[0] : 0;
@@ -6977,6 +7143,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            break;
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q8_0: {
+#ifdef GGML_OPENCL_SOA_Q
+            kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
+
+            // nth0 - subgroup size
+            // nth1 - number of subgroups per workgroup
+            // ndst - number of output values per workgroup = output per subgroup * number of subgroups
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
+#else
            kernel = backend_ctx->kernel_mul_mv_q8_0_f32;

            // nth0 - subgroup size
@@ -7013,6 +7217,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+#endif // GGML_OPENCL_SOA_Q
            break;
        }
        case GGML_TYPE_Q2_K:
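For context on the nth0/nth1/ndst choices in the flat Q8_0 path above: with the Adreno values (nth0 = 64, nth1 = 2, ndst = nth1*4 = 8), each workgroup runs 64*2 = 128 work-items and produces 8 output rows, so the host needs roughly ceil(ne01 / ndst) workgroups along the row dimension. The actual NDRange computation happens later in ggml_cl_mul_mat and is outside this diff; the sketch below only illustrates that arithmetic under those assumptions, with a hypothetical ne01.

    // Illustrative workgroup-count arithmetic for the flat Q8_0 path (assumed launch shape).
    #include <cstdio>

    int main() {
        const int nth0 = 64;            // subgroup size (Adreno case above)
        const int nth1 = 2;             // subgroups per workgroup
        const int ndst = nth1 * 4;      // output rows per workgroup = 8
        const int ne01 = 4096;          // hypothetical number of rows in src0

        const int n_workgroups = (ne01 + ndst - 1) / ndst;   // 512 workgroups along dim 0
        printf("%d work-items per workgroup, %d workgroups\n", nth0 * nth1, n_workgroups);
        return 0;
    }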