Skip to content

Commit 3e0ee39

Browse files
committed
opencl: add flattened q8_0 mv
1 parent 9cce98b commit 3e0ee39

File tree

4 files changed

+380
-10
lines changed

4 files changed

+380
-10
lines changed

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ set(GGML_OPENCL_KERNELS
8383
mul_mv_q4_0_f32_1d_16x_flat
8484
mul_mv_q6_k
8585
mul_mv_q8_0_f32
86+
mul_mv_q8_0_f32_flat
8687
mul_mv_mxfp4_f32
8788
mul_mv_mxfp4_f32_flat
8889
mul_mv_id_q4_0_f32_8x_flat

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 213 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ struct ggml_backend_opencl_context {
367367
cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
368368
cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
369369
cl_program program_mul_mv_q6_K;
370-
cl_program program_mul_mv_q8_0_f32;
370+
cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
371371
cl_program program_mul_mv_mxfp4_f32;
372372
cl_program program_mul_mv_mxfp4_f32_flat;
373373
cl_program program_mul_mv_f16_f16;
@@ -452,12 +452,13 @@ struct ggml_backend_opencl_context {
452452
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
453453
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
454454
cl_kernel kernel_convert_block_mxfp4, kernel_restore_block_mxfp4;
455+
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
455456
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
456457
cl_kernel kernel_convert_block_q4_0_noshuffle;
457458
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
458459
cl_kernel kernel_mul_mv_q6_K_f32;
459460
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
460-
cl_kernel kernel_mul_mv_q8_0_f32;
461+
cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
461462
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
462463
cl_kernel kernel_argsort_f32_i32;
463464
cl_kernel kernel_sum_rows_f32;
@@ -773,8 +774,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
773774
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
774775
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
775776
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
776-
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
777-
CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
777+
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
778+
CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
779+
CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
780+
CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
778781
GGML_LOG_CONT(".");
779782
}
780783

@@ -1012,6 +1015,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
10121015
GGML_LOG_CONT(".");
10131016
}
10141017

1018+
// mul_mv_q8_0_f32_flat
1019+
{
1020+
#ifdef GGML_OPENCL_EMBED_KERNELS
1021+
const std::string kernel_src {
1022+
#include "mul_mv_q8_0_f32_flat.cl.h"
1023+
};
1024+
#else
1025+
const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
1026+
#endif
1027+
backend_ctx->program_mul_mv_q8_0_f32_flat =
1028+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1029+
1030+
CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
1031+
GGML_LOG_CONT(".");
1032+
}
1033+
10151034
// mul_mv_mxfp4_f32
10161035
{
10171036
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2499,17 +2518,50 @@ struct ggml_tensor_extra_cl_mxfp4 {
24992518
CL_CHECK(clReleaseMemObject(q_img));
25002519
q = nullptr;
25012520
}
2502-
// Currently, q_img and d_img are only initialized when SMALL_ALLOC is
2503-
// enabled. They point to the images in ggml_backend_opencl_buffer_context.
2504-
// So, there is no need to release them here.
2505-
// TODO: initialize them for non SMALL_PATH path, or remove them.
2521+
// Currently, q_img and d_img are not used. They can be image1d_buffer_t
2522+
// that wraps around q and d to utilize image access path.
25062523
q_img = nullptr;
25072524
e_img = nullptr;
25082525
size_q = 0;
25092526
size_e = 0;
25102527
}
25112528
};
25122529

2530+
struct ggml_tensor_extra_cl_q8_0 {
2531+
cl_mem q = nullptr;
2532+
cl_mem q_img = nullptr;
2533+
2534+
cl_mem d = nullptr;
2535+
cl_mem d_img = nullptr;
2536+
2537+
size_t size_q = 0;
2538+
size_t size_d = 0;
2539+
2540+
~ggml_tensor_extra_cl_q8_0() {
2541+
reset();
2542+
}
2543+
2544+
void reset() {
2545+
// q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
2546+
// They must be properly released so that the original buffer can be
2547+
// properly released to avoid memory leak.
2548+
if (q != nullptr) {
2549+
CL_CHECK(clReleaseMemObject(q));
2550+
q = nullptr;
2551+
}
2552+
if (d != nullptr) {
2553+
CL_CHECK(clReleaseMemObject(d));
2554+
d = nullptr;
2555+
}
2556+
// Currently, q_img and d_img are not used. They can be image1d_buffer_t
2557+
// that wraps around q and d to utilize image access path.
2558+
q_img = nullptr;
2559+
d_img = nullptr;
2560+
size_q = 0;
2561+
size_d = 0;
2562+
}
2563+
};
2564+
25132565
//------------------------------------------------------------------------------
25142566
// Backend API
25152567
//------------------------------------------------------------------------------
@@ -3022,6 +3074,12 @@ struct ggml_backend_opencl_buffer_context {
30223074
for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
30233075
delete e;
30243076
}
3077+
for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
3078+
delete e;
3079+
}
3080+
for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
3081+
delete e;
3082+
}
30253083
}
30263084

30273085
ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -3069,6 +3127,21 @@ struct ggml_backend_opencl_buffer_context {
30693127
return extra;
30703128
}
30713129

3130+
// Hand out a q8_0 tensor extra, recycling one from the free pool when
// available; the extra is tracked in the in-use list until the context's
// reset() returns it to the pool.
ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
    ggml_tensor_extra_cl_q8_0 * extra = nullptr;
    if (!temp_tensor_extras_q8_0.empty()) {
        extra = temp_tensor_extras_q8_0.back();
        temp_tensor_extras_q8_0.pop_back();
    } else {
        extra = new ggml_tensor_extra_cl_q8_0();
    }

    temp_tensor_extras_q8_0_in_use.push_back(extra);

    extra->reset();
    return extra;
}
3144+
30723145
void reset() {
30733146
for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
30743147
temp_tensor_extras.push_back(e);
@@ -3084,6 +3157,11 @@ struct ggml_backend_opencl_buffer_context {
30843157
temp_tensor_extras_mxfp4.push_back(e);
30853158
}
30863159
temp_tensor_extras_mxfp4_in_use.clear();
3160+
3161+
for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
3162+
temp_tensor_extras_q8_0.push_back(e);
3163+
}
3164+
temp_tensor_extras_q8_0_in_use.clear();
30873165
}
30883166

30893167
// Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -3097,6 +3175,8 @@ struct ggml_backend_opencl_buffer_context {
30973175
std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
30983176
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
30993177
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
3178+
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
3179+
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
31003180

31013181
// The buffer_context is initially created by ggml_backend_buft_alloc_buffer
31023182
// before any tensor is initialized (at the beginning of alloc_tensor_range).
@@ -3509,6 +3589,65 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
35093589

35103590
tensor->extra = extra;
35113591

3592+
return;
3593+
}
3594+
if (tensor->type == GGML_TYPE_Q8_0) {
3595+
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
3596+
GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
3597+
3598+
// Allocate the new extra and create aliases from the original.
3599+
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
3600+
ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
3601+
3602+
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
3603+
size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
3604+
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
3605+
3606+
cl_int err;
3607+
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
3608+
ggml_nbytes(tensor), NULL, &err);
3609+
CL_CHECK(err);
3610+
CL_CHECK(clEnqueueWriteBuffer(
3611+
queue, data_device, CL_TRUE, 0,
3612+
ggml_nbytes(tensor), data, 0, NULL, NULL));
3613+
3614+
// The original tensor memory is divided into scales and quants, i.e.,
3615+
// we first store scales, then quants.
3616+
cl_buffer_region region;
3617+
3618+
// Create subbuffer for scales.
3619+
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
3620+
region.size = size_d;
3621+
extra->d = clCreateSubBuffer(
3622+
extra_orig->data_device, CL_MEM_READ_WRITE,
3623+
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
3624+
CL_CHECK(err);
3625+
auto previous_origin = region.origin;
3626+
3627+
// Create subbuffer for quants.
3628+
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
3629+
region.size = size_q;
3630+
extra->q = clCreateSubBuffer(
3631+
extra_orig->data_device, CL_MEM_READ_WRITE,
3632+
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
3633+
CL_CHECK(err);
3634+
3635+
cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
3636+
3637+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
3638+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
3639+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
3640+
3641+
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
3642+
size_t local_work_size[] = {64, 1, 1};
3643+
3644+
cl_event evt;
3645+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3646+
CL_CHECK(clWaitForEvents(1, &evt));
3647+
CL_CHECK(clReleaseMemObject(data_device));
3648+
3649+
tensor->extra = extra;
3650+
35123651
return;
35133652
}
35143653
#endif // GGML_OPENCL_SOA_Q
@@ -3582,6 +3721,32 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
35823721
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
35833722
size_t local_work_size[] = {1, 1, 1};
35843723

3724+
cl_event evt;
3725+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
3726+
global_work_size, local_work_size, 0, NULL, &evt));
3727+
CL_CHECK(clWaitForEvents(1, &evt));
3728+
CL_CHECK(clEnqueueReadBuffer(
3729+
queue, data_device, CL_TRUE, offset,
3730+
size, data, 0, NULL, NULL));
3731+
CL_CHECK(clReleaseMemObject(data_device));
3732+
return;
3733+
}
3734+
if (tensor->type == GGML_TYPE_Q8_0) {
3735+
ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
3736+
3737+
cl_int err;
3738+
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
3739+
ggml_nbytes(tensor), NULL, &err);
3740+
CL_CHECK(err);
3741+
3742+
cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
3743+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
3744+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
3745+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
3746+
3747+
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
3748+
size_t local_work_size[] = {1, 1, 1};
3749+
35853750
cl_event evt;
35863751
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
35873752
global_work_size, local_work_size, 0, NULL, &evt));
@@ -6307,6 +6472,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
63076472
#ifdef GGML_OPENCL_SOA_Q
63086473
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
63096474
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
6475+
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
63106476
#endif
63116477

63126478
const int ne00 = src0 ? src0->ne[0] : 0;
@@ -6977,6 +7143,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
69777143
break;
69787144
case GGML_TYPE_Q4_1:
69797145
case GGML_TYPE_Q8_0: {
7146+
#ifdef GGML_OPENCL_SOA_Q
7147+
kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
7148+
7149+
// nth0 - subgroup size
7150+
// nth1 - number of subgroups per workgroup
7151+
// ndst - number of output values per workgroup = output per subgroup * number of subgroups
7152+
if (backend_ctx->gpu_family == INTEL) {
7153+
nth0 = 16;
7154+
nth1 = 2;
7155+
ndst = nth1*4;
7156+
} else if (backend_ctx->gpu_family == ADRENO) {
7157+
nth0 = 64;
7158+
nth1 = 2;
7159+
ndst = nth1*4;
7160+
} else {
7161+
GGML_ASSERT(false && "TODO: Unknown GPU");
7162+
}
7163+
7164+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
7165+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
7166+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
7167+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
7168+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
7169+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
7170+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
7171+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
7172+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
7173+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
7174+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
7175+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
7176+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
7177+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
7178+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
7179+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
7180+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
7181+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
7182+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
7183+
#else
69807184
kernel = backend_ctx->kernel_mul_mv_q8_0_f32;
69817185

69827186
// nth0 - subgroup size
@@ -7013,6 +7217,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
70137217
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
70147218
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
70157219
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
7220+
#endif // GGML_OPENCL_SOA_Q
70167221
break;
70177222
}
70187223
case GGML_TYPE_Q2_K:

ggml/src/ggml-opencl/kernels/cvt.cl

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,8 @@ kernel void kernel_convert_block_q4_0_noshuffle(
117117
}
118118
}
119119

120-
121120
//------------------------------------------------------------------------------
122-
// block_q4_0
121+
// block_mxfp4
123122
//------------------------------------------------------------------------------
124123
#define QK_MXFP4 32
125124
struct block_mxfp4 {
@@ -162,3 +161,42 @@ kernel void kernel_restore_block_mxfp4(
162161
b->qs[i] = q[i];
163162
}
164163
}
164+
165+
//------------------------------------------------------------------------------
166+
// block_q8_0
167+
//------------------------------------------------------------------------------
168+
// A single q8_0 block: QK8_0 signed 8-bit quants sharing one fp16 scale.
// Layout must match the host-side block_q8_0 byte-for-byte.
typedef struct {
    half d;        // delta
    char qs[QK8_0]; // quants
} block_q8_0;
172+
173+
// Split an array of interleaved block_q8_0 into separate quant (dst_q) and
// scale (dst_d) buffers (struct-of-arrays layout). One work item converts
// one block.
kernel void kernel_convert_block_q8_0(
    global block_q8_0 * src0,
    global uchar * dst_q,
    global half * dst_d
) {
    const size_t ib = get_global_id(0);

    global block_q8_0 * b = src0 + ib;
    global uchar * q = dst_q + QK8_0*ib;

    dst_d[ib] = b->d;

    for (int i = 0; i < QK8_0; ++i) {
        q[i] = b->qs[i];
    }
}
188+
189+
kernel void kernel_restore_block_q8_0(
190+
global uchar * src_q,
191+
global half * src_d,
192+
global block_q8_0 * dst
193+
) {
194+
global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0);
195+
global uchar * q = (global uchar *) src_q + QK8_0*get_global_id(0);
196+
global half * d = (global half *) src_d + get_global_id(0);
197+
198+
b->d = *d;
199+
for (int i = 0; i < QK8_0; ++i) {
200+
b->qs[i] = q[i];
201+
}
202+
}

0 commit comments

Comments
 (0)