Commit 927e646

opencl: add flattened q8_0 mv
1 parent 5af525e commit 927e646

4 files changed, +375 −2 lines changed

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -83,6 +83,7 @@ set(GGML_OPENCL_KERNELS
     mul_mv_q4_0_f32_1d_16x_flat
     mul_mv_q6_k
     mul_mv_q8_0_f32
+    mul_mv_q8_0_f32_flat
     mul_mv_mxfp4_f32
     mul_mv_id_q4_0_f32_8x_flat
     mul_mv_id_q8_0_f32

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 209 additions & 2 deletions
@@ -367,7 +367,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
     cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
     cl_program program_mul_mv_q6_K;
-    cl_program program_mul_mv_q8_0_f32;
+    cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
     cl_program program_mul_mv_mxfp4_f32;
     cl_program program_mul_mv_f16_f16;
     cl_program program_mul_mv_f16_f32_1row;
@@ -449,11 +449,12 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_f16_f32_tiled;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
+    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
-    cl_kernel kernel_mul_mv_q8_0_f32;
+    cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
     cl_kernel kernel_mul_mv_mxfp4_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
@@ -769,6 +770,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
         GGML_LOG_CONT(".");
     }

@@ -1006,6 +1009,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mv_q8_0_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q8_0_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
+#endif
+        backend_ctx->program_mul_mv_q8_0_f32_flat =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_mxfp4_f32
     {
#ifdef GGML_OPENCL_EMBED_KERNELS
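One idiom worth noting in these loader blocks: CL_CHECK wraps the expression (kernel = clCreateKernel(..., &err), err), which uses the comma operator to perform the assignment first and then hand err to the macro. A stand-in sketch of the idiom (the real CL_CHECK is defined elsewhere in ggml-opencl.cpp and may differ in detail):

#include <CL/cl.h>
#include <cstdio>
#include <cstdlib>

// Stand-in sketch only. The key point is the comma operator:
//   (kernel = clCreateKernel(program, "name", &err), err)
// performs the assignment first, then the whole expression evaluates
// to err, which is what the macro inspects.
#define CL_CHECK(expr)                                        \
    do {                                                      \
        cl_int err_ = (expr);                                 \
        if (err_ != CL_SUCCESS) {                             \
            fprintf(stderr, "OpenCL error %d\n", (int)err_);  \
            abort();                                          \
        }                                                     \
    } while (0)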
@@ -2427,6 +2446,41 @@ struct ggml_tensor_extra_cl_q4_0 {
     }
 };
 
+struct ggml_tensor_extra_cl_q8_0 {
+    cl_mem q = nullptr;
+    cl_mem q_img = nullptr;
+
+    cl_mem d = nullptr;
+    cl_mem d_img = nullptr;
+
+    size_t size_q = 0;
+    size_t size_d = 0;
+
+    ~ggml_tensor_extra_cl_q8_0() {
+        reset();
+    }
+
+    void reset() {
+        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+        // They must be released so that the original buffer itself can be
+        // released without leaking memory.
+        if (q != nullptr) {
+            CL_CHECK(clReleaseMemObject(q));
+            q = nullptr;
+        }
+        if (d != nullptr) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+        // Currently, q_img and d_img are not used. They could be image1d_buffer_t
+        // objects that wrap q and d to utilize the image access path.
+        q_img = nullptr;
+        d_img = nullptr;
+        size_q = 0;
+        size_d = 0;
+    }
+};
+
 //------------------------------------------------------------------------------
 // Backend API
 //------------------------------------------------------------------------------
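For a sense of scale: a Q8_0 block is one fp16 delta plus QK8_0 = 32 int8 quants, so the flattened layout stores 2 bytes per block in d and 32 bytes per block in q. A back-of-the-envelope sketch of the arithmetic (the 4096x4096 shape is purely an example):

#include <cstdint>
#include <cstdio>

// Sketch of the Q8_0 size arithmetic assumed by this commit: one fp16
// scale followed by 32 int8 quants per block (QK8_0 = 32 in ggml).
constexpr int QK8_0 = 32;

int main() {
    // Hypothetical 4096 x 4096 Q8_0 weight matrix.
    const int64_t nelements = 4096LL * 4096LL;
    const int64_t nblocks   = nelements / QK8_0;

    const size_t size_d = nblocks * sizeof(uint16_t); // fp16 scale per block
    const size_t size_q = nblocks * QK8_0;            // 1 byte per quant

    // Matches the GGML_ASSERT in set_tensor: size_d + size_q == ggml_nbytes.
    printf("blocks=%lld size_d=%zu size_q=%zu total=%zu\n",
           (long long)nblocks, size_d, size_q, size_d + size_q);
    return 0;
}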
@@ -2932,6 +2986,12 @@ struct ggml_backend_opencl_buffer_context {
         for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
             delete e;
         }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
+            delete e;
+        }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            delete e;
+        }
     }
 
     ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -2964,6 +3024,21 @@ struct ggml_backend_opencl_buffer_context {
         return extra;
     }
 
+    ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
+        ggml_tensor_extra_cl_q8_0 * extra;
+        if (temp_tensor_extras_q8_0.empty()) {
+            extra = new ggml_tensor_extra_cl_q8_0();
+        } else {
+            extra = temp_tensor_extras_q8_0.back();
+            temp_tensor_extras_q8_0.pop_back();
+        }
+
+        temp_tensor_extras_q8_0_in_use.push_back(extra);
+
+        extra->reset();
+        return extra;
+    }
+
     void reset() {
         for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
             temp_tensor_extras.push_back(e);
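The new alloc routine follows the same recycling discipline as its q4_0 counterpart: pop an extra off the free list (or allocate a fresh one), park it in the in-use list, and let reset() in the next hunk return everything to the free list between graph evaluations rather than freeing it. A standalone sketch of that pattern, with hypothetical names (not the ggml ones):

#include <vector>

// Standalone sketch of the extra-recycling pattern used by the buffer
// context above. T must provide a reset() method, as the extras do.
template <typename T>
struct recycle_pool {
    std::vector<T *> free_list, in_use;

    T * acquire() {
        T * e;
        if (free_list.empty()) {
            e = new T();
        } else {
            e = free_list.back();
            free_list.pop_back();
        }
        in_use.push_back(e);
        e->reset();
        return e;
    }

    // Return everything to the free list; nothing is deleted until teardown.
    void reset() {
        for (T * e : in_use) free_list.push_back(e);
        in_use.clear();
    }

    ~recycle_pool() {
        for (T * e : free_list) delete e;
        for (T * e : in_use) delete e;
    }
};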
@@ -2974,6 +3049,11 @@ struct ggml_backend_opencl_buffer_context {
             temp_tensor_extras_q4_0.push_back(e);
         }
         temp_tensor_extras_q4_0_in_use.clear();
+
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            temp_tensor_extras_q8_0.push_back(e);
+        }
+        temp_tensor_extras_q8_0_in_use.clear();
     }
 
     // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -2985,6 +3065,8 @@ struct ggml_backend_opencl_buffer_context {
     std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
 
     // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
     // before any tensor is initialized (at the beginning of alloc_tensor_range).
@@ -3327,6 +3409,65 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     }
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
+
+        // Allocate the new extra and create aliases from the original.
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
+
+        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
+        GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+        CL_CHECK(clEnqueueWriteBuffer(
+            queue, data_device, CL_TRUE, 0,
+            ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, then quants.
+        cl_buffer_region region;
+
+        // Create subbuffer for scales.
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+        region.size = size_d;
+        extra->d = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        auto previous_origin = region.origin;
+
+        // Create subbuffer for quants.
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
+        region.size = size_q;
+        extra->q = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseMemObject(data_device));
+
+        tensor->extra = extra;
+
         return;
     }
 #endif // GGML_OPENCL_SOA_Q
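Both sub-buffer origins go through align_to because clCreateSubBuffer rejects origins that do not respect the device's base-address alignment (backend_ctx->alignment presumably holds that value in bytes). A minimal sketch of what such a helper looks like, assuming it rounds up:

#include <cstddef>

// Sketch of an align_to helper consistent with its use above (assumption:
// round `offset` up to the next multiple of `alignment`). A misaligned
// sub-buffer origin makes clCreateSubBuffer fail with
// CL_MISALIGNED_SUB_BUFFER_OFFSET.
static size_t align_to(size_t offset, size_t alignment) {
    return ((offset + alignment - 1) / alignment) * alignment;
}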
@@ -3375,6 +3516,32 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};
 
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+            global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clEnqueueReadBuffer(
+            queue, data_device, CL_TRUE, offset,
+            size, data, 0, NULL, NULL));
+        CL_CHECK(clReleaseMemObject(data_device));
+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {1, 1, 1};
+
         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
             global_work_size, local_work_size, 0, NULL, &evt));
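Since set_tensor flattens Q8_0 data on upload and get_tensor restores it on download, a round trip through the backend should be byte-identical. A hedged sketch of such a check against the public ggml-backend API (entry points like ggml_backend_opencl_init and ggml_backend_alloc_ctx_tensors are assumed from the surrounding tree, and the build is assumed to define GGML_OPENCL_SOA_Q):

// Hedged sketch: upload Q8_0 bytes (flattened on device by set_tensor),
// read them back (restored by get_tensor), and compare.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-opencl.h"

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    ggml_backend_t backend = ggml_backend_opencl_init();

    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true, // tensor data lives in the backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, 64, 4);

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    std::vector<uint8_t> src(ggml_nbytes(t), 0x5A), dst(ggml_nbytes(t));
    ggml_backend_tensor_set(t, src.data(), 0, src.size());
    ggml_backend_tensor_get(t, dst.data(), 0, dst.size());
    assert(src == dst); // flatten + restore must be lossless

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}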
@@ -6086,6 +6253,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
 #ifdef GGML_OPENCL_SOA_Q
     ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
 #endif
 
     const int ne00 = src0 ? src0->ne[0] : 0;
@@ -6756,6 +6924,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             break;
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q8_0: {
+#ifdef GGML_OPENCL_SOA_Q
+            kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
+
+            // nth0 - subgroup size
+            // nth1 - number of subgroups per workgroup
+            // ndst - number of output values per workgroup = outputs per subgroup * number of subgroups
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem), &extra0_q8_0->q));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem), &extra0_q8_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+#else
             kernel = backend_ctx->kernel_mul_mv_q8_0_f32;
 
             // nth0 - subgroup size
@@ -6792,6 +6998,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
             CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
             CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+#endif // GGML_OPENCL_SOA_Q
             break;
         }
         case GGML_TYPE_Q2_K:
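The nth0/nth1/ndst comments compress a fair amount: a workgroup holds nth1 subgroups of nth0 threads each, and each subgroup contributes 4 output rows, so a workgroup covers ndst = nth1*4 rows of the result. A small worked example (the global-size computation itself is outside this hunk, so the workgroup count below is only illustrative):

#include <cstdio>

// Illustration of the ndst arithmetic in the dispatch above: on the
// Adreno path, 2 subgroups of 64 threads produce 2*4 = 8 output rows
// per workgroup; on the Intel path, 2 subgroups of 16 threads do the same.
int main() {
    const int nth0 = 64, nth1 = 2; // Adreno values from the hunk above
    const int ndst = nth1 * 4;     // 8 output rows per workgroup
    const int ne01 = 4096;         // hypothetical number of weight rows
    printf("threads/wg=%d rows/wg=%d workgroups for ne01=%d: %d\n",
           nth0 * nth1, ndst, ne01, (ne01 + ndst - 1) / ndst);
    return 0;
}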

ggml/src/ggml-opencl/kernels/cvt.cl

Lines changed: 39 additions & 0 deletions
@@ -116,3 +116,42 @@ kernel void kernel_convert_block_q4_0_noshuffle(
 #endif
     }
 }
+
+//------------------------------------------------------------------------------
+// block_q8_0
+//------------------------------------------------------------------------------
+typedef struct {
+    half d;         // delta
+    char qs[QK8_0]; // quants
+} block_q8_0;
+
+kernel void kernel_convert_block_q8_0(
+    global block_q8_0 * src0,
+    global uchar * dst_q,
+    global half * dst_d
+) {
+    global block_q8_0 * b = (global block_q8_0 *) src0 + get_global_id(0);
+    global uchar * q = (global uchar *) dst_q + QK8_0*get_global_id(0);
+    global half * d = (global half *) dst_d + get_global_id(0);
+
+    *d = b->d;
+
+    for (int i = 0; i < QK8_0; ++i) {
+        q[i] = b->qs[i];
+    }
+}
+
+kernel void kernel_restore_block_q8_0(
+    global uchar * src_q,
+    global half * src_d,
+    global block_q8_0 * dst
+) {
+    global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0);
+    global uchar * q = (global uchar *) src_q + QK8_0*get_global_id(0);
+    global half * d = (global half *) src_d + get_global_id(0);
+
+    b->d = *d;
+    for (int i = 0; i < QK8_0; ++i) {
+        b->qs[i] = q[i];
+    }
+}
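The two kernels are a plain byte-level AoS-to-SoA shuffle, so a host-side mirror is easy to write and handy for unit-testing them. A sketch, assuming only the block layout above (the fp16 delta is treated as raw bits, since no arithmetic is done on it):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Host-side mirror of kernel_convert_block_q8_0 / kernel_restore_block_q8_0,
// usable as a reference when testing the device kernels.
constexpr int QK8_0 = 32;

struct block_q8_0 {
    uint16_t d;         // fp16 delta, kept as raw bits
    int8_t   qs[QK8_0]; // quants
};

static void convert(const std::vector<block_q8_0> & src,
                    std::vector<int8_t> & dst_q, std::vector<uint16_t> & dst_d) {
    dst_q.resize(src.size() * QK8_0);
    dst_d.resize(src.size());
    for (size_t i = 0; i < src.size(); ++i) {
        dst_d[i] = src[i].d;
        std::memcpy(&dst_q[i * QK8_0], src[i].qs, QK8_0);
    }
}

static void restore(const std::vector<int8_t> & src_q,
                    const std::vector<uint16_t> & src_d,
                    std::vector<block_q8_0> & dst) {
    dst.resize(src_d.size());
    for (size_t i = 0; i < dst.size(); ++i) {
        dst[i].d = src_d[i];
        std::memcpy(dst[i].qs, &src_q[i * QK8_0], QK8_0);
    }
}

int main() {
    std::vector<block_q8_0> in(4), out;
    for (int i = 0; i < 4; ++i) {
        in[i].d = (uint16_t)(0x3C00 + i); // arbitrary fp16 bit patterns
        for (int j = 0; j < QK8_0; ++j) in[i].qs[j] = (int8_t)(i * QK8_0 + j);
    }
    std::vector<int8_t> q;
    std::vector<uint16_t> d;
    convert(in, q, d);
    restore(q, d, out);
    // Flatten + restore must round-trip exactly.
    assert(std::memcmp(in.data(), out.data(), in.size() * sizeof(block_q8_0)) == 0);
    return 0;
}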
