
Commit 124dd54 (parent: fa9dc15)

opencl: add flattened q8_0 mv

File tree: 4 files changed, +375 -2 lines. Three of the four files appear below; the fourth, ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl, adds the new kernel itself.

ggml/src/ggml-opencl/CMakeLists.txt (1 addition, 0 deletions)
@@ -83,6 +83,7 @@ set(GGML_OPENCL_KERNELS
     mul_mv_q4_0_f32_1d_16x_flat
     mul_mv_q6_k
     mul_mv_q8_0_f32
+    mul_mv_q8_0_f32_flat
     mul_mv_mxfp4_f32
     mul_mv_id_q4_0_f32_8x_flat
     mul_mv_id_q8_0_f32

ggml/src/ggml-opencl/ggml-opencl.cpp (209 additions, 2 deletions)
@@ -367,7 +367,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
     cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
     cl_program program_mul_mv_q6_K;
-    cl_program program_mul_mv_q8_0_f32;
+    cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
     cl_program program_mul_mv_mxfp4_f32;
     cl_program program_mul_mv_f16_f16;
     cl_program program_mul_mv_f16_f32_1row;
@@ -449,11 +449,12 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_f16_f32_tiled;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
+    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
-    cl_kernel kernel_mul_mv_q8_0_f32;
+    cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
     cl_kernel kernel_mul_mv_mxfp4_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
@@ -769,6 +770,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
         GGML_LOG_CONT(".");
     }

@@ -1006,6 +1009,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // mul_mv_q8_0_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q8_0_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
+#endif
+        backend_ctx->program_mul_mv_q8_0_f32_flat =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_mxfp4_f32
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
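The `#include` inside a braced initializer is the backend's embedding idiom. As a rough illustration only (the real header is generated by the build and is not part of this diff, so its exact contents are an assumption), a generated `mul_mv_q8_0_f32_flat.cl.h` would hold the kernel source as one string literal:

    // Hypothetical shape of a generated .cl.h header (illustration only):
    // the build wraps the OpenCL C source in a raw string literal, so that
    //
    //     const std::string kernel_src {
    //         #include "mul_mv_q8_0_f32_flat.cl.h"
    //     };
    //
    // compiles to a std::string initialized from that literal.
    R"(
    kernel void kernel_mul_mv_q8_0_f32_flat(/* ... */) { /* ... */ }
    )"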
@@ -2427,6 +2446,41 @@ struct ggml_tensor_extra_cl_q4_0 {
     }
 };

+struct ggml_tensor_extra_cl_q8_0 {
+    cl_mem q = nullptr;
+    cl_mem q_img = nullptr;
+
+    cl_mem d = nullptr;
+    cl_mem d_img = nullptr;
+
+    size_t size_q = 0;
+    size_t size_d = 0;
+
+    ~ggml_tensor_extra_cl_q8_0() {
+        reset();
+    }
+
+    void reset() {
+        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+        // They must be properly released so that the original buffer can also be
+        // released, avoiding a memory leak.
+        if (q != nullptr) {
+            CL_CHECK(clReleaseMemObject(q));
+            q = nullptr;
+        }
+        if (d != nullptr) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+        // Currently, q_img and d_img are not used. They can be image1d_buffer_t
+        // that wrap around q and d to utilize the image access path.
+        q_img = nullptr;
+        d_img = nullptr;
+        size_q = 0;
+        size_d = 0;
+    }
+};
+
 //------------------------------------------------------------------------------
 // Backend API
 //------------------------------------------------------------------------------
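To make the layout behind `q` and `d` concrete, here is a minimal host-side sketch (not part of the commit) of flattening an array-of-blocks q8_0 tensor into the two flat arrays these subbuffers alias, assuming ggml's q8_0 parameters (QK8_0 = 32, one fp16 scale per block):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    constexpr int QK8_0 = 32;

    struct block_q8_0 {
        uint16_t d;         // fp16 scale bits (delta)
        int8_t   qs[QK8_0]; // 32 one-byte quants
    };

    // Scales in one array, quants in another -- the same split the q/d
    // subbuffers represent in ggml_backend_opencl_buffer_set_tensor later
    // in this diff.
    void flatten_q8_0(const std::vector<block_q8_0> & blocks,
                      std::vector<uint16_t> & d, std::vector<int8_t> & q) {
        d.resize(blocks.size());
        q.resize(blocks.size() * QK8_0);
        for (size_t i = 0; i < blocks.size(); ++i) {
            d[i] = blocks[i].d;
            std::memcpy(q.data() + i * QK8_0, blocks[i].qs, QK8_0);
        }
    }

On device this same transform is done by kernel_convert_block_q8_0 in cvt.cl below.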
@@ -2930,6 +2984,12 @@ struct ggml_backend_opencl_buffer_context {
         for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
             delete e;
         }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
+            delete e;
+        }
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            delete e;
+        }
     }

     ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -2962,6 +3022,21 @@ struct ggml_backend_opencl_buffer_context {
         return extra;
     }

+    ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
+        ggml_tensor_extra_cl_q8_0 * extra;
+        if (temp_tensor_extras_q8_0.empty()) {
+            extra = new ggml_tensor_extra_cl_q8_0();
+        } else {
+            extra = temp_tensor_extras_q8_0.back();
+            temp_tensor_extras_q8_0.pop_back();
+        }
+
+        temp_tensor_extras_q8_0_in_use.push_back(extra);
+
+        extra->reset();
+        return extra;
+    }
+
     void reset() {
         for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
             temp_tensor_extras.push_back(e);
@@ -2972,6 +3047,11 @@ struct ggml_backend_opencl_buffer_context {
             temp_tensor_extras_q4_0.push_back(e);
         }
         temp_tensor_extras_q4_0_in_use.clear();
+
+        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
+            temp_tensor_extras_q8_0.push_back(e);
+        }
+        temp_tensor_extras_q8_0_in_use.clear();
     }

     // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -2983,6 +3063,8 @@ struct ggml_backend_opencl_buffer_context {
     std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
+    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;

     // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
     // before any tensor is initialized (at the beginning of alloc_tensor_range).
@@ -3325,6 +3407,65 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         }
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS

+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
+
+        // Allocate the new extra and create aliases from the original.
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
+
+        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
+        GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+        CL_CHECK(clEnqueueWriteBuffer(
+            queue, data_device, CL_TRUE, 0,
+            ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, then quants.
+        cl_buffer_region region;
+
+        // Create subbuffer for scales.
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+        region.size = size_d;
+        extra->d = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        auto previous_origin = region.origin;
+
+        // Create subbuffer for quants.
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
+        region.size = size_q;
+        extra->q = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseMemObject(data_device));
+
+        tensor->extra = extra;
+
         return;
     }
 #endif // GGML_OPENCL_SOA_Q
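The `size_d`/`size_q` split works out because a q8_0 block is 34 bytes: a 2-byte fp16 scale plus 32 one-byte quants. A small, self-contained check of that arithmetic (illustrative only; the 4096x4096 shape is an arbitrary example, mirroring the GGML_ASSERT above):

    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t nelements = 4096ull * 4096ull; // example q8_0 weight tensor
        const size_t blck_size = 32;                // QK8_0
        const size_t nblocks   = nelements / blck_size;

        const size_t size_d = nblocks * 2;          // one fp16 scale per block
        const size_t size_q = nblocks * blck_size;  // 32 one-byte quants per block

        // ggml_nbytes for a q8_0 tensor is nblocks * (2 + 32) bytes, so the
        // two flat regions together cover the tensor exactly.
        assert(size_d + size_q == nblocks * 34);
        printf("size_d = %zu bytes, size_q = %zu bytes\n", size_d, size_q);
        return 0;
    }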
@@ -3373,6 +3514,32 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+            global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clEnqueueReadBuffer(
+            queue, data_device, CL_TRUE, offset,
+            size, data, 0, NULL, NULL));
+        CL_CHECK(clReleaseMemObject(data_device));
+        return;
+    }
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {1, 1, 1};
+
         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
             global_work_size, local_work_size, 0, NULL, &evt));
@@ -6084,6 +6251,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

 #ifdef GGML_OPENCL_SOA_Q
     ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
 #endif

     const int ne00 = src0 ? src0->ne[0] : 0;
@@ -6754,6 +6922,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             break;
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q8_0: {
+#ifdef GGML_OPENCL_SOA_Q
+            kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
+
+            // nth0 - subgroup size
+            // nth1 - number of subgroups per workgroup
+            // ndst - number of output values per workgroup = output per subgroup * number of subgroups
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+#else
             kernel = backend_ctx->kernel_mul_mv_q8_0_f32;

             // nth0 - subgroup size
@@ -6790,6 +6996,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
             CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
             CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+#endif // GGML_OPENCL_SOA_Q
             break;
         }
         case GGML_TYPE_Q2_K:
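For orientation: the launch that consumes nth0, nth1, and ndst happens further down in ggml_cl_mul_mat and is outside these hunks. A hedged sketch of the relationship, assuming the usual mul_mv dispatch pattern in this backend (names match the variables above, but the exact launch code is not shown in this diff):

    // Each workgroup runs nth0*nth1 work-items and produces ndst output
    // values along dim 0, so dim 0 needs ceil(ne01 / ndst) workgroups.
    size_t global_work_size[] = {
        (size_t)((ne01 + ndst - 1) / ndst) * nth0, // ceil-divide rows by ndst
        (size_t)ne11 * nth1,
        (size_t)(ne12 * ne13)                      // batch dimensions
    };
    size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};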

ggml/src/ggml-opencl/kernels/cvt.cl (39 additions, 0 deletions)
@@ -116,3 +116,42 @@ kernel void kernel_convert_block_q4_0_noshuffle(
 #endif
     }
 }
+
+//------------------------------------------------------------------------------
+// block_q8_0
+//------------------------------------------------------------------------------
+typedef struct {
+    half d;          // delta
+    char qs[QK8_0];  // quants
+} block_q8_0;
+
+kernel void kernel_convert_block_q8_0(
+    global block_q8_0 * src0,
+    global uchar * dst_q,
+    global half * dst_d
+) {
+    global block_q8_0 * b = (global block_q8_0 *) src0 + get_global_id(0);
+    global uchar * q = (global uchar *) dst_q + QK8_0*get_global_id(0);
+    global half * d = (global half *) dst_d + get_global_id(0);
+
+    *d = b->d;
+
+    for (int i = 0; i < QK8_0; ++i) {
+        q[i] = b->qs[i];
+    }
+}
+
+kernel void kernel_restore_block_q8_0(
+    global uchar * src_q,
+    global half * src_d,
+    global block_q8_0 * dst
+) {
+    global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0);
+    global uchar * q = (global uchar *) src_q + QK8_0*get_global_id(0);
+    global half * d = (global half *) src_d + get_global_id(0);
+
+    b->d = *d;
+    for (int i = 0; i < QK8_0; ++i) {
+        b->qs[i] = q[i];
+    }
+}
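These two kernels define the flat layout that kernel_mul_mv_q8_0_f32_flat consumes; the kernel itself lives in the fourth file of this commit (mul_mv_q8_0_f32_flat.cl) and is not reproduced above. As a plain C++ reference for what one output value of that mat-vec computes over the flat arrays (an illustration only, with scales simplified to float; real ggml scales are fp16):

    #include <cstdint>

    constexpr int QK8_0 = 32;

    // One dst value: dot product of a q8_0 row (flat quants q, flat scales d)
    // with a dense f32 vector y of length ne00.
    float dot_row_q8_0_flat(const int8_t * q, const float * d,
                            const float * y, int ne00) {
        const int nb = ne00 / QK8_0;   // number of q8_0 blocks in the row
        float sum = 0.0f;
        for (int ib = 0; ib < nb; ++ib) {
            float bsum = 0.0f;
            for (int i = 0; i < QK8_0; ++i) {
                bsum += (float) q[ib * QK8_0 + i] * y[ib * QK8_0 + i];
            }
            sum += d[ib] * bsum;       // one scale multiply per block
        }
        return sum;
    }

Keeping quants and scales in separate contiguous arrays is what lets the device kernel load them with uniform, coalesced accesses instead of striding over 34-byte blocks.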
