
Commit 4db5641

opencl: add kernel to handle mat mul in attention to improve encoding speed (#17181)
* Add mul_mm_f16_f32_kq_kqv kernel
* Add ggml_cl_mul_mat_kq_kqv_adreno func
* fix whitespace
* remove unused variable
* remove redundant
* refactor and clean up
* remove trailing whitespace
1 parent 72bd732 commit 4db5641

File tree

3 files changed: +444 -0 lines changed


ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -119,6 +119,7 @@ set(GGML_OPENCL_KERNELS
     pad
     repeat
     mul_mat_f16_f32
+    mul_mm_f16_f32_kq_kqv
     conv2d
     conv2d_f16_f32
     flash_attn_f32_f16

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 170 additions & 0 deletions
@@ -407,6 +407,8 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_f32_f32;
     cl_program program_mul;
     cl_program program_mul_mat_f16_f32_tiled;
+    cl_program program_mul_mm_f16_f32_kqv;
+    cl_program program_mul_mm_f16_f32_kq;
     cl_program program_div;
     cl_program program_sub;
     cl_program program_norm;

@@ -481,6 +483,8 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_f16_f32;
     cl_kernel kernel_mul_mat_f16_f32_l4;
     cl_kernel kernel_mul_mat_f16_f32_tiled;
+    cl_kernel kernel_mul_mm_f16_f32_kqv;
+    cl_kernel kernel_mul_mm_f16_f32_kq;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
     cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
@@ -1235,6 +1239,25 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // mul_mm_f16_f32_kq_kqv
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mm_f16_f32_kq_kqv.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mm_f16_f32_kq_kqv.cl");
+#endif
+        backend_ctx->program_mul_mm_f16_f32_kqv =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts + " -DKQV ");
+        backend_ctx->program_mul_mm_f16_f32_kq =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kqv = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kqv, "mul_mm_f16_f32_kqv", &err), err));
+        CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kq = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kq, "mul_mm_f16_f32_kq", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
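Note that the same kernel source is compiled twice here: once with -DKQV appended to the compile options to produce the KQV variant, and once without it for the KQ variant. The body of mul_mm_f16_f32_kq_kqv.cl is not part of the hunks shown on this page, so the following is only a hypothetical sketch of how a single .cl source can emit two kernels from one define:

    // Hypothetical sketch -- the real kernel bodies live in
    // mul_mm_f16_f32_kq_kqv.cl, which this diff does not show.
    #ifdef KQV
    __kernel void mul_mm_f16_f32_kqv(/* args omitted */) {
        // KQV tile loop would go here.
    }
    #else
    __kernel void mul_mm_f16_f32_kq(/* args omitted */) {
        // KQ tile loop would go here.
    }
    #endif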
@@ -6665,6 +6688,146 @@ static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, co
     backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
 }

+static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+
+    const cl_ulong nb10 = src1->nb[0];
+
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+
+    GGML_ASSERT(ne00 == ne10);
+
+    cl_kernel kernel;
+    cl_context context = backend_ctx->context;
+
+    cl_int status;
+    cl_image_format img_fmt_1d;
+    cl_image_desc img_desc_1d;
+    cl_buffer_region region;
+    cl_mem A_image1d;
+    cl_mem A_sub_buffer;
+    cl_mem B_sub_buffer;
+    cl_mem D_image1d;
+    cl_mem D_sub_buffer;
+
+    int M = ne01;
+    int N = ne1;
+    int K = ne00;
+
+    if (nb01 > nb02) {
+        // KQ
+        kernel = backend_ctx->kernel_mul_mm_f16_f32_kq;
+    } else {
+        // KQV
+        kernel = backend_ctx->kernel_mul_mm_f16_f32_kqv;
+    }
+    // create sub-buffer for A
+    // <--------------------------------------------> //
+    extra0 = src0->view_src ? (ggml_tensor_extra_cl *)src0->view_src->extra : (ggml_tensor_extra_cl *)src0->extra;
+
+    region.origin = (extra0->offset);
+    if (nb01 > nb02) {
+        // KQ
+        region.size = nb01 * ne01;
+    } else {
+        // KQV
+        region.size = nb02 * ne02;
+    }
+
+    A_sub_buffer = clCreateSubBuffer((extra0->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    CL_CHECK(status);
+    // <--------------------------------------------> //
+
+    // create sub-buffer for B
+    // <--------------------------------------------> //
+    region.origin = (extra1->offset);
+    region.size = nb10 * ne10 * ne11 * ne12;
+    B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    CL_CHECK(status);
+    // <--------------------------------------------> //
+
+    img_fmt_1d = {CL_RGBA, CL_FLOAT};
+    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+    if (nb01 > nb02) {
+        img_desc_1d.image_width = (nb01 * ne01 / 4) / 4;
+    } else {
+        img_desc_1d.image_width = (nb02 * ne02 / 4) / 4;
+    }
+    img_desc_1d.buffer = A_sub_buffer;
+    A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+    CL_CHECK(status);
+
+    // create sub-buffer for output C
+    // <--------------------------------------------> //
+    region.origin = (extrad->offset);
+    region.size = ne0 * ne1 * dst->ne[2] * dst->nb[0]; // size of C in bytes
+    D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    CL_CHECK(status);
+    // <--------------------------------------------> //
+
+    // create image for C output
+    // <--------------------------------------------> //
+    img_fmt_1d = {CL_R, CL_FLOAT};
+    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+    img_desc_1d.image_width = ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4;
+    img_desc_1d.buffer = D_sub_buffer;
+    D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+    CL_CHECK(status);
+    // <--------------------------------------------> //
+
+    int offset_src0 = 0;
+    int offset_src1 = 0;
+
+    // set kernel args
+    // <--------------------------------------------> //
+    cl_uint k_arg = 0;
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &offset_src0));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_sub_buffer));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &offset_src1));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &D_image1d));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &extrad->offset));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &M));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &K));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &N));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &ne02));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &ne12));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &nb01));
+
+    size_t global_work_size[3] = {64, static_cast<size_t>((M + 63) / 64), static_cast<size_t>(((N + 31) / 32) * ne12)};
+    size_t local_work_size[3]  = {64, 1, 2};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+    // deallocate sub buffers and images
+    // <--------------------------------------------> //
+    CL_CHECK(clReleaseMemObject(A_image1d));
+    CL_CHECK(clReleaseMemObject(D_image1d));
+    CL_CHECK(clReleaseMemObject(A_sub_buffer));
+    CL_CHECK(clReleaseMemObject(B_sub_buffer));
+    CL_CHECK(clReleaseMemObject(D_sub_buffer));
+}
+
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
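Two details of the new function deserve a gloss. The stride comparison nb01 > nb02 is what routes a matmul to the KQ or KQV kernel: it distinguishes the two attention matmuls by how src0 happens to be laid out (row stride versus plane stride) rather than by inspecting the graph. And the ND-range tiles rows by 64 and columns by 32 while folding batches into the third dimension. A worked example of the launch arithmetic, using assumed shapes that do not come from the commit (M = 128, N = 64, ne12 = 2):

    // Assumed shapes for illustration only: M = 128, N = 64, ne12 = 2.
    int M = 128, N = 64, ne12 = 2;
    size_t global_work_size[3] = {64,
                                  static_cast<size_t>((M + 63) / 64),           // = 2
                                  static_cast<size_t>(((N + 31) / 32) * ne12)}; // = 2 * 2 = 4
    size_t local_work_size[3]  = {64, 1, 2};
    // => (64/64) * (2/1) * (4/2) = 4 workgroups of 64*1*2 = 128 work-items,
    //    covering 128 rows in two 64-row tiles and 64 columns in two
    //    32-column tiles across 2 batches.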
@@ -6731,6 +6894,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     cl_context context = backend_ctx->context;

+    if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32) {
+        if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) {
+            ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
+            return;
+        }
+    }
+
     if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) {

         // init CL objects
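So the new path is taken only for f16 x f32 matmuls whose shapes can fill the kernel's tiles. A standalone restatement of that gate, under a hypothetical helper name (the commit inlines the check as shown above):

    // Hypothetical helper; mirrors the inlined eligibility test above.
    static bool can_use_kq_kqv_kernel(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
        return src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 &&
               src0->ne[1] >= 64 &&                // M: at least one 64-row tile
               dst->ne[1]  >= 32 &&                // N: at least one 32-column tile
               src0->ne[0] >= 16 &&                // K: minimum reduction depth
               (src1->ne[2] % src0->ne[2]) == 0;   // batch dims broadcast evenly
    }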
