Skip to content

Commit 24f32df

Browse files
committed
Add ggml_cl_mul_mat_kq_kqv_adreno func
1 parent 9e5c596 commit 24f32df

File tree

1 file changed

+160
-117
lines changed

1 file changed

+160
-117
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 160 additions & 117 deletions
Original file line number | Diff line number | Diff line change
@@ -6651,6 +6651,164 @@ static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, co
66516651
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
66526652
}
66536653

6654+
// Adreno path for the F16 x F32 matmuls used for KQ and KQV.
//
// src0 (A) is bound to the kernel as a read-only RGBA/float 1D image buffer,
// src1 (B) as a plain sub-buffer and dst as a write-only R/float 1D image
// buffer. All of them are temporary objects layered on top of the tensors'
// device buffers; they are created, used for a single kernel launch and then
// released before returning.
//
// The KQ or KQV kernel is selected from the strides of src0: nb01 > nb02
// selects KQ, anything else selects KQV.
//
// The caller decides whether this path applies (in this file it is entered
// only when ne01 >= 64, ne1 >= 32, ne00 >= 16 and ne12 % ne02 == 0).
//
// backend - OpenCL backend whose context, kernels and queue are used
// src0    - matrix A
// src1    - matrix B
// dst     - output tensor
static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *)backend->context;

    // For a view, the device buffer and its base offset come from the viewed tensor.
    ggml_tensor_extra_cl * extra0 = src0->view_src ? (ggml_tensor_extra_cl *)src0->view_src->extra
                                                   : (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];

    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];

    const int ne10 = src1->ne[0];
    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];

    const cl_ulong nb10 = src1->nb[0];

    const int ne0 = dst->ne[0];
    const int ne1 = dst->ne[1];

    GGML_ASSERT(ne00 == ne10);

    cl_context context = backend_ctx->context;
    cl_int     status;

    const int M = ne01;
    const int N = ne1;
    const int K = ne00;

    const bool is_kq = nb01 > nb02;

    cl_kernel kernel = is_kq ? backend_ctx->kernel_mul_mm_f16_f32_kq
                             : backend_ctx->kernel_mul_mm_f16_f32_kqv;

    // Number of bytes of A exposed to the kernel.
    const cl_ulong a_bytes = is_kq ? nb01 * ne01 : nb02 * ne02;

    // Size of the output in bytes. Widen before multiplying so the int
    // dimensions cannot overflow in the intermediate products.
    const size_t d_bytes = (size_t) ne0 * (size_t) ne1 * (size_t) dst->ne[2] * (size_t) dst->nb[0];

    cl_buffer_region region;

    // sub-buffer for A
    region.origin = extra0->offset;
    region.size   = a_bytes;
    cl_mem A_sub_buffer = clCreateSubBuffer(extra0->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
    CL_CHECK(status);

    // sub-buffer for B
    region.origin = extra1->offset;
    region.size   = nb10 * ne10 * ne11 * ne12;
    cl_mem B_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
    CL_CHECK(status);

    // image for A: one RGBA/float texel covers 16 bytes, hence (bytes / 4) / 4
    cl_image_format img_fmt_A = {CL_RGBA, CL_FLOAT};
    cl_image_desc   img_desc_A;
    memset(&img_desc_A, 0, sizeof(img_desc_A));
    img_desc_A.image_type  = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    img_desc_A.image_width = (a_bytes / 4) / 4;
    img_desc_A.buffer      = A_sub_buffer;
    cl_mem A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_A, &img_desc_A, NULL, &status);
    CL_CHECK(status);

    // sub-buffer for the output
    region.origin = extrad->offset;
    region.size   = d_bytes;
    cl_mem D_sub_buffer = clCreateSubBuffer(extrad->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
    CL_CHECK(status);

    // image for the output: one R/float texel covers 4 bytes
    cl_image_format img_fmt_D = {CL_R, CL_FLOAT};
    cl_image_desc   img_desc_D;
    memset(&img_desc_D, 0, sizeof(img_desc_D));
    img_desc_D.image_type  = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    img_desc_D.image_width = d_bytes / 4;
    img_desc_D.buffer      = D_sub_buffer;
    cl_mem D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_D, &img_desc_D, NULL, &status);
    CL_CHECK(status);

    // The sub-buffers already start at the tensors' offsets, so the source
    // offsets handed to the kernel are zero.
    cl_int offset_src0 = 0;
    cl_int offset_src1 = 0;

    // These arguments are set with a 4-byte size, but their sources are
    // 64-bit. Narrow them explicitly instead of passing a pointer to the wider
    // object, which only yields the low word on little-endian hosts.
    cl_int offset_dst = (cl_int) extrad->offset;
    cl_int stride_a   = (cl_int) nb01;

    // set kernel args
    cl_uint k_arg = 0;
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_int), &offset_src0));
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_sub_buffer));
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_int), &offset_src1));
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &D_image1d));
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_int), &offset_dst));
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &M));
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &K));
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &N));
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &ne02));
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int),    &ne12));
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_int), &stride_a));

    size_t global_work_size[3] = {64, static_cast<size_t>((M + 63) / 64), static_cast<size_t>(((N + 31) / 32) * ne12)};
    size_t local_work_size[3]  = {64, 1, 2};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);

    // Drop our references to the temporary images and sub-buffers. Per the
    // OpenCL spec a memory object is only deleted once its reference count is
    // zero and the queued commands that use it have finished.
    CL_CHECK(clReleaseMemObject(A_image1d));
    CL_CHECK(clReleaseMemObject(D_image1d));
    CL_CHECK(clReleaseMemObject(A_sub_buffer));
    CL_CHECK(clReleaseMemObject(B_sub_buffer));
    CL_CHECK(clReleaseMemObject(D_sub_buffer));
}
6811+
66546812
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
66556813
GGML_ASSERT(src0);
66566814
GGML_ASSERT(src0->extra);
@@ -6717,125 +6875,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
67176875
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
67186876
cl_context context = backend_ctx->context;
67196877

6720-
cl_int status;
6721-
cl_image_format img_fmt_1d;
6722-
cl_image_desc img_desc_1d;
6723-
cl_buffer_region region;
6724-
cl_mem A_image1d;
6725-
cl_mem A_sub_buffer;
6726-
cl_mem B_sub_buffer;
6727-
cl_mem D_image1d;
6728-
cl_mem D_sub_buffer;
6729-
6730-
int M = ne01;
6731-
int N = ne1;
6732-
int K = ne00;
6733-
67346878
if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
6735-
6736-
if (M >= 64 && N >= 32 && K >= 16 && (ne12 % ne02) == 0){
6737-
if (nb01 > nb02) {
6738-
// KQ
6739-
kernel = backend_ctx->kernel_mul_mm_f16_f32_kq;
6740-
} else {
6741-
// KQV
6742-
kernel = backend_ctx->kernel_mul_mm_f16_f32_kqv;
6743-
}
6744-
// create sub-buffer for A
6745-
// <--------------------------------------------> //
6746-
extra0 = src0->view_src ? (ggml_tensor_extra_cl *)src0->view_src->extra : (ggml_tensor_extra_cl *)src0->extra;
6747-
6748-
region.origin = (extra0->offset);
6749-
if (nb01 > nb02) {
6750-
// KQ
6751-
region.size = nb01 * ne01;
6752-
} else {
6753-
// KQV
6754-
region.size = nb02 * ne02;
6755-
}
6756-
6757-
A_sub_buffer = clCreateSubBuffer((extra0->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
6758-
CL_CHECK(status);
6759-
6760-
// <--------------------------------------------> //
6761-
6762-
// create sub-buffer for B
6763-
// <--------------------------------------------> //
6764-
region.origin = (extra1->offset);
6765-
region.size = nb10 * ne10 * ne11 * ne12;
6766-
B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
6767-
CL_CHECK(status);
6768-
// <--------------------------------------------> //
6769-
6770-
img_fmt_1d = {CL_RGBA, CL_FLOAT};
6771-
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
6772-
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
6773-
if (nb01 > nb02) {
6774-
img_desc_1d.image_width = (nb01 * ne01 / 4)/4;
6775-
}
6776-
else {
6777-
img_desc_1d.image_width = (nb02 * ne02 / 4)/4;
6778-
}
6779-
img_desc_1d.buffer = A_sub_buffer;
6780-
A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
6781-
CL_CHECK(status);
6782-
6783-
6784-
// create sub-buffer for output C
6785-
// <--------------------------------------------> //
6786-
region.origin = (extrad->offset);
6787-
region.size = ne0 * ne1 * dst->ne[2] * dst->nb[0]; // size of C in bytes
6788-
D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
6789-
CL_CHECK(status);
6790-
// <--------------------------------------------> //
6791-
6792-
// create image for C output
6793-
// <--------------------------------------------> //
6794-
img_fmt_1d = {CL_R, CL_FLOAT};
6795-
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
6796-
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
6797-
img_desc_1d.image_width = ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4;
6798-
img_desc_1d.buffer = D_sub_buffer;
6799-
D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
6800-
CL_CHECK(status);
6801-
// <--------------------------------------------> //
6802-
6803-
// offsets = 0 when using image
6804-
int offset0 = 0;
6805-
int offset1 = 0;
6806-
6807-
// set kernel args
6808-
// <--------------------------------------------> //
6809-
cl_uint k_arg = 0;
6810-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
6811-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset0));
6812-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_sub_buffer));
6813-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset1));
6814-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &D_image1d));
6815-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extrad->offset));
6816-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &M));
6817-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &K));
6818-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &N));
6819-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
6820-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
6821-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &nb01));
6822-
6823-
size_t global_work_size[3] = {64, static_cast<size_t>(((M+63)/64)), static_cast<size_t>(((N+31)/32)*ne12)};
6824-
size_t local_work_size[3] = {64, 1, 2};
6825-
6826-
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6827-
6828-
// deallocate sub buffers and images
6829-
// <--------------------------------------------> //
6830-
CL_CHECK(clReleaseMemObject(A_image1d));
6831-
CL_CHECK(clReleaseMemObject(D_image1d));
6832-
CL_CHECK(clReleaseMemObject(A_sub_buffer));
6833-
CL_CHECK(clReleaseMemObject(B_sub_buffer));
6834-
CL_CHECK(clReleaseMemObject(D_sub_buffer));
6835-
// <--------------------------------------------> //
6836-
6879+
if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0){
6880+
ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
68376881
return;
6838-
68396882
}
68406883
}
68416884

0 commit comments

Comments
 (0)