@@ -6651,6 +6651,164 @@ static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, co
66516651 backend_ctx->enqueue_ndrange_kernel (kernel, 2 , global_work_size, local_work_size, dst);
66526652}
66536653
6654+ static void ggml_cl_mul_mat_kq_kqv_adreno (ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6655+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context ;
6656+
6657+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra ;
6658+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra ;
6659+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra ;
6660+
6661+ cl_ulong offset0 = extra0->offset + src0->view_offs ;
6662+ cl_ulong offset1 = extra1->offset + src1->view_offs ;
6663+ cl_ulong offsetd = extrad->offset + dst->view_offs ;
6664+
6665+ const int ne00 = src0 ? src0->ne [0 ] : 0 ;
6666+ const int ne01 = src0 ? src0->ne [1 ] : 0 ;
6667+ const int ne02 = src0 ? src0->ne [2 ] : 0 ;
6668+ const int ne03 = src0 ? src0->ne [3 ] : 0 ;
6669+
6670+ const cl_ulong nb00 = src0 ? src0->nb [0 ] : 0 ;
6671+ const cl_ulong nb01 = src0 ? src0->nb [1 ] : 0 ;
6672+ const cl_ulong nb02 = src0 ? src0->nb [2 ] : 0 ;
6673+ const cl_ulong nb03 = src0 ? src0->nb [3 ] : 0 ;
6674+
6675+ const int ne10 = src1 ? src1->ne [0 ] : 0 ;
6676+ const int ne11 = src1 ? src1->ne [1 ] : 0 ;
6677+ const int ne12 = src1 ? src1->ne [2 ] : 0 ;
6678+ const int ne13 = src1 ? src1->ne [3 ] : 0 ;
6679+
6680+ const cl_ulong nb10 = src1 ? src1->nb [0 ] : 0 ;
6681+ const cl_ulong nb11 = src1 ? src1->nb [1 ] : 0 ;
6682+ const cl_ulong nb12 = src1 ? src1->nb [2 ] : 0 ;
6683+ const cl_ulong nb13 = src1 ? src1->nb [3 ] : 0 ;
6684+
6685+ const int ne0 = dst ? dst->ne [0 ] : 0 ;
6686+ const int ne1 = dst ? dst->ne [1 ] : 0 ;
6687+
6688+ int r2 = ne12/ne02;
6689+ int r3 = ne13/ne03;
6690+
6691+ GGML_ASSERT (ne00 == ne10);
6692+
6693+ cl_kernel kernel;
6694+ cl_context context = backend_ctx->context ;
6695+
6696+ cl_int status;
6697+ cl_image_format img_fmt_1d;
6698+ cl_image_desc img_desc_1d;
6699+ cl_buffer_region region;
6700+ cl_mem A_image1d;
6701+ cl_mem A_sub_buffer;
6702+ cl_mem B_sub_buffer;
6703+ cl_mem D_image1d;
6704+ cl_mem D_sub_buffer;
6705+
6706+ int M = ne01;
6707+ int N = ne1;
6708+ int K = ne00;
6709+
6710+ if (nb01 > nb02) {
6711+ // KQ
6712+ kernel = backend_ctx->kernel_mul_mm_f16_f32_kq ;
6713+ } else {
6714+ // KQV
6715+ kernel = backend_ctx->kernel_mul_mm_f16_f32_kqv ;
6716+ }
6717+ // create sub-buffer for A
6718+ // <--------------------------------------------> //
6719+ extra0 = src0->view_src ? (ggml_tensor_extra_cl *)src0->view_src ->extra : (ggml_tensor_extra_cl *)src0->extra ;
6720+
6721+ region.origin = (extra0->offset );
6722+ if (nb01 > nb02) {
6723+ // KQ
6724+ region.size = nb01 * ne01;
6725+ } else {
6726+ // KQV
6727+ region.size = nb02 * ne02;
6728+ }
6729+
6730+ A_sub_buffer = clCreateSubBuffer ((extra0->data_device ), 0 , CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
6731+ CL_CHECK (status);
6732+
6733+ // <--------------------------------------------> //
6734+
6735+ // create sub-buffer for B
6736+ // <--------------------------------------------> //
6737+ region.origin = (extra1->offset );
6738+ region.size = nb10 * ne10 * ne11 * ne12;
6739+ B_sub_buffer = clCreateSubBuffer ((extra1->data_device ), 0 , CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
6740+ CL_CHECK (status);
6741+ // <--------------------------------------------> //
6742+
6743+ img_fmt_1d = {CL_RGBA, CL_FLOAT};
6744+ memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
6745+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
6746+ if (nb01 > nb02) {
6747+ img_desc_1d.image_width = (nb01 * ne01 / 4 )/4 ;
6748+ }
6749+ else {
6750+ img_desc_1d.image_width = (nb02 * ne02 / 4 )/4 ;
6751+ }
6752+ img_desc_1d.buffer = A_sub_buffer;
6753+ A_image1d = clCreateImage (context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL , &status);
6754+ CL_CHECK (status);
6755+
6756+ // create sub-buffer for output C
6757+ // <--------------------------------------------> //
6758+ region.origin = (extrad->offset );
6759+ region.size = ne0 * ne1 * dst->ne [2 ] * dst->nb [0 ]; // size of C in bytes
6760+ D_sub_buffer = clCreateSubBuffer ((extrad->data_device ), 0 , CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
6761+ CL_CHECK (status);
6762+ // <--------------------------------------------> //
6763+
6764+ // create image for C output
6765+ // <--------------------------------------------> //
6766+ img_fmt_1d = {CL_R, CL_FLOAT};
6767+ memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
6768+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
6769+ img_desc_1d.image_width = ne0 * ne1 * dst->ne [2 ] * dst->nb [0 ] / 4 ;
6770+ img_desc_1d.buffer = D_sub_buffer;
6771+ D_image1d = clCreateImage (context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL , &status);
6772+ CL_CHECK (status);
6773+ // <--------------------------------------------> //
6774+
6775+ uint offset_src0 = 0 ;
6776+ uint offset_src1 = 0 ;
6777+
6778+ // set kernel args
6779+ // <--------------------------------------------> //
6780+ cl_uint k_arg = 0 ;
6781+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (cl_mem), &A_image1d));
6782+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &offset_src0));
6783+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (cl_mem), &B_sub_buffer));
6784+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &offset_src1));
6785+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (cl_mem), &D_image1d));
6786+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &extrad->offset ));
6787+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &M));
6788+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &K));
6789+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &N));
6790+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &ne02));
6791+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &ne12));
6792+ CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &nb01));
6793+
6794+ size_t global_work_size[3 ] = {64 , static_cast <size_t >(((M+63 )/64 )), static_cast <size_t >(((N+31 )/32 )*ne12)};
6795+ size_t local_work_size[3 ] = {64 , 1 , 2 };
6796+
6797+ backend_ctx->enqueue_ndrange_kernel (kernel, 3 , global_work_size, local_work_size, dst);
6798+
6799+ // deallocate sub buffers and images
6800+ // <--------------------------------------------> //
6801+ CL_CHECK (clReleaseMemObject (A_image1d));
6802+ CL_CHECK (clReleaseMemObject (D_image1d));
6803+ CL_CHECK (clReleaseMemObject (A_sub_buffer));
6804+ CL_CHECK (clReleaseMemObject (B_sub_buffer));
6805+ CL_CHECK (clReleaseMemObject (D_sub_buffer));
6806+ // <--------------------------------------------> //
6807+
6808+ return ;
6809+
6810+ }
6811+
66546812static void ggml_cl_mul_mat (ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
66556813 GGML_ASSERT (src0);
66566814 GGML_ASSERT (src0->extra );
@@ -6717,125 +6875,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
67176875#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
67186876 cl_context context = backend_ctx->context ;
67196877
6720- cl_int status;
6721- cl_image_format img_fmt_1d;
6722- cl_image_desc img_desc_1d;
6723- cl_buffer_region region;
6724- cl_mem A_image1d;
6725- cl_mem A_sub_buffer;
6726- cl_mem B_sub_buffer;
6727- cl_mem D_image1d;
6728- cl_mem D_sub_buffer;
6729-
6730- int M = ne01;
6731- int N = ne1;
6732- int K = ne00;
6733-
67346878 if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
6735-
6736- if (M >= 64 && N >= 32 && K >= 16 && (ne12 % ne02) == 0 ){
6737- if (nb01 > nb02) {
6738- // KQ
6739- kernel = backend_ctx->kernel_mul_mm_f16_f32_kq ;
6740- } else {
6741- // KQV
6742- kernel = backend_ctx->kernel_mul_mm_f16_f32_kqv ;
6743- }
6744- // create sub-buffer for A
6745- // <--------------------------------------------> //
6746- extra0 = src0->view_src ? (ggml_tensor_extra_cl *)src0->view_src ->extra : (ggml_tensor_extra_cl *)src0->extra ;
6747-
6748- region.origin = (extra0->offset );
6749- if (nb01 > nb02) {
6750- // KQ
6751- region.size = nb01 * ne01;
6752- } else {
6753- // KQV
6754- region.size = nb02 * ne02;
6755- }
6756-
6757- A_sub_buffer = clCreateSubBuffer ((extra0->data_device ), 0 , CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
6758- CL_CHECK (status);
6759-
6760- // <--------------------------------------------> //
6761-
6762- // create sub-buffer for B
6763- // <--------------------------------------------> //
6764- region.origin = (extra1->offset );
6765- region.size = nb10 * ne10 * ne11 * ne12;
6766- B_sub_buffer = clCreateSubBuffer ((extra1->data_device ), 0 , CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
6767- CL_CHECK (status);
6768- // <--------------------------------------------> //
6769-
6770- img_fmt_1d = {CL_RGBA, CL_FLOAT};
6771- memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
6772- img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
6773- if (nb01 > nb02) {
6774- img_desc_1d.image_width = (nb01 * ne01 / 4 )/4 ;
6775- }
6776- else {
6777- img_desc_1d.image_width = (nb02 * ne02 / 4 )/4 ;
6778- }
6779- img_desc_1d.buffer = A_sub_buffer;
6780- A_image1d = clCreateImage (context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL , &status);
6781- CL_CHECK (status);
6782-
6783-
6784- // create sub-buffer for output C
6785- // <--------------------------------------------> //
6786- region.origin = (extrad->offset );
6787- region.size = ne0 * ne1 * dst->ne [2 ] * dst->nb [0 ]; // size of C in bytes
6788- D_sub_buffer = clCreateSubBuffer ((extrad->data_device ), 0 , CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
6789- CL_CHECK (status);
6790- // <--------------------------------------------> //
6791-
6792- // create image for C output
6793- // <--------------------------------------------> //
6794- img_fmt_1d = {CL_R, CL_FLOAT};
6795- memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
6796- img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
6797- img_desc_1d.image_width = ne0 * ne1 * dst->ne [2 ] * dst->nb [0 ] / 4 ;
6798- img_desc_1d.buffer = D_sub_buffer;
6799- D_image1d = clCreateImage (context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL , &status);
6800- CL_CHECK (status);
6801- // <--------------------------------------------> //
6802-
6803- // offsets = 0 when using image
6804- int offset0 = 0 ;
6805- int offset1 = 0 ;
6806-
6807- // set kernel args
6808- // <--------------------------------------------> //
6809- cl_uint k_arg = 0 ;
6810- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (cl_mem), &A_image1d));
6811- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &offset0));
6812- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (cl_mem), &B_sub_buffer));
6813- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &offset1));
6814- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (cl_mem), &D_image1d));
6815- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &extrad->offset ));
6816- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &M));
6817- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &K));
6818- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &N));
6819- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &ne02));
6820- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &ne12));
6821- CL_CHECK (clSetKernelArg (kernel, k_arg++, sizeof (int ), &nb01));
6822-
6823- size_t global_work_size[3 ] = {64 , static_cast <size_t >(((M+63 )/64 )), static_cast <size_t >(((N+31 )/32 )*ne12)};
6824- size_t local_work_size[3 ] = {64 , 1 , 2 };
6825-
6826- backend_ctx->enqueue_ndrange_kernel (kernel, 3 , global_work_size, local_work_size, dst);
6827-
6828- // deallocate sub buffers and images
6829- // <--------------------------------------------> //
6830- CL_CHECK (clReleaseMemObject (A_image1d));
6831- CL_CHECK (clReleaseMemObject (D_image1d));
6832- CL_CHECK (clReleaseMemObject (A_sub_buffer));
6833- CL_CHECK (clReleaseMemObject (B_sub_buffer));
6834- CL_CHECK (clReleaseMemObject (D_sub_buffer));
6835- // <--------------------------------------------> //
6836-
6879+ if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0 ){
6880+ ggml_cl_mul_mat_kq_kqv_adreno (backend, src0, src1, dst);
68376881 return ;
6838-
68396882 }
68406883 }
68416884
0 commit comments