@@ -407,6 +407,8 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_f32_f32;
     cl_program program_mul;
     cl_program program_mul_mat_f16_f32_tiled;
+    cl_program program_mul_mm_f16_f32_kqv;
+    cl_program program_mul_mm_f16_f32_kq;
     cl_program program_div;
     cl_program program_sub;
     cl_program program_norm;
@@ -481,6 +483,8 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_f16_f32;
     cl_kernel kernel_mul_mat_f16_f32_l4;
     cl_kernel kernel_mul_mat_f16_f32_tiled;
+    cl_kernel kernel_mul_mm_f16_f32_kqv;
+    cl_kernel kernel_mul_mm_f16_f32_kq;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
     cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
@@ -1235,6 +1239,26 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // mul_mm_f16_f32_kq_kqv
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mm_f16_f32_kq_kqv.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mm_f16_f32_kq_kqv.cl");
+#endif
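+        // a single source file yields both kernels; -DKQV selects the KQV variant at build time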
+        backend_ctx->program_mul_mm_f16_f32_kqv =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts + " -DKQV");
+        backend_ctx->program_mul_mm_f16_f32_kq =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kqv = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kqv, "mul_mm_f16_f32_kqv", &err), err));
+        CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kq  = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kq,  "mul_mm_f16_f32_kq",  &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -6665,6 +6689,153 @@ static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, co
     backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
 }

+static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+
+    const cl_ulong nb10 = src1->nb[0];
+
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+
+    GGML_ASSERT(ne00 == ne10);
+
+    cl_kernel kernel;
+    cl_context context = backend_ctx->context;
+
+    cl_int status;
+    cl_image_format img_fmt_1d;
+    cl_image_desc img_desc_1d;
+    cl_buffer_region region;
+    cl_mem A_image1d;
+    cl_mem A_sub_buffer;
+    cl_mem B_sub_buffer;
+    cl_mem D_image1d;
+    cl_mem D_sub_buffer;
+
+    int M = ne01;
+    int N = ne1;
+    int K = ne00;
+
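+    // for the K view of the KV cache the per-row (token) stride spans all heads,
+    // so nb01 > nb02 there; presumably this is what tells KQ apart from KQV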
+    if (nb01 > nb02) {
+        // KQ
+        kernel = backend_ctx->kernel_mul_mm_f16_f32_kq;
+    } else {
+        // KQV
+        kernel = backend_ctx->kernel_mul_mm_f16_f32_kqv;
+    }
+    // create sub-buffer for A
+    // <--------------------------------------------> //
+    extra0 = src0->view_src ? (ggml_tensor_extra_cl *)src0->view_src->extra : (ggml_tensor_extra_cl *)src0->extra;
+
+    region.origin = (extra0->offset);
+    if (nb01 > nb02) {
+        // KQ
+        region.size = nb01 * ne01;
+    } else {
+        // KQV
+        region.size = nb02 * ne02;
+    }
+
+    A_sub_buffer = clCreateSubBuffer((extra0->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    CL_CHECK(status);
+
+    // <--------------------------------------------> //
+
+    // create sub-buffer for B
+    // <--------------------------------------------> //
+    region.origin = (extra1->offset);
+    region.size = nb10 * ne10 * ne11 * ne12;
+    B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    CL_CHECK(status);
+    // <--------------------------------------------> //
+
+    img_fmt_1d = {CL_RGBA, CL_FLOAT};
+    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
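+    // image width is in CL_RGBA/CL_FLOAT pixels of 16 bytes each, hence bytes / 4 / 4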
+    if (nb01 > nb02) {
+        img_desc_1d.image_width = (nb01 * ne01 / 4) / 4;
+    }
+    else {
+        img_desc_1d.image_width = (nb02 * ne02 / 4) / 4;
+    }
+    img_desc_1d.buffer = A_sub_buffer;
+    A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+    CL_CHECK(status);
+
+    // create sub-buffer for output C
+    // <--------------------------------------------> //
+    region.origin = (extrad->offset);
+    region.size = ne0 * ne1 * dst->ne[2] * dst->nb[0]; // size of C in bytes
+    D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    CL_CHECK(status);
+    // <--------------------------------------------> //
+
+    // create image for C output
+    // <--------------------------------------------> //
+    img_fmt_1d = {CL_R, CL_FLOAT};
+    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
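+    // CL_R/CL_FLOAT is one 4-byte pixel per f32 element, so width is the element count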
+    img_desc_1d.image_width = ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4;
+    img_desc_1d.buffer = D_sub_buffer;
+    D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+    CL_CHECK(status);
+    // <--------------------------------------------> //
+
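+    // the sub-buffers above already start at each tensor's offset, so the kernel-side offsets stay zero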
+    int offset_src0 = 0;
+    int offset_src1 = 0;
+
+    // set kernel args
+    // <--------------------------------------------> //
+    cl_uint k_arg = 0;
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src0));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_sub_buffer));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src1));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &D_image1d));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extrad->offset));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &M));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &K));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &N));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &nb01));
+
+    size_t global_work_size[3] = {64, static_cast<size_t>((M + 63) / 64), static_cast<size_t>(((N + 31) / 32) * ne12)};
+    size_t local_work_size[3] = {64, 1, 2};
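+    // launch geometry: 64 threads in x per tile, y walks M in tiles of 64 rows,
+    // z walks N in tiles of 32 columns for each of the ne12 batches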
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+    // deallocate sub buffers and images
+    // <--------------------------------------------> //
+    CL_CHECK(clReleaseMemObject(A_image1d));
+    CL_CHECK(clReleaseMemObject(D_image1d));
+    CL_CHECK(clReleaseMemObject(A_sub_buffer));
+    CL_CHECK(clReleaseMemObject(B_sub_buffer));
+    CL_CHECK(clReleaseMemObject(D_sub_buffer));
+}
+
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -6731,6 +6902,15 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     cl_context context = backend_ctx->context;

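+    // route sufficiently large f16 x f32 matmuls with a broadcast-compatible
+    // batch dimension (ne12 % ne02 == 0) to the dedicated KQ/KQV kernels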
+    if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32) {
+        if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) {
+            ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
+            return;
+        }
+    }
+
     if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) {

         // init CL objects