@@ -368,6 +368,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_f16_f32;
     cl_program program_mul_mv_f32_f32;
     cl_program program_mul;
+    cl_program program_mul_mat_f16_f32_tiled;
     cl_program program_div;
     cl_program program_sub;
     cl_program program_norm;
@@ -422,6 +423,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_f16_f32_1row;
     cl_kernel kernel_mul_mat_f16_f32;
     cl_kernel kernel_mul_mat_f16_f32_l4;
+    cl_kernel kernel_mul_mat_f16_f32_tiled;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
@@ -1015,6 +1017,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mat_f16_f32_tiled
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mat_f16_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mat_f16_f32.cl");
+#endif
+        backend_ctx->program_mul_mat_f16_f32_tiled =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_tiled = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_tiled, "mul_mat_f16_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -4927,6 +4945,58 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
     backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
 }
 
+static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int M = src0->ne[1];
+    const int N = src1->ne[1];
+    const int K = src0->ne[0];
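+    // ggml stores matrices with ne[0] as the row length: src0 is K x M (F16),
+    // src1 is K x N (F32), and dst is M x N.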
+
+    cl_kernel kernel = backend_ctx->kernel_mul_mat_f16_f32_tiled;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int),      &M));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int),      &N));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),      &K));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem),   &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
+
+    // Tiling parameters. These need to be tuned for optimal performance.
+    // They must match the #defines in the kernel mul_mat_f16_f32.cl.
+    //
+    // OPWM / OPWN: output tile size per work-group. A work-group computes a tile of OPWM x OPWN elements of dst.
+    // TPWM / TPWN: threads per work-group, i.e. the work-group size.
+    // OPTM / OPTN: output elements per thread. Each thread computes OPTM x OPTN elements.
+    //
+    // The following relationships must hold:
+    //   OPWM = TPWM * OPTM
+    //   OPWN = TPWN * OPTN
+    //
+    const int OPWM = 64;
+    const int OPWN = 64;
+    const int TPWM = 16;
+    const int TPWN = 8;
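+    // With these values, each thread computes OPTM = OPWM / TPWM = 4 by
+    // OPTN = OPWN / TPWN = 8 output elements.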
+
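+    // One work-group of TPWM x TPWN threads covers one OPWM x OPWN output tile;
+    // the global size rounds M and N up so partial edge tiles are still launched.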
+    size_t local_work_size[2] = { TPWM, TPWN };
+    size_t global_work_size[2] = {
+        (size_t) ((M + OPWM - 1) / OPWM) * TPWM,
+        (size_t) ((N + OPWN - 1) / OPWN) * TPWN,
+    };
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
+}
+
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -4940,6 +5010,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
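+    // Route to the tiled F16 x F32 kernel only for plain 2D, contiguous
+    // tensors with M, N, K all above 32, where tiling is presumably worthwhile.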
+    if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
+        src0->ne[1] > 32 &&  // M > 32
+        src1->ne[1] > 32 &&  // N > 32
+        src0->ne[0] > 32 &&  // K > 32
+        src0->ne[2] == 1 && src0->ne[3] == 1 &&
+        src1->ne[2] == 1 && src1->ne[3] == 1 &&
+        ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
+        backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
+        ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
+        return;
+    }
+
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;