@@ -331,6 +331,8 @@ struct ggml_backend_opencl_context {
 
     cl_int alignment;
     size_t max_alloc_size;
+    size_t max_image_width;
+    size_t max_image_height;
     bool fp16_support;
     bool has_vector_subgroup_broadcast;
     ggml_cl_compiler_version adreno_cl_compiler_version;
@@ -369,6 +371,10 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_f32_f32;
     cl_program program_mul;
     cl_program program_mul_mat_f16_f32_tiled;
+    cl_program program_mul_mat_f16_f32_image;
+    cl_program program_pack_a_for_image;
+    cl_program program_pack_b_for_image;
+    cl_ulong global_mem_size;
     cl_program program_div;
     cl_program program_sub;
     cl_program program_norm;
@@ -424,6 +430,9 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_f16_f32;
     cl_kernel kernel_mul_mat_f16_f32_l4;
     cl_kernel kernel_mul_mat_f16_f32_tiled;
+    cl_kernel kernel_mul_mat_f16_f32_image;
+    cl_kernel kernel_pack_a_for_image;
+    cl_kernel kernel_pack_b_for_image;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
@@ -1033,6 +1042,54 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mat_f16_f32_image
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src{
+            #include "mul_mat_f16_f32_image.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mat_f16_f32_image.cl");
+#endif
+        backend_ctx->program_mul_mat_f16_f32_image =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_image = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_image, "mul_mat_f16_f32_image", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // pack_a_for_image
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src{
+            #include "pack_a_for_image.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("pack_a_for_image.cl");
+#endif
+        backend_ctx->program_pack_a_for_image =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_pack_a_for_image = clCreateKernel(backend_ctx->program_pack_a_for_image, "pack_a_for_image", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // pack_b_for_image
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src{
+            #include "pack_b_for_image.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("pack_b_for_image.cl");
+#endif
+        backend_ctx->program_pack_b_for_image =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_pack_b_for_image = clCreateKernel(backend_ctx->program_pack_b_for_image, "pack_b_for_image", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
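The three blocks above follow the backend's usual loading pattern: embed the kernel source at build time or read the .cl file at runtime, then build the program and create the kernel. The new kernel sources themselves are not part of this diff; below is a minimal, hypothetical sketch of what pack_a_for_image could look like, assuming the host-side argument order used in ggml_cl_mul_mat_f16_f32_image below (buffer, byte offset, destination image, M, K) and zero-fill for the K tail. pack_b_for_image would be analogous, except that it converts FP32 to FP16 and lays B out so that one texel holds four consecutive n values at a fixed k.

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Hypothetical sketch, not the patch's actual kernel: each work-item packs four
// consecutive FP16 elements of one row of A into a single RGBA texel, so the
// destination image is (K+3)/4 texels wide and M texels tall.
__kernel void pack_a_for_image(
        __global const char * A_base,   // raw device buffer holding A
        ulong offset,                   // byte offset of A inside the buffer
        __write_only image2d_t dst,
        int M,
        int K) {
    const int x = get_global_id(0);     // texel column, 0 .. (K+3)/4 - 1
    const int y = get_global_id(1);     // row of A,     0 .. M-1
    if (y >= M || x >= (K + 3) / 4) return;

    __global const half * A = (__global const half *)(A_base + offset);

    const int k = x * 4;
    half4 v = (half4)(0.0h);            // zero-fill the tail when K % 4 != 0
    if (k + 0 < K) v.s0 = A[(ulong)y * K + k + 0];
    if (k + 1 < K) v.s1 = A[(ulong)y * K + k + 1];
    if (k + 2 < K) v.s2 = A[(ulong)y * K + k + 2];
    if (k + 3 < K) v.s3 = A[(ulong)y * K + k + 3];
    write_imageh(dst, (int2)(x, y), v);
}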
@@ -1987,6 +2044,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
     GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
 
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &backend_ctx->global_mem_size, NULL));
+
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->max_image_width, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->max_image_height, NULL));
+
     // Check SVM.
     cl_device_svm_capabilities svm_caps;
    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
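One caveat worth noting (an observation about this review, not something the patch addresses): OpenCL does not require devices to support images, and CL_DEVICE_IMAGE2D_MAX_WIDTH/HEIGHT are only meaningful when CL_DEVICE_IMAGE_SUPPORT reports CL_TRUE. A defensive variant of the queries above could zero the limits when images are unavailable, which would disable the new path via the size checks in ggml_cl_mul_mat:

// Hypothetical guard, not in the patch: gate the image path on actual image support.
cl_bool has_images = CL_FALSE;
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &has_images, NULL));
if (has_images == CL_FALSE) {
    backend_ctx->max_image_width  = 0;   // makes every "fits in an image" check fail
    backend_ctx->max_image_height = 0;
}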
@@ -4997,6 +5059,93 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten
     backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
 }
 
+static void ggml_cl_mul_mat_f16_f32_image(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_context context = backend_ctx->context;
+    cl_command_queue queue = backend_ctx->queue;
+    cl_int err = 0;
+
+    const int M = src0->ne[1];
+    const int N = src1->ne[1];
+    const int K = src0->ne[0];
+    const int K_4 = (K + 3) / 4;
+    const int N_4 = (N + 3) / 4;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_mem a_image = NULL, b_image = NULL;
+    cl_event pack_events[2];
+    cl_event matmul_event;
+
+    // Create image for A
+    cl_image_format format_A = {CL_RGBA, CL_HALF_FLOAT};
+    cl_image_desc desc_A = {};
+    desc_A.image_type = CL_MEM_OBJECT_IMAGE2D;
+    desc_A.image_width = K_4;
+    desc_A.image_height = M;
+    a_image = clCreateImage(context, CL_MEM_READ_WRITE, &format_A, &desc_A, NULL, &err);
+    CL_CHECK(err);
+
+    // Create image for B
+    cl_image_format format_B = {CL_RGBA, CL_HALF_FLOAT};
+    cl_image_desc desc_B = {};
+    desc_B.image_type = CL_MEM_OBJECT_IMAGE2D;
+    desc_B.image_width = N_4;
+    desc_B.image_height = K;
+    b_image = clCreateImage(context, CL_MEM_READ_WRITE, &format_B, &desc_B, NULL, &err);
+    CL_CHECK(err);
+
+    // Launch packing kernel for A
+    cl_kernel pack_a_kernel = backend_ctx->kernel_pack_a_for_image;
+    CL_CHECK(clSetKernelArg(pack_a_kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(pack_a_kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(pack_a_kernel, 2, sizeof(cl_mem), &a_image));
+    CL_CHECK(clSetKernelArg(pack_a_kernel, 3, sizeof(int), &M));
+    CL_CHECK(clSetKernelArg(pack_a_kernel, 4, sizeof(int), &K));
+    const size_t pack_a_gws[2] = { (size_t)K_4, (size_t)M };
+    CL_CHECK(clEnqueueNDRangeKernel(queue, pack_a_kernel, 2, NULL, pack_a_gws, NULL, 0, NULL, &pack_events[0]));
+
+    // Launch packing kernel for B
+    cl_kernel pack_b_kernel = backend_ctx->kernel_pack_b_for_image;
+    CL_CHECK(clSetKernelArg(pack_b_kernel, 0, sizeof(cl_mem), &extra1->data_device));
+    CL_CHECK(clSetKernelArg(pack_b_kernel, 1, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(pack_b_kernel, 2, sizeof(cl_mem), &b_image));
+    CL_CHECK(clSetKernelArg(pack_b_kernel, 3, sizeof(int), &K));
+    CL_CHECK(clSetKernelArg(pack_b_kernel, 4, sizeof(int), &N));
+    const size_t pack_b_gws[2] = { (size_t)N_4, (size_t)K };
+    CL_CHECK(clEnqueueNDRangeKernel(queue, pack_b_kernel, 2, NULL, pack_b_gws, NULL, 0, NULL, &pack_events[1]));
+
+    // Launch matmul kernel
+    cl_kernel matmul_kernel = backend_ctx->kernel_mul_mat_f16_f32_image;
+    CL_CHECK(clSetKernelArg(matmul_kernel, 0, sizeof(cl_mem), &a_image));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 1, sizeof(cl_mem), &b_image));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 4, sizeof(int), &M));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 5, sizeof(int), &N));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 6, sizeof(int), &K));
+
+    const int OPWM = 64;
+    const int OPWN = 64;
+    const size_t lws[2] = { 16, 8 }; // WG_M, WG_N
+    const size_t gws[2] = { (size_t)ceil((float)M / OPWM) * lws[0], (size_t)ceil((float)N / OPWN) * lws[1] };
+    CL_CHECK(clEnqueueNDRangeKernel(queue, matmul_kernel, 2, NULL, gws, lws, 2, pack_events, &matmul_event));
+
+    // Wait for matmul to finish and release resources
+    CL_CHECK(clWaitForEvents(1, &matmul_event));
+    CL_CHECK(clReleaseEvent(pack_events[0]));
+    CL_CHECK(clReleaseEvent(pack_events[1]));
+    CL_CHECK(clReleaseEvent(matmul_event));
+    CL_CHECK(clReleaseMemObject(a_image));
+    CL_CHECK(clReleaseMemObject(b_image));
+}
+
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
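For orientation, here is a simplified, hypothetical reading of the image-based kernel; the actual mul_mat_f16_f32_image source is not in this diff. The host-side constants imply the real kernel computes a 64x64 (OPWM x OPWN) output tile per 16x8 work-group, i.e. a 4x8 sub-tile per work-item; the sketch below instead computes four outputs per work-item to keep the image-addressing scheme visible. It assumes A is packed as (K/4) x M texels, B as (N/4) x K texels, and ggml's dst layout where output column n is stored at C[n * M + m].

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Simplified sketch, not the patch's kernel: one work-item computes the four
// outputs C[m][4*n4 .. 4*n4+3]. A texel (k4, m) holds A[m][4*k4 .. 4*k4+3];
// a B texel (n4, k) holds the values for four consecutive n at one k.
__kernel void mul_mat_f16_f32_image(
        __read_only  image2d_t A,
        __read_only  image2d_t B,
        __global char * C_base,
        ulong offsetd,
        int M, int N, int K) {
    const int n4 = get_global_id(0);            // block of four output columns
    const int m  = get_global_id(1);            // output row
    if (m >= M || n4 >= (N + 3) / 4) return;

    __global float * C = (__global float *)(C_base + offsetd);

    float4 acc = (float4)(0.0f);
    for (int k4 = 0; k4 < (K + 3) / 4; ++k4) {
        const float4 a = convert_float4(read_imageh(A, (int2)(k4, m)));
        const int k = k4 * 4;
        // Accumulate the four k-lanes of the A texel against the matching
        // rows of B; guard the tail rows when K % 4 != 0.
        acc += a.x * convert_float4(read_imageh(B, (int2)(n4, k)));
        if (k + 1 < K) acc += a.y * convert_float4(read_imageh(B, (int2)(n4, k + 1)));
        if (k + 2 < K) acc += a.z * convert_float4(read_imageh(B, (int2)(n4, k + 2)));
        if (k + 3 < K) acc += a.w * convert_float4(read_imageh(B, (int2)(n4, k + 3)));
    }

    const int n = n4 * 4;
    if (n + 0 < N) C[(ulong)(n + 0) * M + m] = acc.x;
    if (n + 1 < N) C[(ulong)(n + 1) * M + m] = acc.y;
    if (n + 2 < N) C[(ulong)(n + 2) * M + m] = acc.z;
    if (n + 3 < N) C[(ulong)(n + 3) * M + m] = acc.w;
}

Two host-side details of the function above are worth calling out: the two pack launches carry no wait list, so they may overlap on the device, and the matmul is ordered after both via pack_events; and clWaitForEvents makes the whole path synchronous, which is what allows the temporary images to be released immediately, at the cost of a pipeline bubble per mat-mul.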
@@ -5010,6 +5159,35 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
+    if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
+        backend_ctx->gpu_family == ADRENO && backend_ctx->kernel_mul_mat_f16_f32_image != NULL &&
+        ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
+        src0->ne[2] == 1 && src0->ne[3] == 1 &&
+        src1->ne[2] == 1 && src1->ne[3] == 1) {
+
+        const int M = src0->ne[1];
+        const int N = src1->ne[1];
+        const int K = src0->ne[0];
+
+        // Performance thresholds: only use this path for reasonably large matrices,
+        // where the mat-mul speedup can outweigh the extra GPU-side packing passes.
+        if (M > 32 && N > 32 && K > 32) {
+            const size_t n_padded_4 = (size_t)((N + 3) / 4);
+            const size_t temp_a_size = (size_t)M * K * sizeof(ggml_fp16_t);
+            const size_t temp_b_size = n_padded_4 * K * 4 * sizeof(ggml_fp16_t); // RGBA
+            const size_t total_temp_image_size = temp_a_size + temp_b_size;
+
+            // Safety checks for memory and device limits: image A is (K/4) x M
+            // and image B is (N/4) x K, so both dimensions of each must fit.
+            if ((size_t)K <= backend_ctx->max_image_width &&
+                (size_t)K <= backend_ctx->max_image_height &&
+                (size_t)M <= backend_ctx->max_image_height &&
+                n_padded_4 <= backend_ctx->max_image_width &&
+                total_temp_image_size < (backend_ctx->global_mem_size / 4)) { // temp images must use < 25% of device memory
+                ggml_cl_mul_mat_f16_f32_image(backend, src0, src1, dst);
+                return;
+            }
+        }
+    }
+
     if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
         src0->ne[1] > 32 && // M > 32
         src1->ne[1] > 32 && // N > 32
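To make the gating arithmetic concrete, consider a hypothetical M = N = K = 4096 call (the shape is chosen for illustration, not taken from the patch):

// Worked example of the gating math above for M = N = K = 4096.
const int M = 4096, N = 4096, K = 4096;                               // hypothetical shape
const size_t n_padded_4  = (size_t)((N + 3) / 4);                     // 1024 texels
const size_t temp_a_size = (size_t)M * K * sizeof(ggml_fp16_t);       // 32 MiB
const size_t temp_b_size = n_padded_4 * K * 4 * sizeof(ggml_fp16_t);  // 32 MiB (RGBA)
// 64 MiB of temporaries passes the < global_mem_size / 4 check on any device
// with more than 256 MiB of memory, and K = 4096 is well under the 16384-texel
// CL_DEVICE_IMAGE2D_MAX_WIDTH/HEIGHT limits typically reported by recent Adreno GPUs.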