@@ -224,12 +224,14 @@ struct ggml_backend_opencl_context {
224224 cl_program program;
225225 cl_program program_1;
226226 cl_program program_2;
227+ cl_program program_im2col;
227228
228229 cl_kernel kernel_add, kernel_add_row;
229230 cl_kernel kernel_mul, kernel_mul_row;
230231 cl_kernel kernel_scale;
231232 cl_kernel kernel_silu, kernel_silu_4;
232233 cl_kernel kernel_gelu, kernel_gelu_4;
234+ cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
233235 cl_kernel kernel_relu;
234236 cl_kernel kernel_clamp;
235237 cl_kernel kernel_norm;
@@ -239,6 +241,7 @@ struct ggml_backend_opencl_context {
239241 cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
240242 cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
241243 cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
244+ cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
242245 cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
243246 cl_kernel kernel_mul_mat_f32_f32;
244247 cl_kernel kernel_mul_mat_f16_f16;
@@ -252,6 +255,7 @@ struct ggml_backend_opencl_context {
252255 kernel_mul_mat_q4_0_f32_flat_img_v0;
253256 cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
254257 cl_kernel kernel_mul_mv_q6_K_f32;
258+ cl_kernel kernel_im2col_f32, kernel_im2col_f16;
255259
256260#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
257261 // Transpose kernels
@@ -708,6 +712,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
708712 CL_CHECK ((backend_ctx->kernel_silu_4 = clCreateKernel (backend_ctx->program , " kernel_silu_4" , &err), err));
709713 CL_CHECK ((backend_ctx->kernel_gelu = clCreateKernel (backend_ctx->program , " kernel_gelu" , &err), err));
710714 CL_CHECK ((backend_ctx->kernel_gelu_4 = clCreateKernel (backend_ctx->program , " kernel_gelu_4" , &err), err));
715+ CL_CHECK ((backend_ctx->kernel_gelu_quick = clCreateKernel (backend_ctx->program , " kernel_gelu_quick" , &err), err));
716+ CL_CHECK ((backend_ctx->kernel_gelu_quick_4 = clCreateKernel (backend_ctx->program , " kernel_gelu_quick_4" , &err), err));
711717 CL_CHECK ((backend_ctx->kernel_relu = clCreateKernel (backend_ctx->program , " kernel_relu" , &err), err));
712718 CL_CHECK ((backend_ctx->kernel_clamp = clCreateKernel (backend_ctx->program , " kernel_clamp" , &err), err));
713719 CL_CHECK ((backend_ctx->kernel_norm = clCreateKernel (backend_ctx->program , " kernel_norm" , &err), err));
@@ -722,6 +728,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
722728 CL_CHECK ((backend_ctx->kernel_rope_norm_f16 = clCreateKernel (backend_ctx->program , " kernel_rope_norm_f16" , &err), err));
723729 CL_CHECK ((backend_ctx->kernel_rope_neox_f32 = clCreateKernel (backend_ctx->program , " kernel_rope_neox_f32" , &err), err));
724730 CL_CHECK ((backend_ctx->kernel_rope_neox_f16 = clCreateKernel (backend_ctx->program , " kernel_rope_neox_f16" , &err), err));
731+ CL_CHECK ((backend_ctx->kernel_rope_multi_f32 = clCreateKernel (backend_ctx->program , " kernel_rope_multi_f32" , &err), err));
732+ CL_CHECK ((backend_ctx->kernel_rope_multi_f16 = clCreateKernel (backend_ctx->program , " kernel_rope_multi_f16" , &err), err));
733+ CL_CHECK ((backend_ctx->kernel_rope_vision_f32 = clCreateKernel (backend_ctx->program , " kernel_rope_vision_f32" , &err), err));
734+ CL_CHECK ((backend_ctx->kernel_rope_vision_f16 = clCreateKernel (backend_ctx->program , " kernel_rope_vision_f16" , &err), err));
725735 CL_CHECK ((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel (backend_ctx->program , " kernel_cpy_f16_f16" , &err), err));
726736 CL_CHECK ((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel (backend_ctx->program , " kernel_cpy_f16_f32" , &err), err));
727737 CL_CHECK ((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel (backend_ctx->program , " kernel_cpy_f32_f16" , &err), err));
@@ -769,6 +779,19 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
769779
770780 CL_CHECK ((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel (backend_ctx->program_2 , " kernel_convert_block_q4_0_noshuffle" , &err), err));
771781
782+ // im2col kernels
783+ #ifdef GGML_OPENCL_EMBED_KERNELS
784+ const std::string kernel_src_im2col {
785+ #include " ggml-opencl_im2col.cl.h"
786+ };
787+ #else
788+ const std::string kernel_src_im2col = read_file (" ggml-opencl_im2col.cl" );
789+ #endif
790+ backend_ctx->program_im2col = build_program_from_source (context, device, kernel_src_im2col.c_str (), compile_opts);
791+
792+ CL_CHECK ((backend_ctx->kernel_im2col_f32 = clCreateKernel (backend_ctx->program_im2col , " kernel_im2col_f32" , &err), err));
793+ CL_CHECK ((backend_ctx->kernel_im2col_f16 = clCreateKernel (backend_ctx->program_im2col , " kernel_im2col_f16" , &err), err));
794+
772795 // Kernels for Adreno
773796#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
774797#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1187,6 +1210,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
11871210 case GGML_UNARY_OP_GELU:
11881211 case GGML_UNARY_OP_SILU:
11891212 case GGML_UNARY_OP_RELU:
1213+ case GGML_UNARY_OP_GELU_QUICK:
11901214 return ggml_is_contiguous (op->src [0 ]) && op->src [0 ]->type == GGML_TYPE_F32;
11911215 default :
11921216 return false ;
@@ -1216,14 +1240,26 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
12161240 return op->ne [3 ] == 1 ;
12171241 case GGML_OP_ROPE: {
12181242 const int mode = ((const int32_t *) op->op_params )[2 ];
1219- if (mode & GGML_ROPE_TYPE_MROPE) {
1243+ const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
1244+ const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
1245+ if (is_mrope && !is_vision) {
1246+ if (op->src [0 ]->type == GGML_TYPE_F32 ||
1247+ op->src [0 ]->type == GGML_TYPE_F16) {
1248+ return true ;
1249+ }
12201250 return false ;
12211251 }
1222- if (mode & GGML_ROPE_TYPE_VISION) {
1252+ if (is_vision) {
1253+ if (op->src [0 ]->type == GGML_TYPE_F32 ||
1254+ op->src [0 ]->type == GGML_TYPE_F16) {
1255+ return true ;
1256+ }
12231257 return false ;
12241258 }
12251259 return true ;
12261260 }
1261+ case GGML_OP_IM2COL:
1262+ return true ;
12271263 default :
12281264 return false ;
12291265 }
@@ -2582,6 +2618,53 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
25822618#endif
25832619}
25842620
2621+ static void ggml_cl_gelu_quick (ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2622+ GGML_ASSERT (src0);
2623+ GGML_ASSERT (src0->extra );
2624+ GGML_ASSERT (dst);
2625+ GGML_ASSERT (dst->extra );
2626+
2627+ UNUSED (src1);
2628+
2629+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context ;
2630+ cl_command_queue queue = backend_ctx->queue ;
2631+
2632+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra ;
2633+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra ;
2634+
2635+ cl_ulong offset0 = extra0->offset + src0->view_offs ;
2636+ cl_ulong offsetd = extrad->offset + dst->view_offs ;
2637+
2638+ cl_kernel kernel;
2639+
2640+ int n = ggml_nelements (dst);
2641+
2642+ if (n % 4 == 0 ) {
2643+ kernel = backend_ctx->kernel_gelu_quick_4 ;
2644+ n /= 4 ;
2645+ } else {
2646+ kernel = backend_ctx->kernel_gelu_quick ;
2647+ }
2648+
2649+ CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
2650+ CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
2651+ CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extrad->data_device ));
2652+ CL_CHECK (clSetKernelArg (kernel, 3 , sizeof (cl_ulong), &offsetd));
2653+
2654+ size_t global_work_size[] = {(size_t )n, 1 , 1 };
2655+ size_t local_work_size[] = {64 , 1 , 1 };
2656+
2657+ #ifdef GGML_OPENCL_PROFILING
2658+ cl_event evt;
2659+ clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , &evt);
2660+
2661+ g_profiling_info.emplace_back ();
2662+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size, dst);
2663+ #else
2664+ clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , NULL );
2665+ #endif
2666+ }
2667+
25852668static void ggml_cl_silu (ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
25862669 GGML_ASSERT (src0);
25872670 GGML_ASSERT (src0->extra );
@@ -3980,36 +4063,70 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
39804063 float attn_factor;
39814064 float beta_fast;
39824065 float beta_slow;
4066+ int32_t sections[4 ];
39834067
39844068 memcpy (&freq_base, (int32_t *) dst->op_params + 5 , sizeof (float ));
39854069 memcpy (&freq_scale, (int32_t *) dst->op_params + 6 , sizeof (float ));
39864070 memcpy (&ext_factor, (int32_t *) dst->op_params + 7 , sizeof (float ));
39874071 memcpy (&attn_factor, (int32_t *) dst->op_params + 8 , sizeof (float ));
39884072 memcpy (&beta_fast, (int32_t *) dst->op_params + 9 , sizeof (float ));
39894073 memcpy (&beta_slow, (int32_t *) dst->op_params + 10 , sizeof (float ));
 4074+ memcpy (&sections, (int32_t *) dst->op_params + 11 , sizeof (int32_t )*4 );
39904075
39914076 const bool is_neox = mode & 2 ;
4077+ const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
4078+ const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
4079+
4080+ if (is_mrope) {
4081+ GGML_ASSERT (sections[0 ] > 0 || sections[1 ] > 0 || sections[2 ] > 0 );
4082+ }
4083+
4084+ if (is_vision) {
4085+ GGML_ASSERT (n_dims == ne00/2 );
4086+ }
39924087
39934088 cl_kernel kernel;
39944089
3995- if (! is_neox) {
4090+ if (is_neox) {
39964091 switch (src0->type ) {
39974092 case GGML_TYPE_F32:
3998- kernel = backend_ctx->kernel_rope_norm_f32 ;
4093+ kernel = backend_ctx->kernel_rope_neox_f32 ;
39994094 break ;
40004095 case GGML_TYPE_F16:
4001- kernel = backend_ctx->kernel_rope_norm_f16 ;
4096+ kernel = backend_ctx->kernel_rope_neox_f16 ;
4097+ break ;
4098+ default :
4099+ GGML_ASSERT (false );
4100+ };
4101+ } else if (is_mrope && !is_vision) {
4102+ switch (src0->type ) {
4103+ case GGML_TYPE_F32:
4104+ kernel = backend_ctx->kernel_rope_multi_f32 ;
4105+ break ;
4106+ case GGML_TYPE_F16:
4107+ kernel = backend_ctx->kernel_rope_multi_f16 ;
40024108 break ;
40034109 default :
40044110 GGML_ASSERT (false );
40054111 };
4112+ } else if (is_vision) {
4113+ switch (src0->type ) {
4114+ case GGML_TYPE_F32:
4115+ kernel = backend_ctx->kernel_rope_vision_f32 ;
4116+ break ;
4117+ case GGML_TYPE_F16:
4118+ kernel = backend_ctx->kernel_rope_vision_f16 ;
4119+ break ;
4120+ default :
4121+ GGML_ASSERT (false );
4122+ }
40064123 } else {
40074124 switch (src0->type ) {
40084125 case GGML_TYPE_F32:
4009- kernel = backend_ctx->kernel_rope_neox_f32 ;
4126+ kernel = backend_ctx->kernel_rope_norm_f32 ;
40104127 break ;
40114128 case GGML_TYPE_F16:
4012- kernel = backend_ctx->kernel_rope_neox_f16 ;
4129+ kernel = backend_ctx->kernel_rope_norm_f16 ;
40134130 break ;
40144131 default :
40154132 GGML_ASSERT (false );
@@ -4049,6 +4166,9 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
40494166 CL_CHECK (clSetKernelArg (kernel, 30 , sizeof (float ), &attn_factor));
40504167 CL_CHECK (clSetKernelArg (kernel, 31 , sizeof (float ), &beta_fast));
40514168 CL_CHECK (clSetKernelArg (kernel, 32 , sizeof (float ), &beta_slow));
4169+ if (is_mrope || is_vision) {
4170+ CL_CHECK (clSetKernelArg (kernel, 33 , sizeof (int32_t )*4 , §ions));
4171+ }
40524172
40534173 size_t global_work_size[] = {(size_t )ne01*nth, (size_t )ne02, (size_t )ne03};
40544174 size_t local_work_size[] = {(size_t )nth, 1 , 1 };
@@ -4064,6 +4184,98 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
40644184#endif
40654185}
40664186
4187+ static void ggml_cl_im2col (ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4188+ GGML_ASSERT (src0);
4189+ GGML_ASSERT (src1);
4190+ GGML_ASSERT (src1->extra );
4191+ GGML_ASSERT (dst);
4192+ GGML_ASSERT (dst->extra );
4193+
4194+ // src0 - filter, src1 - input
4195+ GGML_ASSERT (src1->type == GGML_TYPE_F32);
4196+ GGML_ASSERT (dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
4197+
4198+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context ;
4199+ cl_command_queue queue = backend_ctx->queue ;
4200+
4201+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra ;
4202+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra ;
4203+
4204+ cl_ulong offset1 = extra1->offset + src1->view_offs ;
4205+ cl_ulong offsetd = extrad->offset + dst->view_offs ;
4206+
4207+ const int32_t s0 = ((const int32_t *)(dst->op_params ))[0 ];
4208+ const int32_t s1 = ((const int32_t *)(dst->op_params ))[1 ];
4209+ const int32_t p0 = ((const int32_t *)(dst->op_params ))[2 ];
4210+ const int32_t p1 = ((const int32_t *)(dst->op_params ))[3 ];
4211+ const int32_t d0 = ((const int32_t *)(dst->op_params ))[4 ];
4212+ const int32_t d1 = ((const int32_t *)(dst->op_params ))[5 ];
4213+
4214+ const bool is_2D = ((const int32_t *)(dst->op_params ))[6 ] == 1 ;
4215+
4216+ const cl_long IC = src1->ne [is_2D ? 2 : 1 ];
4217+ const cl_long IH = is_2D ? src1->ne [1 ] : 1 ;
4218+ const cl_long IW = src1->ne [0 ];
4219+
4220+ const cl_long KH = is_2D ? src0->ne [1 ] : 1 ;
4221+ const cl_long KW = src0->ne [0 ];
4222+
4223+ const cl_long OH = is_2D ? dst->ne [2 ] : 1 ;
4224+ const cl_long OW = dst->ne [1 ];
4225+
4226+ // nb is byte offset, src is type float32
4227+ const cl_ulong delta_offset = src1->nb [is_2D ? 2 : 1 ]/4 ;
4228+ const cl_long batch = src1->ne [is_2D ? 3 : 2 ];
4229+ const cl_ulong batch_offset = src1->nb [is_2D ? 3 : 2 ]/4 ;
4230+
4231+ const cl_long pelements = OW*KW*KH;
4232+ const cl_long CHW = IC*KH*KW;
4233+
4234+ cl_kernel kernel;
4235+
4236+ if (dst->type == GGML_TYPE_F16) {
4237+ kernel = backend_ctx->kernel_im2col_f16 ;
4238+ } else {
4239+ kernel = backend_ctx->kernel_im2col_f32 ;
4240+ }
4241+
4242+ CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra1->data_device ));
4243+ CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset1));
4244+ CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extrad->data_device ));
4245+ CL_CHECK (clSetKernelArg (kernel, 3 , sizeof (cl_ulong), &offsetd));
4246+ CL_CHECK (clSetKernelArg (kernel, 4 , sizeof (cl_ulong), &batch_offset));
4247+ CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &delta_offset));
4248+ CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (cl_long), &IW));
4249+ CL_CHECK (clSetKernelArg (kernel, 7 , sizeof (cl_long), &IH));
4250+ CL_CHECK (clSetKernelArg (kernel, 8 , sizeof (cl_long), &IC));
4251+ CL_CHECK (clSetKernelArg (kernel, 9 , sizeof (cl_long), &OW));
4252+ CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (cl_long), &OH));
4253+ CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (cl_long), &KW));
4254+ CL_CHECK (clSetKernelArg (kernel, 12 , sizeof (cl_long), &KH));
4255+ CL_CHECK (clSetKernelArg (kernel, 13 , sizeof (cl_long), &pelements));
4256+ CL_CHECK (clSetKernelArg (kernel, 14 , sizeof (cl_long), &CHW));
4257+ CL_CHECK (clSetKernelArg (kernel, 15 , sizeof (int ), &s0));
4258+ CL_CHECK (clSetKernelArg (kernel, 16 , sizeof (int ), &s1));
4259+ CL_CHECK (clSetKernelArg (kernel, 17 , sizeof (int ), &p0));
4260+ CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (int ), &p1));
4261+ CL_CHECK (clSetKernelArg (kernel, 19 , sizeof (int ), &d0));
4262+ CL_CHECK (clSetKernelArg (kernel, 20 , sizeof (int ), &d1));
4263+
4264+ const int num_blocks = (pelements + 256 - 1 ) / 256 ;
4265+ size_t global_work_size[] = {(size_t )num_blocks*256 , (size_t )OH, (size_t )batch*IC};
4266+ size_t local_work_size[] = {256 , 1 , 1 };
4267+
4268+ #ifdef GGML_OPENCL_PROFILING
4269+ cl_event evt;
4270+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , &evt));
4271+
4272+ g_profiling_info.emplace_back ();
4273+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size, dst);
4274+ #else
4275+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , NULL ));
4276+ #endif
4277+ }
4278+
40674279// ------------------------------------------------------------------------------
40684280// Op offloading
40694281// ------------------------------------------------------------------------------
@@ -4122,6 +4334,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
41224334 }
41234335 func = ggml_cl_gelu;
41244336 break ;
4337+ case GGML_UNARY_OP_GELU_QUICK:
4338+ if (!any_on_device) {
4339+ return false ;
4340+ }
4341+ func = ggml_cl_gelu_quick;
4342+ break ;
41254343 case GGML_UNARY_OP_SILU:
41264344 if (!any_on_device) {
41274345 return false ;
@@ -4194,6 +4412,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
41944412 }
41954413 func = ggml_cl_rope;
41964414 break ;
4415+ case GGML_OP_IM2COL:
4416+ if (!any_on_device) {
4417+ return false ;
4418+ }
4419+ func = ggml_cl_im2col;
4420+ break ;
41974421 default :
41984422 return false ;
41994423 }
0 commit comments