@@ -313,6 +313,7 @@ struct ggml_backend_opencl_context {
 
     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;
+    cl_kernel kernel_div, kernel_div_row;
     cl_kernel kernel_scale;
     cl_kernel kernel_silu, kernel_silu_4;
     cl_kernel kernel_gelu, kernel_gelu_4;
@@ -1004,6 +1005,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // div
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "div.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("div.cl");
+#endif
+        backend_ctx->program_mul =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_div     = clCreateKernel(backend_ctx->program_mul, "kernel_div",     &err), err));
+        CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_mul, "kernel_div_row", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
@@ -1874,6 +1892,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_ADD:
         case GGML_OP_SCALE:
         case GGML_OP_MUL:
+        case GGML_OP_DIV:
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -3258,6 +3277,131 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
     }
 }
 
+static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    bool bcast_row = false;
+    cl_kernel kernel;
+
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_div_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
+    } else {
+        kernel = backend_ctx->kernel_div;
+
+        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+    }
+
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+}
+
 static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -5098,6 +5242,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_mul;
             break;
+        case GGML_OP_DIV:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_div;
+            break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
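
Note: the diff above contains only the host-side changes. It registers kernel_div / kernel_div_row, advertises GGML_OP_DIV in ggml_opencl_supports_op, and dispatches to ggml_cl_div; the device source (div.cl) is not included. As a rough, non-authoritative sketch, the kernel pair could look like the following, with names and argument order inferred from the clSetKernelArg calls above and assuming it mirrors the backend's existing element-wise add/mul kernels (the actual div.cl in this commit may differ):

// kernel_div: generic element-wise division with broadcasting over src1.
// One work-group per dst row, matching the {ne01*nth, ne02, ne03} launch above.
kernel void kernel_div(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        ulong nb00, ulong nb01, ulong nb02, ulong nb03,
        int   ne10, int   ne11, int   ne12, int   ne13,
        ulong nb10, ulong nb11, ulong nb12, ulong nb13,
        int   ne0,
        ulong nb0,  ulong nb1,  ulong nb2,  ulong nb3
) {
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst  = dst  + offsetd;

    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    // broadcast src1 over the outer dimensions
    int i13 = i03 % ne13;
    int i12 = i02 % ne12;
    int i11 = i01 % ne11;

    global char * src0_row = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_row = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_row  = dst  + i03*nb3  + i02*nb2  + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;
        *((global float *)(dst_row + i0*nb0)) =
            *((global float *)(src0_row + i0*nb00)) / *((global float *)(src1_row + i10*nb10));
    }
}

// kernel_div_row: fast path taken when src1 is a single contiguous row and
// ne00 % 4 == 0; works on float4 with one work-item per float4 of dst.
kernel void kernel_div_row(
        global float4 * src0,
        ulong           offset0,
        global float4 * src1,
        ulong           offset1,
        global float4 * dst,
        ulong           offsetd,
        int             ne
) {
    src0 = (global float4*)((global char*)src0 + offset0);
    src1 = (global float4*)((global char*)src1 + offset1);
    dst  = (global float4*)((global char*)dst  + offsetd);

    uint gid  = get_global_id(0);
    uint idx1 = gid % ne;   // wrap around the broadcast row
    dst[gid] = src0[gid] / src1[idx1];
}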