@@ -313,6 +313,7 @@ struct ggml_backend_opencl_context {
 
     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;
+    cl_kernel kernel_div, kernel_div_row;
     cl_kernel kernel_scale;
     cl_kernel kernel_silu, kernel_silu_4;
     cl_kernel kernel_gelu, kernel_gelu_4;
@@ -1004,6 +1005,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // div
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "div.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("div.cl");
+#endif
+        cl_program program_div =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_div     = clCreateKernel(program_div, "kernel_div",     &err), err));
+        CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(program_div, "kernel_div_row", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
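
Note: div.cl itself is not shown in this diff. For reference, a plausible sketch of what it would contain, modeled line-for-line on the existing mul.cl kernels; the kernel names and argument order are assumed from the host code in ggml_cl_div below, with float division replacing multiplication:

// div.cl -- hypothetical sketch, not part of this diff.

kernel void kernel_div(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        int ne13,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        ulong nb0,
        ulong nb1,
        ulong nb2,
        ulong nb3
) {
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst  = dst  + offsetd;

    // One work-group per dst row; src1 indices wrap for broadcasting.
    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0);

    const int i13 = i03 % ne13;
    const int i12 = i02 % ne12;
    const int i11 = i01 % ne11;

    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;
        *((global float *)(dst_ptr + i0*nb0)) =
            *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10));
    }
}

// Fast path for the host's bcast_row case: contiguous f32 tensors,
// ne00 % 4 == 0, src1 a single row, processed as float4.
kernel void kernel_div_row(
        global float4 * src0,
        ulong offset0,
        global float4 * src1,
        ulong offset1,
        global float4 * dst,
        ulong offsetd,
        int ne
) {
    src0 = (global float4*)((global char*)src0 + offset0);
    src1 = (global float4*)((global char*)src1 + offset1);
    dst  = (global float4*)((global char*)dst  + offsetd);

    const int gid = get_global_id(0);
    dst[gid] = src0[gid] / src1[gid % ne];
}
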
@@ -1874,6 +1892,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_ADD:
         case GGML_OP_SCALE:
         case GGML_OP_MUL:
+        case GGML_OP_DIV:
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -3258,6 +3277,131 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
     }
 }
 
+static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    bool bcast_row = false;
+    cl_kernel kernel;
+
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_div_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
+    } else {
+        kernel = backend_ctx->kernel_div;
+
+        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+    }
+
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+}
+
 static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
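
For context (not part of this diff): a GGML_OP_DIV node reaches ggml_cl_div through the normal graph path. A minimal host-side sketch using the public ggml API, with made-up shapes for illustration; because b is a single contiguous row and ne00 % 4 == 0, this graph would take the kernel_div_row fast path above, while any other shape falls back to kernel_div:

#include "ggml.h"

static struct ggml_cgraph * build_div_graph(struct ggml_context * ctx) {
    // a: 32 rows of 4096 f32 values; b: one 4096-wide row.
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 32);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

    // ggml_div broadcasts b across the rows of a, producing a GGML_OP_DIV node.
    struct ggml_tensor * c = ggml_div(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);
    return gf;
}
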
@@ -5098,6 +5242,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_mul;
             break;
+        case GGML_OP_DIV:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_div;
+            break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
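
Not part of this diff: for end-to-end verification, llama.cpp's test-backend-ops harness already exercises GGML_OP_DIV among its binary-broadcast cases, comparing the OpenCL backend's output against the CPU reference. An invocation along the lines of test-backend-ops test -o DIV (exact flag spelling assumed from the harness's usual interface) should cover both the row-broadcast and general kernels added here.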