Skip to content

Commit c785441

Browse files
committed
opencl: add div
1 parent 16e2544 commit c785441

File tree

3 files changed

+223
-0
lines changed

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ set(GGML_OPENCL_KERNELS
6060
cpy
6161
cvt
6262
diag_mask_inf
63+
div
6364
gelu
6465
gemv_noshuffle_general
6566
gemv_noshuffle

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ struct ggml_backend_opencl_context {
313313

314314
cl_kernel kernel_add, kernel_add_row;
315315
cl_kernel kernel_mul, kernel_mul_row;
316+
cl_kernel kernel_div, kernel_div_row;
316317
cl_kernel kernel_scale;
317318
cl_kernel kernel_silu, kernel_silu_4;
318319
cl_kernel kernel_gelu, kernel_gelu_4;
@@ -1004,6 +1005,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
10041005
GGML_LOG_CONT(".");
10051006
}
10061007

1008+
// div
1009+
{
1010+
#ifdef GGML_OPENCL_EMBED_KERNELS
1011+
const std::string kernel_src {
1012+
#include "div.cl.h"
1013+
};
1014+
#else
1015+
const std::string kernel_src = read_file("div.cl");
1016+
#endif
1017+
backend_ctx->program_mul =
1018+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1019+
1020+
CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_mul, "kernel_div", &err), err));
1021+
CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_mul, "kernel_div_row", &err), err));
1022+
GGML_LOG_CONT(".");
1023+
}
1024+
10071025
// Adreno kernels
10081026
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
10091027
// transpose
@@ -1874,6 +1892,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
18741892
case GGML_OP_ADD:
18751893
case GGML_OP_SCALE:
18761894
case GGML_OP_MUL:
1895+
case GGML_OP_DIV:
18771896
return op->src[0]->type == GGML_TYPE_F32;
18781897
case GGML_OP_UNARY:
18791898
switch (ggml_get_unary_op(op)) {
@@ -3258,6 +3277,131 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
32583277
}
32593278
}
32603279

3280+
static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3281+
GGML_ASSERT(src0);
3282+
GGML_ASSERT(src0->extra);
3283+
GGML_ASSERT(src1);
3284+
GGML_ASSERT(src1->extra);
3285+
GGML_ASSERT(dst);
3286+
GGML_ASSERT(dst->extra);
3287+
3288+
const int ne00 = src0->ne[0];
3289+
const int ne01 = src0->ne[1];
3290+
const int ne02 = src0->ne[2];
3291+
const int ne03 = src0->ne[3];
3292+
3293+
const cl_ulong nb00 = src0->nb[0];
3294+
const cl_ulong nb01 = src0->nb[1];
3295+
const cl_ulong nb02 = src0->nb[2];
3296+
const cl_ulong nb03 = src0->nb[3];
3297+
3298+
const int ne10 = src1->ne[0];
3299+
const int ne11 = src1->ne[1];
3300+
const int ne12 = src1->ne[2];
3301+
const int ne13 = src1->ne[3];
3302+
3303+
const cl_ulong nb10 = src1->nb[0];
3304+
const cl_ulong nb11 = src1->nb[1];
3305+
const cl_ulong nb12 = src1->nb[2];
3306+
const cl_ulong nb13 = src1->nb[3];
3307+
3308+
const int ne0 = dst->ne[0];
3309+
3310+
const cl_ulong nb0 = dst->nb[0];
3311+
const cl_ulong nb1 = dst->nb[1];
3312+
const cl_ulong nb2 = dst->nb[2];
3313+
const cl_ulong nb3 = dst->nb[3];
3314+
3315+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3316+
cl_command_queue queue = backend_ctx->queue;
3317+
3318+
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3319+
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
3320+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3321+
3322+
cl_ulong offset0 = extra0->offset + src0->view_offs;
3323+
cl_ulong offset1 = extra1->offset + src1->view_offs;
3324+
cl_ulong offsetd = extrad->offset + dst->view_offs;
3325+
3326+
bool bcast_row = false;
3327+
cl_kernel kernel;
3328+
3329+
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
3330+
GGML_ASSERT(ggml_is_contiguous(src0));
3331+
3332+
// src1 is a row
3333+
GGML_ASSERT(ne11 == 1);
3334+
3335+
bcast_row = true;
3336+
int ne = ne00 / 4;
3337+
kernel = backend_ctx->kernel_div_row;
3338+
3339+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3340+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3341+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3342+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3343+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3344+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3345+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
3346+
} else {
3347+
kernel = backend_ctx->kernel_div;
3348+
3349+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3350+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3351+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3352+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3353+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3354+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3355+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
3356+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
3357+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
3358+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
3359+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
3360+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
3361+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
3362+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
3363+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
3364+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
3365+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
3366+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
3367+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
3368+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
3369+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
3370+
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
3371+
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
3372+
}
3373+
3374+
if (bcast_row) {
3375+
int n = ggml_nelements(dst)/4;
3376+
size_t global_work_size[] = {(size_t)n, 1, 1};
3377+
size_t local_work_size[] = {64, 1, 1};
3378+
3379+
#ifdef GGML_OPENCL_PROFILING
3380+
cl_event evt;
3381+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3382+
3383+
g_profiling_info.emplace_back();
3384+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3385+
#else
3386+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3387+
#endif
3388+
} else {
3389+
unsigned int nth = MIN(64, ne0);
3390+
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3391+
size_t local_work_size[] = {nth, 1, 1};
3392+
3393+
#ifdef GGML_OPENCL_PROFILING
3394+
cl_event evt;
3395+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3396+
3397+
g_profiling_info.emplace_back();
3398+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3399+
#else
3400+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3401+
#endif
3402+
}
3403+
}
3404+
32613405
static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
32623406
GGML_ASSERT(src0);
32633407
GGML_ASSERT(src0->extra);
@@ -5098,6 +5242,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
50985242
}
50995243
func = ggml_cl_mul;
51005244
break;
5245+
case GGML_OP_DIV:
5246+
if (!any_on_device) {
5247+
return false;
5248+
}
5249+
func = ggml_cl_div;
5250+
break;
51015251
case GGML_OP_UNARY:
51025252
switch (ggml_get_unary_op(tensor)) {
51035253
case GGML_UNARY_OP_GELU:
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2+
3+
//------------------------------------------------------------------------------
4+
// div
5+
//------------------------------------------------------------------------------
6+
kernel void kernel_div(
7+
global char * src0,
8+
ulong offset0,
9+
global char * src1,
10+
ulong offset1,
11+
global char * dst,
12+
ulong offsetd,
13+
ulong nb00,
14+
ulong nb01,
15+
ulong nb02,
16+
ulong nb03,
17+
int ne10,
18+
int ne11,
19+
int ne12,
20+
int ne13,
21+
ulong nb10,
22+
ulong nb11,
23+
ulong nb12,
24+
ulong nb13,
25+
int ne0,
26+
ulong nb0,
27+
ulong nb1,
28+
ulong nb2,
29+
ulong nb3
30+
) {
31+
src0 = src0 + offset0;
32+
src1 = src1 + offset1;
33+
dst = dst + offsetd;
34+
35+
int i03 = get_group_id(2);
36+
int i02 = get_group_id(1);
37+
int i01 = get_group_id(0);
38+
39+
int i13 = i03 % ne13;
40+
int i12 = i02 % ne12;
41+
int i11 = i01 % ne11;
42+
43+
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
44+
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
45+
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
46+
47+
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
48+
const int i10 = i0 % ne10;
49+
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10));
50+
}
51+
}
52+
53+
// assumption: src1 is a row
54+
// broadcast src1 into src0
55+
kernel void kernel_div_row(
56+
global float4 * src0,
57+
ulong offset0,
58+
global float4 * src1,
59+
ulong offset1,
60+
global float4 * dst,
61+
ulong offsetd,
62+
int ne
63+
) {
64+
src0 = (global float4*)((global char*)src0 + offset0);
65+
src1 = (global float4*)((global char*)src1 + offset1);
66+
dst = (global float4*)((global char*)dst + offsetd);
67+
68+
// This performs better than using %.
69+
uint gid = get_global_id(0);
70+
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
71+
dst[gid] = src0[gid] / src1[idx1];
72+
}

0 commit comments

Comments
 (0)