Skip to content

Commit b178fb9

Browse files
opencl: use ulong for offsets and strides in ADD kernel
1 parent f789386 commit b178fb9

File tree

2 files changed

+82
-82
lines changed

2 files changed

+82
-82
lines changed

ggml/src/ggml-opencl2/ggml-opencl2.cpp

Lines changed: 52 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -2031,30 +2031,30 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
20312031
const int ne02 = src0 ? src0->ne[2] : 0;
20322032
const int ne03 = src0 ? src0->ne[3] : 0;
20332033

2034-
const int nb00 = src0 ? src0->nb[0] : 0;
2035-
const int nb01 = src0 ? src0->nb[1] : 0;
2036-
const int nb02 = src0 ? src0->nb[2] : 0;
2037-
const int nb03 = src0 ? src0->nb[3] : 0;
2034+
const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
2035+
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
2036+
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
2037+
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
20382038

20392039
const int ne10 = src1 ? src1->ne[0] : 0;
20402040
const int ne11 = src1 ? src1->ne[1] : 0;
20412041
const int ne12 = src1 ? src1->ne[2] : 0;
20422042
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
20432043

2044-
const int nb10 = src1 ? src1->nb[0] : 0;
2045-
const int nb11 = src1 ? src1->nb[1] : 0;
2046-
const int nb12 = src1 ? src1->nb[2] : 0;
2047-
const int nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
2044+
const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
2045+
const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
2046+
const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
2047+
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
20482048

20492049
const int ne0 = dst ? dst->ne[0] : 0;
20502050
const int ne1 = dst ? dst->ne[1] : 0;
20512051
const int ne2 = dst ? dst->ne[2] : 0;
20522052
const int ne3 = dst ? dst->ne[3] : 0;
20532053

2054-
const int nb0 = dst ? dst->nb[0] : 0;
2055-
const int nb1 = dst ? dst->nb[1] : 0;
2056-
const int nb2 = dst ? dst->nb[2] : 0;
2057-
const int nb3 = dst ? dst->nb[3] : 0;
2054+
const cl_ulong nb0 = dst ? dst->nb[0] : 0;
2055+
const cl_ulong nb1 = dst ? dst->nb[1] : 0;
2056+
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
2057+
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
20582058

20592059
ggml_backend_opencl2_context *backend_ctx = (ggml_backend_opencl2_context *)backend->context;
20602060
cl_command_queue queue = backend_ctx->queue;
@@ -2063,9 +2063,9 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
20632063
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
20642064
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
20652065

2066-
int offset0 = extra0->offset + src0->view_offs;
2067-
int offset1 = extra1->offset + src1->view_offs;
2068-
int offsetd = extrad->offset + dst->view_offs;
2066+
cl_ulong offset0 = extra0->offset + src0->view_offs;
2067+
cl_ulong offset1 = extra1->offset + src1->view_offs;
2068+
cl_ulong offsetd = extrad->offset + dst->view_offs;
20692069

20702070
bool bcast_row = false;
20712071
int nb = ne00;
@@ -2081,46 +2081,46 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
20812081
nb = ne00 / 4;
20822082
kernel = backend_ctx->kernel_add_row;
20832083

2084-
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2085-
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
2086-
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
2087-
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
2088-
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
2089-
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
2090-
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &nb));
2084+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2085+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2086+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
2087+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
2088+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
2089+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
2090+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &nb));
20912091
} else {
20922092
kernel = backend_ctx->kernel_add;
20932093

2094-
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2095-
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
2096-
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
2097-
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
2098-
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
2099-
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
2100-
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
2101-
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
2102-
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
2103-
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
2104-
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &nb00));
2105-
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &nb01));
2106-
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &nb02));
2107-
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &nb03));
2108-
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
2109-
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
2110-
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
2111-
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
2112-
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &nb10));
2113-
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &nb11));
2114-
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &nb12));
2115-
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &nb13));
2116-
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
2117-
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
2118-
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
2119-
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
2120-
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &nb0));
2121-
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(int), &nb1));
2122-
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(int), &nb2));
2123-
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(int), &nb3));
2094+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2095+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2096+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
2097+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
2098+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
2099+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
2100+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
2101+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
2102+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
2103+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
2104+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
2105+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
2106+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
2107+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
2108+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
2109+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
2110+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
2111+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
2112+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
2113+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
2114+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
2115+
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
2116+
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
2117+
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
2118+
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
2119+
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
2120+
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
2121+
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
2122+
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
2123+
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
21242124
}
21252125

21262126
if (bcast_row) {

ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl

Lines changed: 30 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -237,35 +237,35 @@ void dequantize_q4_0_f16(global struct block_q4_0 * xb, short il, half16 * reg)
237237
// cons: not very efficient
238238
kernel void kernel_add(
239239
global char * src0,
240-
int offset0,
240+
ulong offset0,
241241
global char * src1,
242-
int offset1,
242+
ulong offset1,
243243
global char * dst,
244-
int offsetd,
245-
int ne00,
246-
int ne01,
247-
int ne02,
248-
int ne03,
249-
int nb00,
250-
int nb01,
251-
int nb02,
252-
int nb03,
253-
int ne10,
254-
int ne11,
255-
int ne12,
256-
int ne13,
257-
int nb10,
258-
int nb11,
259-
int nb12,
260-
int nb13,
261-
int ne0,
262-
int ne1,
263-
int ne2,
264-
int ne3,
265-
int nb0,
266-
int nb1,
267-
int nb2,
268-
int nb3
244+
ulong offsetd,
245+
int ne00,
246+
int ne01,
247+
int ne02,
248+
int ne03,
249+
ulong nb00,
250+
ulong nb01,
251+
ulong nb02,
252+
ulong nb03,
253+
int ne10,
254+
int ne11,
255+
int ne12,
256+
int ne13,
257+
ulong nb10,
258+
ulong nb11,
259+
ulong nb12,
260+
ulong nb13,
261+
int ne0,
262+
int ne1,
263+
int ne2,
264+
int ne3,
265+
ulong nb0,
266+
ulong nb1,
267+
ulong nb2,
268+
ulong nb3
269269
) {
270270
src0 = src0 + offset0;
271271
src1 = src1 + offset1;
@@ -293,11 +293,11 @@ kernel void kernel_add(
293293
// broadcast src1 into src0
294294
kernel void kernel_add_row(
295295
global float4 * src0,
296-
int offset0,
296+
ulong offset0,
297297
global float4 * src1,
298-
int offset1,
298+
ulong offset1,
299299
global float4 * dst,
300-
int offsetd,
300+
ulong offsetd,
301301
int nb
302302
) {
303303
src0 = (global float4*)((global char*)src0 + offset0);

0 commit comments

Comments
 (0)