Skip to content

Commit 3e946aa

Browse files
committed
opencl: only copy to cont (contiguous buffer) for f32 and f16 tensors
1 parent 5dc0b5f commit 3e946aa

File tree

1 file changed: +42 / -19 lines changed

1 file changed: +42 / -19 lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 42 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -7024,25 +7024,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
70247024
if (src1t == GGML_TYPE_F32 &&
70257025
ne00 % 16 == 0 &&
70267026
ne11 > 1) {
7027-
cl_mem mem_src0 = extra0->data_device;
7028-
cl_mem mem_src1 = extra1->data_device;
7029-
7030-
if (!ggml_is_contiguous(src0)) {
7031-
backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
7032-
ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
7033-
nb00, nb01, nb02, nb03);
7034-
mem_src0 = backend_ctx->prealloc_src0.buffer;
7035-
offset0 = 0;
7036-
}
7037-
7038-
if (!ggml_is_contiguous(src1)) {
7039-
backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
7040-
ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
7041-
nb10, nb11, nb12, nb13);
7042-
mem_src1 = backend_ctx->prealloc_src1.buffer;
7043-
offset1 = 0;
7044-
}
7045-
70467027
switch(src0t) {
70477028
case GGML_TYPE_F32: {
70487029
kernel = backend_ctx->kernel_mul_mm_f32_f32_l4_lm;
@@ -7052,6 +7033,25 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
70527033
int batch_stride_b = ne10*ne11;
70537034
int batch_stride_d = ne0*ne1;
70547035

7036+
cl_mem mem_src0 = extra0->data_device;
7037+
cl_mem mem_src1 = extra1->data_device;
7038+
7039+
if (!ggml_is_contiguous(src0)) {
7040+
backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
7041+
ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
7042+
nb00, nb01, nb02, nb03);
7043+
mem_src0 = backend_ctx->prealloc_src0.buffer;
7044+
offset0 = 0;
7045+
}
7046+
7047+
if (!ggml_is_contiguous(src1)) {
7048+
backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
7049+
ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
7050+
nb10, nb11, nb12, nb13);
7051+
mem_src1 = backend_ctx->prealloc_src1.buffer;
7052+
offset1 = 0;
7053+
}
7054+
70557055
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
70567056
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
70577057
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
@@ -7087,6 +7087,25 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
70877087
int batch_stride_b = ne10*ne11;
70887088
int batch_stride_d = ne0*ne1;
70897089

7090+
cl_mem mem_src0 = extra0->data_device;
7091+
cl_mem mem_src1 = extra1->data_device;
7092+
7093+
if (!ggml_is_contiguous(src0)) {
7094+
backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
7095+
ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
7096+
nb00, nb01, nb02, nb03);
7097+
mem_src0 = backend_ctx->prealloc_src0.buffer;
7098+
offset0 = 0;
7099+
}
7100+
7101+
if (!ggml_is_contiguous(src1)) {
7102+
backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
7103+
ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
7104+
nb10, nb11, nb12, nb13);
7105+
mem_src1 = backend_ctx->prealloc_src1.buffer;
7106+
offset1 = 0;
7107+
}
7108+
70907109
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
70917110
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
70927111
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
@@ -7150,6 +7169,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
71507169
return;
71517170
}
71527171
case GGML_TYPE_Q8_0: {
7172+
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
7173+
break;
7174+
}
7175+
71537176
kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm;
71547177
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
71557178

0 commit comments

Comments
 (0)