@@ -7024,25 +7024,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     if (src1t == GGML_TYPE_F32 &&
         ne00 % 16 == 0 &&
         ne11 > 1) {
-        cl_mem mem_src0 = extra0->data_device;
-        cl_mem mem_src1 = extra1->data_device;
-
-        if (!ggml_is_contiguous(src0)) {
-            backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
-            ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
-                nb00, nb01, nb02, nb03);
-            mem_src0 = backend_ctx->prealloc_src0.buffer;
-            offset0 = 0;
-        }
-
-        if (!ggml_is_contiguous(src1)) {
-            backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
-            ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
-                nb10, nb11, nb12, nb13);
-            mem_src1 = backend_ctx->prealloc_src1.buffer;
-            offset1 = 0;
-        }
-
         switch (src0t) {
             case GGML_TYPE_F32: {
                 kernel = backend_ctx->kernel_mul_mm_f32_f32_l4_lm;
@@ -7052,6 +7033,25 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;
 
+                cl_mem mem_src0 = extra0->data_device;
+                cl_mem mem_src1 = extra1->data_device;
+
+                if (!ggml_is_contiguous(src0)) {
+                    backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+                    ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+                        nb00, nb01, nb02, nb03);
+                    mem_src0 = backend_ctx->prealloc_src0.buffer;
+                    offset0 = 0;
+                }
+
+                if (!ggml_is_contiguous(src1)) {
+                    backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+                    ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+                        nb10, nb11, nb12, nb13);
+                    mem_src1 = backend_ctx->prealloc_src1.buffer;
+                    offset1 = 0;
+                }
+
                 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
                 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
                 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
@@ -7087,6 +7087,25 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;
 
+                cl_mem mem_src0 = extra0->data_device;
+                cl_mem mem_src1 = extra1->data_device;
+
+                if (!ggml_is_contiguous(src0)) {
+                    backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+                    ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+                        nb00, nb01, nb02, nb03);
+                    mem_src0 = backend_ctx->prealloc_src0.buffer;
+                    offset0 = 0;
+                }
+
+                if (!ggml_is_contiguous(src1)) {
+                    backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+                    ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+                        nb10, nb11, nb12, nb13);
+                    mem_src1 = backend_ctx->prealloc_src1.buffer;
+                    offset1 = 0;
+                }
+
                 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
                 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
                 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
@@ -7150,6 +7169,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 return;
             }
             case GGML_TYPE_Q8_0: {
+                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+                    break;
+                }
+
                 kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm;
                 nth0 = 128; // calculated as (BM*BN)/(TM*TN)