@@ -581,6 +581,7 @@ struct ggml_backend_opencl_context {
581581 cl_kernel kernel_transpose_32;
582582 cl_kernel kernel_transpose_32_16;
583583 cl_kernel kernel_transpose_16;
584+ cl_kernel kernel_transpose_16_4x1;
584585
585586 cl_mem A_s_d_max; // max scale buffer size for transpose
586587 cl_mem A_q_d_max; // max weight buffer size for transpose
@@ -1664,6 +1665,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
16641665 CL_CHECK ((backend_ctx->kernel_transpose_32_16 = clCreateKernel (backend_ctx->program_transpose , " kernel_transpose_32_16" , &err), err));
16651666 CL_CHECK ((backend_ctx->kernel_transpose_32 = clCreateKernel (backend_ctx->program_transpose , " kernel_transpose_32" , &err), err));
16661667 CL_CHECK ((backend_ctx->kernel_transpose_16 = clCreateKernel (backend_ctx->program_transpose , " kernel_transpose_16" , &err), err));
1668+ CL_CHECK ((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel (backend_ctx->program_transpose , " kernel_transpose_16_4x1" , &err), err));
16671669 GGML_LOG_CONT (" ." );
16681670 }
16691671
@@ -2981,7 +2983,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
29812983 // cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
29822984 CL_CHECK (err);
29832985
2984- // size_t d_size_bytes = M * (K / 32) / 2 * sizeof(float);
2986+ bool K_tile_trans = true ;
2987+ if ((K / 32 ) % 4 != 0 ){
2988+ K_tile_trans =false ;
2989+ }
29852990 size_t d_size_bytes = M * (K / 32 ) * 2 ;
29862991 region.origin = 0 ;
29872992 region.size = d_size_bytes;
@@ -3022,10 +3027,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
30223027 qT_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
30233028 CL_CHECK (err);
30243029
3025- img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
30263030 memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
3031+ if (K_tile_trans) {
3032+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
3033+ img_desc_1d.image_width = M * K / 32 / 4 ;
3034+ } else {
3035+ img_fmt_1d = { CL_R, CL_HALF_FLOAT };
3036+ img_desc_1d.image_width = M * K / 32 ;
3037+ }
30273038 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
3028- img_desc_1d.image_width = M * K / 32 / 4 ;
30293039 img_desc_1d.buffer = extra->d ;
30303040 d_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
30313041 CL_CHECK (err);
@@ -3061,6 +3071,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
30613071 int width_s = K / 32 / 4 ;
30623072
30633073 kernel = backend_ctx->kernel_transpose_16 ;
3074+ if (!K_tile_trans) {
3075+ kernel = backend_ctx->kernel_transpose_16_4x1 ;
3076+ width_s = K / 32 ;
3077+ }
30643078 CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &d_d_image1D));
30653079 CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_mem), &dT_d_image1D));
30663080 CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (int ), &height_s));
0 commit comments