Skip to content

Commit 4084bd1

Browse files
committed
Q4_0 tranpose fix for Adreno
1 parent 3427959 commit 4084bd1

File tree

2 files changed

+37
-3
lines changed

2 files changed

+37
-3
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,7 @@ struct ggml_backend_opencl_context {
581581
cl_kernel kernel_transpose_32;
582582
cl_kernel kernel_transpose_32_16;
583583
cl_kernel kernel_transpose_16;
584+
cl_kernel kernel_transpose_16_4x1;
584585

585586
cl_mem A_s_d_max; // max scale buffer size for transpose
586587
cl_mem A_q_d_max; // max weight buffer size for transpose
@@ -1664,6 +1665,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
16641665
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
16651666
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
16661667
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
1668+
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
16671669
GGML_LOG_CONT(".");
16681670
}
16691671

@@ -2981,7 +2983,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
29812983
// cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
29822984
CL_CHECK(err);
29832985

2984-
// size_t d_size_bytes = M * (K / 32) / 2 * sizeof(float);
2986+
bool K_tile_trans = true;
2987+
if ((K / 32) % 4 != 0){
2988+
K_tile_trans =false;
2989+
}
29852990
size_t d_size_bytes = M * (K / 32) * 2;
29862991
region.origin = 0;
29872992
region.size = d_size_bytes;
@@ -3022,10 +3027,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
30223027
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
30233028
CL_CHECK(err);
30243029

3025-
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
30263030
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
3031+
if (K_tile_trans) {
3032+
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
3033+
img_desc_1d.image_width = M * K / 32 / 4;
3034+
} else {
3035+
img_fmt_1d = { CL_R, CL_HALF_FLOAT };
3036+
img_desc_1d.image_width = M * K / 32;
3037+
}
30273038
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
3028-
img_desc_1d.image_width = M * K / 32 / 4;
30293039
img_desc_1d.buffer = extra->d;
30303040
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
30313041
CL_CHECK(err);
@@ -3061,6 +3071,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
30613071
int width_s = K / 32 / 4;
30623072

30633073
kernel = backend_ctx->kernel_transpose_16;
3074+
if (!K_tile_trans) {
3075+
kernel = backend_ctx->kernel_transpose_16_4x1;
3076+
width_s = K / 32;
3077+
}
30643078
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
30653079
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
30663080
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));

ggml/src/ggml-opencl/kernels/transpose.cl

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,26 @@ kernel void kernel_transpose_16(
2424
write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
2525
}
2626

27+
// Padded kernel for irregular shape
28+
kernel void kernel_transpose_16_4x1(
29+
__read_only image1d_buffer_t input,
30+
__write_only image1d_buffer_t output,
31+
const uint rows,
32+
const uint cols
33+
) {
34+
35+
const int i = get_global_id(0);
36+
const int j = get_global_id(1);
37+
const int j_2 = j << 2;
38+
39+
half temp0 = read_imageh(input, (j_2 + 0) * cols + i).x;
40+
half temp1 = read_imageh(input, (j_2 + 1) * cols + i).x;
41+
half temp2 = read_imageh(input, (j_2 + 2) * cols + i).x;
42+
half temp3 = read_imageh(input, (j_2 + 3) * cols + i).x;
43+
44+
write_imageh(output, i * rows + j, (half4)(temp0, temp1, temp2, temp3));
45+
}
46+
2747
// 32-bit transpose, loading/storing a 4x4 tile of elements
2848
kernel void kernel_transpose_32(
2949
__read_only image1d_buffer_t input,

0 commit comments

Comments
 (0)