Skip to content

Commit 097f869

Browse files
quic-sszotlhez
authored andcommitted
opencl: fix transpose_16, dump_tensor, enforce subgroup size
1 parent b0a765c commit 097f869

File tree

4 files changed

+43
-34
lines changed

4 files changed

+43
-34
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1365,6 +1365,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
13651365
int M = tensor->ne[1]; // ne01
13661366
int K = tensor->ne[0]; // ne00
13671367

1368+
//For matrix-vector multiplication kernel, we assume K is a multiple of 32
1369+
GGML_ASSERT(K % 32 == 0);
1370+
//For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
1371+
GGML_ASSERT(M % 4 == 0);
1372+
13681373
// transpose is out of place, so we need to allocate transposed buffers
13691374
// <----------------------------------------------------------------------------------> //
13701375
// use sub_buffer of max buffer size instead
@@ -1405,36 +1410,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14051410
cl_mem qT_d_image1D;
14061411
cl_mem dT_d_image1D;
14071412

1408-
cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
1413+
cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14091414
cl_image_desc img_desc_1d;
14101415

14111416
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
14121417
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1413-
img_desc_1d.image_width = M * K / 8 / 4;
1418+
img_desc_1d.image_width = M * K / 4 / 4;
14141419
img_desc_1d.buffer = extra->q;
14151420
q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
14161421
CL_CHECK(err);
14171422

1418-
img_fmt_1d = { CL_RGBA, CL_FLOAT };
1423+
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14191424
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
14201425
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1421-
img_desc_1d.image_width = M * K / 8 / 4;
1426+
img_desc_1d.image_width = M * K / 4 / 4;
14221427
img_desc_1d.buffer = qT_d;
14231428
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
14241429
CL_CHECK(err);
14251430

1426-
img_fmt_1d = { CL_RGBA, CL_FLOAT };
1431+
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14271432
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
14281433
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1429-
img_desc_1d.image_width = M * K / 32 / 4 / 2;
1434+
img_desc_1d.image_width = M * K / 32 / 4;
14301435
img_desc_1d.buffer = extra->d;
14311436
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
14321437
CL_CHECK(err);
14331438

1434-
img_fmt_1d = { CL_RGBA, CL_FLOAT };
1439+
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14351440
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
14361441
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1437-
img_desc_1d.image_width = M * K / 32 / 4 / 2;
1442+
img_desc_1d.image_width = M * K / 32 / 4;
14381443
img_desc_1d.buffer = dT_d;
14391444
dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
14401445
CL_CHECK(err);
@@ -1443,8 +1448,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14431448
// set up and call the transpose kernels
14441449
// <----------------------------------------------------------------------------------> //
14451450
// weights
1446-
int height_q = M / 8;
1447-
int width_q = K / 8 / 4;
1451+
int height_q = M / 4;
1452+
int width_q = K / 4 / 4;
14481453
kernel = backend_ctx->kernel_transpose_16;
14491454

14501455
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
@@ -1458,8 +1463,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14581463
CL_CHECK(clWaitForEvents(1, &evt));
14591464

14601465
// scales
1461-
int height_s = M / 8;
1462-
int width_s = K / 32 / 8;
1466+
int height_s = M / 4;
1467+
int width_s = K / 32 / 4;
14631468

14641469
kernel = backend_ctx->kernel_transpose_16;
14651470
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
@@ -1853,7 +1858,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
18531858
void * buf_d;
18541859
#endif
18551860

1856-
#ifdef GGML_USE_OPENCL
18571861
// Make sure everything is done.
18581862
CL_CHECK(clFinish(queue));
18591863

@@ -1889,7 +1893,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
18891893
extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
18901894
CL_CHECK(clFinish(queue));
18911895
#endif // GGML_OPENCL_SOA_Q
1892-
#endif // GGML_USE_OPENCL
18931896

18941897
// Open file and dump.
18951898
char fname[512];

ggml/src/ggml-opencl/kernels/ggml-opencl.cl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1659,6 +1659,9 @@ kernel void kernel_mul_mat_f16_f16(
16591659
//------------------------------------------------------------------------------
16601660
// mul_mat_f16_f32_1row
16611661
//------------------------------------------------------------------------------
1662+
#ifdef ADRENO_GPU
1663+
REQD_SUBGROUP_SIZE_64
1664+
#endif
16621665
kernel void kernel_mul_mat_f16_f32_1row(
16631666
global char * src0,
16641667
ulong offset0,

ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,16 @@
77
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
88
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
99

10-
__attribute__((qcom_reqd_sub_group_size("full")))
10+
#ifdef cl_qcom_reqd_sub_group_size
11+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
12+
#define ADRENO_GPU 1
13+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
14+
#endif
15+
16+
#ifdef ADRENO_GPU
17+
REQD_SUBGROUP_SIZE_128
18+
#endif
19+
1120
kernel void kernel_mul_mat_Ab_Bi_8x4(
1221
global const ushort * src0_q, // quantized A
1322
global const half * src0_d, // A scales
Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
// 16-bit transpose, loading/storing an 8x8 tile of elements
1+
// 16-bit transpose, loading/storing a 4x4 tile of elements
2+
3+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
24

35
kernel void kernel_transpose_16(
46
__read_only image1d_buffer_t input,
@@ -9,24 +11,16 @@ kernel void kernel_transpose_16(
911

1012
const int i = get_global_id(0);
1113
const int j = get_global_id(1);
12-
const int i_3 = i<<3;
13-
const int j_3 = j<<3;
14+
const int i_2 = i<<2;
15+
const int j_2 = j<<2;
1416

15-
ushort8 temp0 = as_ushort8(read_imagef(input, (j_3+0)*cols+i));
16-
ushort8 temp1 = as_ushort8(read_imagef(input, (j_3+1)*cols+i));
17-
ushort8 temp2 = as_ushort8(read_imagef(input, (j_3+2)*cols+i));
18-
ushort8 temp3 = as_ushort8(read_imagef(input, (j_3+3)*cols+i));
19-
ushort8 temp4 = as_ushort8(read_imagef(input, (j_3+4)*cols+i));
20-
ushort8 temp5 = as_ushort8(read_imagef(input, (j_3+5)*cols+i));
21-
ushort8 temp6 = as_ushort8(read_imagef(input, (j_3+6)*cols+i));
22-
ushort8 temp7 = as_ushort8(read_imagef(input, (j_3+7)*cols+i));
17+
half4 temp0 = read_imageh(input, (j_2+0)*cols+i);
18+
half4 temp1 = read_imageh(input, (j_2+1)*cols+i);
19+
half4 temp2 = read_imageh(input, (j_2+2)*cols+i);
20+
half4 temp3 = read_imageh(input, (j_2+3)*cols+i);
2321

24-
write_imagef(output, (i_3+0)*rows+j, as_float4((ushort8)(temp0.s0, temp1.s0, temp2.s0, temp3.s0, temp4.s0, temp5.s0, temp6.s0, temp7.s0)));
25-
write_imagef(output, (i_3+1)*rows+j, as_float4((ushort8)(temp0.s1, temp1.s1, temp2.s1, temp3.s1, temp4.s1, temp5.s1, temp6.s1, temp7.s1)));
26-
write_imagef(output, (i_3+2)*rows+j, as_float4((ushort8)(temp0.s2, temp1.s2, temp2.s2, temp3.s2, temp4.s2, temp5.s2, temp6.s2, temp7.s2)));
27-
write_imagef(output, (i_3+3)*rows+j, as_float4((ushort8)(temp0.s3, temp1.s3, temp2.s3, temp3.s3, temp4.s3, temp5.s3, temp6.s3, temp7.s3)));
28-
write_imagef(output, (i_3+4)*rows+j, as_float4((ushort8)(temp0.s4, temp1.s4, temp2.s4, temp3.s4, temp4.s4, temp5.s4, temp6.s4, temp7.s4)));
29-
write_imagef(output, (i_3+5)*rows+j, as_float4((ushort8)(temp0.s5, temp1.s5, temp2.s5, temp3.s5, temp4.s5, temp5.s5, temp6.s5, temp7.s5)));
30-
write_imagef(output, (i_3+6)*rows+j, as_float4((ushort8)(temp0.s6, temp1.s6, temp2.s6, temp3.s6, temp4.s6, temp5.s6, temp6.s6, temp7.s6)));
31-
write_imagef(output, (i_3+7)*rows+j, as_float4((ushort8)(temp0.s7, temp1.s7, temp2.s7, temp3.s7, temp4.s7, temp5.s7, temp6.s7, temp7.s7)));
22+
write_imageh(output, (i_2+0)*rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
23+
write_imageh(output, (i_2+1)*rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
24+
write_imageh(output, (i_2+2)*rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
25+
write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
3226
}

0 commit comments

Comments
 (0)