Commit 16e2544

opencl: add argsort

1 parent 17fc817

3 files changed (+168, -0)

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@ endfunction()
 
 set(GGML_OPENCL_KERNELS
     add
+    argsort
     clamp
     cpy
     cvt

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 81 additions & 0 deletions
@@ -309,6 +309,7 @@ struct ggml_backend_opencl_context {
     cl_program program_softmax_f16;
     cl_program program_softmax_4_f32;
     cl_program program_softmax_4_f16;
+    cl_program program_argsort_f32_i32;
 
     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;
@@ -339,6 +340,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
+    cl_kernel kernel_argsort_f32_i32;
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Transpose kernels
@@ -986,6 +988,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // argsort
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "argsort.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("argsort.cl");
+#endif
+        backend_ctx->program_argsort_f32_i32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
@@ -1912,6 +1930,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         }
         case GGML_OP_IM2COL:
             return true;
+        case GGML_OP_ARGSORT:
+            return op->src[0]->type == GGML_TYPE_F32;
         default:
             return false;
     }
@@ -4975,6 +4995,61 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
 #endif
 }
 
+static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int nrows = ggml_nrows(src0);
+
+    int ne00_padded = 1;
+    while (ne00_padded < ne00) {
+        ne00_padded *= 2;
+    }
+
+    int order = (enum ggml_sort_order) dst->op_params[0];
+
+    cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order));
+    CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL));
+
+    size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
+    size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
 //------------------------------------------------------------------------------
 // Op offloading
 //------------------------------------------------------------------------------
@@ -5115,6 +5190,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_im2col;
             break;
+        case GGML_OP_ARGSORT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_argsort;
+            break;
         default:
             return false;
     }
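For reference, here is a minimal host-side sketch (not part of this commit) of how the new GGML_OP_ARGSORT path is exercised end to end: build a graph containing ggml_argsort, allocate the tensors on the OpenCL backend, and compute. It assumes ggml_backend_opencl_init() from ggml-opencl.h and the standard ggml-backend helpers; error handling is omitted, so treat it as an illustration rather than a test from the tree.

// Illustrative only: routes one argsort through this backend.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-opencl.h"

int main(void) {
    ggml_backend_t backend = ggml_backend_opencl_init();

    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,   // tensor data lives in a backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 5);
    struct ggml_tensor * idx = ggml_argsort(ctx, x, GGML_SORT_ORDER_ASC); // order lands in dst->op_params[0]

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, idx);

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    const float data[5] = { 3.0f, 1.0f, 4.0f, 1.5f, 2.0f };
    ggml_backend_tensor_set(x, data, 0, sizeof(data));

    ggml_backend_graph_compute(backend, gf); // supports_op() -> ggml_cl_argsort() -> kernel_argsort_f32_i32

    int32_t perm[5];
    ggml_backend_tensor_get(idx, perm, 0, sizeof(perm)); // expected permutation: 1 3 4 0 2

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}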
ggml/src/ggml-opencl/kernels/argsort.cl

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
+
+enum ggml_sort_order {
+    GGML_SORT_ORDER_ASC,
+    GGML_SORT_ORDER_DESC,
+};
+
+kernel void kernel_argsort_f32_i32(
+    global float * src0,
+    ulong offset0,
+    global int * dst,
+    ulong offsetd,
+    const int ne00,
+    const int ne00_pad,
+    const int order,
+    local int * dst_row
+) {
+    // bitonic sort
+    int col = get_local_id(0);
+    int row = get_group_id(1);
+
+    if (col >= ne00_pad) {
+        return;
+    }
+
+    src0 = (global float *)((global char *)src0 + offset0);
+    dst  = (global int   *)((global char *)dst  + offsetd);
+
+    global float * x_row = src0 + row * ne00;
+
+    // initialize indices
+    dst_row[col] = col;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int k = 2; k <= ne00_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ne00 ||
+                        (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj], int);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ne00 ||
+                        (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj], int);
+                    }
+                }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ne00) {
+        dst[row * ne00 + col] = dst_row[col];
+    }
+}
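The host enqueues this kernel with local size ne00_pad and global size ne00_pad x nrows, so each row is sorted by one work-group with its index array in local memory, and get_group_id(1) selects the row. The following plain-C sketch (not from the commit; the name bitonic_argsort_asc is made up for illustration) emulates the same compare-and-swap network serially for the ascending case, including the treatment of padded indices (>= ne00) as +inf so they sink to the end.

// Host-side reference for the kernel's bitonic argsort (ascending case).
// idx has n_pad entries, where n_pad is the next power of two >= n;
// only the first n entries of the result are meaningful, as in the kernel.
#include <stdio.h>

static void bitonic_argsort_asc(const float * x, int * idx, int n, int n_pad) {
    for (int i = 0; i < n_pad; ++i) {
        idx[i] = i;                                  // dst_row[col] = col
    }
    for (int k = 2; k <= n_pad; k *= 2) {            // bitonic stages
        for (int j = k / 2; j > 0; j /= 2) {         // merge steps (a barrier separates them on the GPU)
            for (int col = 0; col < n_pad; ++col) {  // one iteration per work-item
                int ixj = col ^ j;
                if (ixj <= col) {
                    continue;
                }
                int a = idx[col];
                int b = idx[ixj];
                int ascending = (col & k) == 0;      // direction of this subsequence
                // indices >= n are padding and compare as +inf
                int swap = ascending ? (a >= n || (b < n && x[a] > x[b]))
                                     : (b >= n || (a < n && x[a] < x[b]));
                if (swap) {
                    idx[col] = b;
                    idx[ixj] = a;
                }
            }
        }
    }
}

int main(void) {
    const float x[5] = { 3.0f, 1.0f, 4.0f, 1.5f, 2.0f };
    int idx[8];                                      // n = 5 -> n_pad = 8
    bitonic_argsort_asc(x, idx, 5, 8);
    for (int i = 0; i < 5; ++i) {
        printf("%d ", idx[i]);                       // prints: 1 3 4 0 2
    }
    printf("\n");
    return 0;
}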
