Skip to content

Commit 418606e

Browse files
committed
add mul_mat_f16_f32_image kernel
1 parent 576c82e commit 418606e

File tree

5 files changed

+299
-0
lines changed

5 files changed

+299
-0
lines changed

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,9 @@ set(GGML_OPENCL_KERNELS
105105
pad
106106
repeat
107107
mul_mat_f16_f32
108+
mul_mat_f16_f32_image
109+
pack_a_for_image
110+
pack_b_for_image
108111
)
109112

110113
foreach (K ${GGML_OPENCL_KERNELS})

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,8 @@ struct ggml_backend_opencl_context {
331331

332332
cl_int alignment;
333333
size_t max_alloc_size;
334+
size_t max_image_width;
335+
size_t max_image_height;
334336
bool fp16_support;
335337
bool has_vector_subgroup_broadcast;
336338
ggml_cl_compiler_version adreno_cl_compiler_version;
@@ -369,6 +371,10 @@ struct ggml_backend_opencl_context {
369371
cl_program program_mul_mv_f32_f32;
370372
cl_program program_mul;
371373
cl_program program_mul_mat_f16_f32_tiled;
374+
cl_program program_mul_mat_f16_f32_image;
375+
cl_program program_pack_a_for_image;
376+
cl_program program_pack_b_for_image;
377+
cl_ulong global_mem_size;
372378
cl_program program_div;
373379
cl_program program_sub;
374380
cl_program program_norm;
@@ -424,6 +430,9 @@ struct ggml_backend_opencl_context {
424430
cl_kernel kernel_mul_mat_f16_f32;
425431
cl_kernel kernel_mul_mat_f16_f32_l4;
426432
cl_kernel kernel_mul_mat_f16_f32_tiled;
433+
cl_kernel kernel_mul_mat_f16_f32_image;
434+
cl_kernel kernel_pack_a_for_image;
435+
cl_kernel kernel_pack_b_for_image;
427436
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
428437
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
429438
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
@@ -1033,6 +1042,54 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
10331042
GGML_LOG_CONT(".");
10341043
}
10351044

1045+
// mul_mat_f16_f32_image
1046+
{
1047+
#ifdef GGML_OPENCL_EMBED_KERNELS
1048+
const std::string kernel_src{
1049+
#include "mul_mat_f16_f32_image.cl.h"
1050+
};
1051+
#else
1052+
const std::string kernel_src = read_file("mul_mat_f16_f32_image.cl");
1053+
#endif
1054+
backend_ctx->program_mul_mat_f16_f32_image =
1055+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1056+
1057+
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_image = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_image, "mul_mat_f16_f32_image", &err), err));
1058+
GGML_LOG_CONT(".");
1059+
}
1060+
1061+
// pack_a_for_image
1062+
{
1063+
#ifdef GGML_OPENCL_EMBED_KERNELS
1064+
const std::string kernel_src{
1065+
#include "pack_a_for_image.cl.h"
1066+
};
1067+
#else
1068+
const std::string kernel_src = read_file("pack_a_for_image.cl");
1069+
#endif
1070+
backend_ctx->program_pack_a_for_image =
1071+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1072+
1073+
CL_CHECK((backend_ctx->kernel_pack_a_for_image = clCreateKernel(backend_ctx->program_pack_a_for_image, "pack_a_for_image", &err), err));
1074+
GGML_LOG_CONT(".");
1075+
}
1076+
1077+
// pack_b_for_image
1078+
{
1079+
#ifdef GGML_OPENCL_EMBED_KERNELS
1080+
const std::string kernel_src{
1081+
#include "pack_b_for_image.cl.h"
1082+
};
1083+
#else
1084+
const std::string kernel_src = read_file("pack_b_for_image.cl");
1085+
#endif
1086+
backend_ctx->program_pack_b_for_image =
1087+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1088+
1089+
CL_CHECK((backend_ctx->kernel_pack_b_for_image = clCreateKernel(backend_ctx->program_pack_b_for_image, "pack_b_for_image", &err), err));
1090+
GGML_LOG_CONT(".");
1091+
}
1092+
10361093
// mul
10371094
{
10381095
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1987,6 +2044,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
19872044
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
19882045
GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
19892046

2047+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &backend_ctx->global_mem_size, NULL));
2048+
2049+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->max_image_width, NULL));
2050+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->max_image_height, NULL));
2051+
19902052
// Check SVM.
19912053
cl_device_svm_capabilities svm_caps;
19922054
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
@@ -4997,6 +5059,93 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten
49975059
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
49985060
}
49995061

5062+
static void ggml_cl_mul_mat_f16_f32_image(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5063+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5064+
cl_context context = backend_ctx->context;
5065+
cl_command_queue queue = backend_ctx->queue;
5066+
cl_int err = 0;
5067+
5068+
const int M = src0->ne[1];
5069+
const int N = src1->ne[1];
5070+
const int K = src0->ne[0];
5071+
const int K_4 = (K + 3) / 4;
5072+
const int N_4 = (N + 3) / 4;
5073+
5074+
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5075+
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5076+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5077+
5078+
cl_ulong offset0 = extra0->offset + src0->view_offs;
5079+
cl_ulong offset1 = extra1->offset + src1->view_offs;
5080+
cl_ulong offsetd = extrad->offset + dst->view_offs;
5081+
5082+
cl_mem a_image = NULL, b_image = NULL;
5083+
cl_event pack_events[2];
5084+
cl_event matmul_event;
5085+
5086+
// Create image for A
5087+
cl_image_format format_A = {CL_RGBA, CL_HALF_FLOAT};
5088+
cl_image_desc desc_A = {};
5089+
desc_A.image_type = CL_MEM_OBJECT_IMAGE2D;
5090+
desc_A.image_width = K_4;
5091+
desc_A.image_height = M;
5092+
a_image = clCreateImage(context, CL_MEM_READ_WRITE, &format_A, &desc_A, NULL, &err);
5093+
CL_CHECK(err);
5094+
5095+
// Create image for B
5096+
cl_image_format format_B = {CL_RGBA, CL_HALF_FLOAT};
5097+
cl_image_desc desc_B = {};
5098+
desc_B.image_type = CL_MEM_OBJECT_IMAGE2D;
5099+
desc_B.image_width = N_4;
5100+
desc_B.image_height = K;
5101+
b_image = clCreateImage(context, CL_MEM_READ_WRITE, &format_B, &desc_B, NULL, &err);
5102+
CL_CHECK(err);
5103+
5104+
// Launch packing kernel for A
5105+
cl_kernel pack_a_kernel = backend_ctx->kernel_pack_a_for_image;
5106+
CL_CHECK(clSetKernelArg(pack_a_kernel, 0, sizeof(cl_mem), &extra0->data_device));
5107+
CL_CHECK(clSetKernelArg(pack_a_kernel, 1, sizeof(cl_ulong), &offset0));
5108+
CL_CHECK(clSetKernelArg(pack_a_kernel, 2, sizeof(cl_mem), &a_image));
5109+
CL_CHECK(clSetKernelArg(pack_a_kernel, 3, sizeof(int), &M));
5110+
CL_CHECK(clSetKernelArg(pack_a_kernel, 4, sizeof(int), &K));
5111+
const size_t pack_a_gws[2] = { (size_t)K_4, (size_t)M };
5112+
CL_CHECK(clEnqueueNDRangeKernel(queue, pack_a_kernel, 2, NULL, pack_a_gws, NULL, 0, NULL, &pack_events[0]));
5113+
5114+
// Launch packing kernel for B
5115+
cl_kernel pack_b_kernel = backend_ctx->kernel_pack_b_for_image;
5116+
CL_CHECK(clSetKernelArg(pack_b_kernel, 0, sizeof(cl_mem), &extra1->data_device));
5117+
CL_CHECK(clSetKernelArg(pack_b_kernel, 1, sizeof(cl_ulong), &offset1));
5118+
CL_CHECK(clSetKernelArg(pack_b_kernel, 2, sizeof(cl_mem), &b_image));
5119+
CL_CHECK(clSetKernelArg(pack_b_kernel, 3, sizeof(int), &K));
5120+
CL_CHECK(clSetKernelArg(pack_b_kernel, 4, sizeof(int), &N));
5121+
const size_t pack_b_gws[2] = { (size_t)N_4, (size_t)K };
5122+
CL_CHECK(clEnqueueNDRangeKernel(queue, pack_b_kernel, 2, NULL, pack_b_gws, NULL, 0, NULL, &pack_events[1]));
5123+
5124+
// Launch matmul kernel
5125+
cl_kernel matmul_kernel = backend_ctx->kernel_mul_mat_f16_f32_image;
5126+
CL_CHECK(clSetKernelArg(matmul_kernel, 0, sizeof(cl_mem), &a_image));
5127+
CL_CHECK(clSetKernelArg(matmul_kernel, 1, sizeof(cl_mem), &b_image));
5128+
CL_CHECK(clSetKernelArg(matmul_kernel, 2, sizeof(cl_mem), &extrad->data_device));
5129+
CL_CHECK(clSetKernelArg(matmul_kernel, 3, sizeof(cl_ulong), &offsetd));
5130+
CL_CHECK(clSetKernelArg(matmul_kernel, 4, sizeof(int), &M));
5131+
CL_CHECK(clSetKernelArg(matmul_kernel, 5, sizeof(int), &N));
5132+
CL_CHECK(clSetKernelArg(matmul_kernel, 6, sizeof(int), &K));
5133+
5134+
const int OPWM = 64;
5135+
const int OPWN = 64;
5136+
const size_t lws[2] = { 16, 8 }; // WG_M, WG_N
5137+
const size_t gws[2] = { (size_t)ceil((float)M / OPWM) * lws[0], (size_t)ceil((float)N / OPWN) * lws[1] };
5138+
CL_CHECK(clEnqueueNDRangeKernel(queue, matmul_kernel, 2, NULL, gws, lws, 2, pack_events, &matmul_event));
5139+
5140+
// Wait for matmul to finish and release resources
5141+
CL_CHECK(clWaitForEvents(1, &matmul_event));
5142+
CL_CHECK(clReleaseEvent(pack_events[0]));
5143+
CL_CHECK(clReleaseEvent(pack_events[1]));
5144+
CL_CHECK(clReleaseEvent(matmul_event));
5145+
CL_CHECK(clReleaseMemObject(a_image));
5146+
CL_CHECK(clReleaseMemObject(b_image));
5147+
}
5148+
50005149
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
50015150
GGML_ASSERT(src0);
50025151
GGML_ASSERT(src0->extra);
@@ -5010,6 +5159,35 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
50105159

50115160
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
50125161

5162+
if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
5163+
backend_ctx->gpu_family == ADRENO && backend_ctx->kernel_mul_mat_f16_f32_image != NULL &&
5164+
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
5165+
src0->ne[2] == 1 && src0->ne[3] == 1 &&
5166+
src1->ne[2] == 1 && src1->ne[3] == 1) {
5167+
5168+
const int M = src0->ne[1];
5169+
const int N = src1->ne[1];
5170+
const int K = src0->ne[0];
5171+
5172+
// Performance thresholds: only use for reasonably large matrices
5173+
// where the GPU speedup can outweigh the CPU-side transpose/packing overhead.
5174+
if (M > 32 && N > 32 && K > 32) {
5175+
const size_t n_padded_4 = (size_t)((N + 3) / 4);
5176+
const size_t temp_a_size = (size_t)M * K * sizeof(ggml_fp16_t);
5177+
const size_t temp_b_size = n_padded_4 * K * 4 * sizeof(ggml_fp16_t); // RGBA
5178+
const size_t total_temp_image_size = temp_a_size + temp_b_size;
5179+
5180+
// Safety checks for memory and device limits
5181+
if ((size_t)K <= backend_ctx->max_image_width &&
5182+
(size_t)M <= backend_ctx->max_image_height &&
5183+
n_padded_4 <= backend_ctx->max_image_height &&
5184+
total_temp_image_size < (backend_ctx->global_mem_size / 4)) { // Ensure temp images use < 25% of total VRAM
5185+
ggml_cl_mul_mat_f16_f32_image(backend, src0, src1, dst);
5186+
return;
5187+
}
5188+
}
5189+
}
5190+
50135191
if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
50145192
src0->ne[1] > 32 && // M > 32
50155193
src1->ne[1] > 32 && // N > 32
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2+
3+
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
4+
5+
__kernel void mul_mat_f16_f32_image(
6+
__read_only image2d_t A_img,
7+
__read_only image2d_t B_img,
8+
__global float* C_buf,
9+
const ulong c_offset,
10+
const int M,
11+
const int N,
12+
const int K
13+
) {
14+
const int n_4_idx = get_global_id(0);
15+
const int m_idx = get_global_id(1);
16+
17+
const int n_base = n_4_idx << 2;
18+
19+
if (n_base >= N || m_idx >= M) {
20+
return;
21+
}
22+
23+
float4 c_vals = (float4)(0.0f);
24+
const int K_4 = (K + 3) / 4;
25+
26+
for (int k_4_idx = 0; k_4_idx < K_4; ++k_4_idx) {
27+
int k_base = k_4_idx << 2;
28+
29+
float4 a_vals = convert_float4(read_imageh(A_img, SAMPLER, (int2)(k_4_idx, m_idx)));
30+
31+
if (k_base < K) {
32+
float4 b0 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 0)));
33+
c_vals = mad(a_vals.x, b0, c_vals);
34+
}
35+
if (k_base + 1 < K) {
36+
float4 b1 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 1)));
37+
c_vals = mad(a_vals.y, b1, c_vals);
38+
}
39+
if (k_base + 2 < K) {
40+
float4 b2 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 2)));
41+
c_vals = mad(a_vals.z, b2, c_vals);
42+
}
43+
if (k_base + 3 < K) {
44+
float4 b3 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 3)));
45+
c_vals = mad(a_vals.w, b3, c_vals);
46+
}
47+
}
48+
49+
__global float* C = (__global float*)((__global char*)C_buf + c_offset);
50+
51+
if (n_base + 3 < N) {
52+
C[(n_base + 0) * M + m_idx] = c_vals.x;
53+
C[(n_base + 1) * M + m_idx] = c_vals.y;
54+
C[(n_base + 2) * M + m_idx] = c_vals.z;
55+
C[(n_base + 3) * M + m_idx] = c_vals.w;
56+
} else {
57+
if (n_base < N) C[n_base * M + m_idx] = c_vals.x;
58+
if (n_base + 1 < N) C[(n_base + 1) * M + m_idx] = c_vals.y;
59+
if (n_base + 2 < N) C[(n_base + 2) * M + m_idx] = c_vals.z;
60+
}
61+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2+
3+
__kernel void pack_a_for_image(
4+
__global const half* src_a,
5+
const ulong a_offset,
6+
__write_only image2d_t dest_img,
7+
const int M,
8+
const int K
9+
) {
10+
const int k_4_idx = get_global_id(0);
11+
const int m_idx = get_global_id(1);
12+
13+
const int k_base = k_4_idx << 2;
14+
15+
if (k_base >= K || m_idx >= M) {
16+
return;
17+
}
18+
19+
__global const half* a_ptr = (__global const half*)((__global const char*)src_a + a_offset);
20+
const int a_idx_base = m_idx * K + k_base;
21+
22+
half4 vals;
23+
vals.x = a_ptr[a_idx_base];
24+
vals.y = (k_base + 1 < K) ? a_ptr[a_idx_base + 1] : (half)0.0h;
25+
vals.z = (k_base + 2 < K) ? a_ptr[a_idx_base + 2] : (half)0.0h;
26+
vals.w = (k_base + 3 < K) ? a_ptr[a_idx_base + 3] : (half)0.0h;
27+
28+
write_imageh(dest_img, (int2)(k_4_idx, m_idx), vals);
29+
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2+
3+
__kernel void pack_b_for_image(
4+
__global const float* src_b,
5+
const ulong b_offset,
6+
__write_only image2d_t dest_img,
7+
const int K,
8+
const int N
9+
) {
10+
const int n_4_idx = get_global_id(0);
11+
const int k_idx = get_global_id(1);
12+
13+
const int n_base = n_4_idx << 2;
14+
15+
if (n_base >= N || k_idx >= K) {
16+
return;
17+
}
18+
19+
__global const float* b_ptr = (__global const float*)((__global const char*)src_b + b_offset);
20+
21+
half4 vals;
22+
vals.x = convert_half(b_ptr[n_base * K + k_idx]);
23+
vals.y = (n_base + 1 < N) ? convert_half(b_ptr[(n_base + 1) * K + k_idx]) : (half)0.0h;
24+
vals.z = (n_base + 2 < N) ? convert_half(b_ptr[(n_base + 2) * K + k_idx]) : (half)0.0h;
25+
vals.w = (n_base + 3 < N) ? convert_half(b_ptr[(n_base + 3) * K + k_idx]) : (half)0.0h;
26+
27+
write_imageh(dest_img, (int2)(n_4_idx, k_idx), vals);
28+
}

0 commit comments

Comments
 (0)