
Commit 34fb141

[cl][adreno] Add Adreno GPU support
Add new OpenCL backend to support Adreno GPUs

---------

Co-authored-by: Skyler Szot <[email protected]>
Co-authored-by: Shangqing Gu <[email protected]>
Co-authored-by: Alexander Angus <[email protected]>
Co-authored-by: Hongqiang Wang <[email protected]>
Co-authored-by: Max Krasnyansky <[email protected]>
1 parent f112d19 commit 34fb141

21 files changed: +9331 −1 lines

ggml/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -172,6 +172,12 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
 set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
     "ggml: sycl device architecture")

+option(GGML_OPENCL                    "ggml: use OpenCL"                                 OFF)
+option(GGML_OPENCL_SMALL_ALLOC        "ggml: use small allocation for tensors"           ON)
+option(GGML_OPENCL_PROFILING          "ggml: use OpenCL profiling (increases overhead)"  OFF)
+option(GGML_OPENCL_EMBED_KERNELS      "ggml: embed kernels"                              ON)
+option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno"           ON)
+
 # extra artifacts
 option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
 option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})

ggml/include/ggml-alloc.h

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft_for_weights(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

 #ifdef __cplusplus

ggml/include/ggml-opencl2.h

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+// SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
+// SPDX-License-Identifier: MIT
+
+#ifndef GGML_OPENCL2_H
+#define GGML_OPENCL2_H
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CL_CHECK(err)                                               \
+    do {                                                            \
+        cl_int err_ = (err);                                        \
+        if (err_ != CL_SUCCESS) {                                   \
+            fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n",  \
+                #err, err_, __FILE__, __LINE__);                    \
+            GGML_ASSERT(0);                                         \
+        }                                                           \
+    } while (0)
+
+//
+// backend API
+//
+GGML_BACKEND_API ggml_backend_t ggml_backend_opencl2_init(void);
+GGML_BACKEND_API bool ggml_backend_is_opencl2(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl2_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl2_host_buffer_type(void);
+
+GGML_BACKEND_API ggml_backend_t ggml_backend_reg_opencl2_init(const char * params, void * user_data);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl2_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // GGML_OPENCL2_H
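
For orientation, here is a minimal usage sketch of the API declared above, following the standard ggml-backend flow. Only the ggml_backend_opencl2_* calls come from this commit; the graph construction, tensor sizes, and allocator usage are illustrative assumptions, not part of the change.

    // sketch: run a small matmul on the OpenCL2 backend
    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include "ggml-opencl2.h"

    int main(void) {
        ggml_backend_t backend = ggml_backend_opencl2_init();
        if (backend == NULL) {
            fprintf(stderr, "failed to initialize the OpenCL2 backend\n");
            return 1;
        }

        // tensors are allocated by the backend, so the context holds metadata only
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 32);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 16);
        struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, c);

        // allocate all graph tensors from the backend's default buffer type
        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
        ggml_gallocr_alloc_graph(alloc, gf);

        // ... fill a and b with ggml_backend_tensor_set(), then compute:
        ggml_backend_graph_compute(backend, gf);

        ggml_gallocr_free(alloc);
        ggml_free(ctx);
        ggml_backend_free(backend);
        return 0;
    }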

ggml/src/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
@@ -305,6 +305,19 @@ else ()
     ggml_add_cpu_backend_variant_impl("")
 endif()

+# TODO: This is intrusive. We intend to remove the SMALL_ALLOC path once we fully
+# migrate to the non-SMALL_ALLOC path. Also need to converge on the backend name
+# so we don't need this name conversion.
+if (GGML_OPENCL)
+    set(GGML_OPENCL2 ON)
+    add_compile_definitions(GGML_USE_OPENCL)
+    if (GGML_OPENCL_SMALL_ALLOC)
+        add_compile_definitions(GGML_OPENCL_SMALL_ALLOC)
+    endif ()
+else ()
+    set(GGML_OPENCL2 OFF)
+endif ()
+
 ggml_add_backend(BLAS)
 ggml_add_backend(CANN)
 ggml_add_backend(CUDA)
@@ -315,6 +328,8 @@ ggml_add_backend(MUSA)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
+ggml_add_backend(OpenCL2)
+ggml_add_backend(MUSA)

 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

ggml/src/ggml-alloc.c

Lines changed: 86 additions & 0 deletions
@@ -1033,6 +1033,92 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     return buffer;
 }

+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_for_weights(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+#ifndef GGML_OPENCL_SMALL_ALLOC
+    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+#else
+    // Small allocation allocates a separate buffer for each tensor. Instead of
+    // collecting multiple tensors to allocate a large buffer, each tensor is
+    // allocated a buffer immediately. This is only supposed to be used for
+    // weight tensors (note that weights can be f32).
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+
+    ggml_backend_buffer_t * buffers = NULL;
+    size_t n_buffers = 0;
+
+    struct ggml_tensor * first_view = NULL;
+    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size_t this_size = 0;
+        if (t->data == NULL && t->view_src == NULL) {
+            // Tensor size must be properly padded.
+            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+
+        // The allocation logic here has gone beyond the original intention in order
+        // to make `test-backend-ops` work. The initial intention was to allocate
+        // memory for weights - each weight tensor gets its own buffer object.
+        // The original function should be used to allocate for intermediate tensors.
+        // There are usually no view tensors for weights; this is not true for
+        // intermediate tensors. However, in `test-backend-ops` there is no
+        // differentiation between weight tensors and intermediate tensors.
+        // This function is used for general allocation when small allocation is
+        // enabled in the test. This requires the function to also handle view
+        // tensors, which do not require actual allocation. In the original function,
+        // view tensors are allocated with other non-view tensors since view tensor
+        // sizes are 0.
+        // Here, we try to identify view tensors and allocate them with the next
+        // non-view tensor. View tensors cannot be allocated (alone) but must be
+        // initialized (together with non-view tensors).
+
+        // This is a view tensor if its size is 0. Record its location if it is the
+        // first one after a non-view tensor. If the next tensor is still a view,
+        // simply go to the next. We want to allocate all consecutive view tensors
+        // together with the next non-view tensor.
+        if (this_size == 0 && first_view == NULL) {
+            first_view = t;
+            continue;
+        }
+
+        if (first_view) {
+            // This is a non-view tensor. If there are any view tensors before
+            // this non-view tensor, we want to allocate these view tensors and
+            // this non-view tensor together.
+            // The first tensor to allocate is the first view tensor.
+            first = first_view;
+        } else {
+            // Otherwise, allocate this non-view tensor immediately.
+            first = t;
+        }
+
+        if (!alloc_tensor_range(ctx, first, ggml_get_next_tensor(ctx, t), buft, this_size, &buffers, &n_buffers)) {
+            return NULL;
+        }
+
+        // Always reset first_view after a non-view tensor.
+        first_view = NULL;
+    }
+
+    if (n_buffers == 0) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer;
+    if (n_buffers == 1) {
+        buffer = buffers[0];
+    } else {
+        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+    }
+    free(buffers);
+    return buffer;
+#endif
+}
+
 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
     return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
 }
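
As a rough illustration of the intended call site (weight loading), the sketch below assumes a no_alloc context that holds only weight tensors. Only ggml_backend_alloc_ctx_tensors_from_buft_for_weights and the OpenCL2 buffer type come from this commit; the helper name, n_weights, and the elided tensor creation are placeholders.

    // hypothetical weight-loading helper; n_weights and the tensor creation are placeholders
    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include "ggml-opencl2.h"

    static ggml_backend_buffer_t alloc_weights(int n_weights) {
        struct ggml_init_params wparams = {
            /*.mem_size   =*/ ggml_tensor_overhead()*(size_t)n_weights,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // required: the function asserts ggml_get_no_alloc(ctx) == true
        };
        struct ggml_context * ctx_w = ggml_init(wparams);

        // ... create the weight tensors in ctx_w (metadata only, no data yet) ...

        // With GGML_OPENCL_SMALL_ALLOC defined this returns a multi-buffer wrapping one
        // backend buffer per weight tensor; otherwise it falls back to the original
        // single-buffer ggml_backend_alloc_ctx_tensors_from_buft() path.
        ggml_backend_buffer_t buf_w = ggml_backend_alloc_ctx_tensors_from_buft_for_weights(
            ctx_w, ggml_backend_opencl2_buffer_type());
        if (buf_w == NULL) {
            fprintf(stderr, "weight allocation failed or nothing to allocate\n");
        }
        return buf_w;
    }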

ggml/src/ggml-backend-reg.cpp

Lines changed: 7 additions & 0 deletions
@@ -46,6 +46,10 @@
 #include "ggml-vulkan.h"
 #endif

+#ifdef GGML_USE_OPENCL2
+#include "ggml-opencl2.h"
+#endif
+
 #ifdef GGML_USE_BLAS
 #include "ggml-blas.h"
 #endif
@@ -146,6 +150,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_VULKAN
         register_backend(ggml_backend_vk_reg());
 #endif
+#ifdef GGML_USE_OPENCL2
+        register_backend(ggml_backend_opencl2_reg());
+#endif
 #ifdef GGML_USE_CANN
         register_backend(ggml_backend_cann_reg());
 #endif
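
To confirm that the registration above took effect, one could enumerate the registry through the public ggml-backend API. This snippet is a generic sketch, not part of the commit, and assumes the registry enumeration functions declared in ggml-backend.h:

    // sketch: list registered backends; "OpenCL2" should appear when GGML_USE_OPENCL2 is defined
    #include <stdio.h>
    #include "ggml-backend.h"

    int main(void) {
        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
            ggml_backend_reg_t reg = ggml_backend_reg_get(i);
            printf("backend %zu: %s\n", i, ggml_backend_reg_name(reg));
        }
        return 0;
    }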
ggml/src/ggml-opencl2/CMakeLists.txt

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
+
+find_package(OpenCL)
+
+if (OpenCL_FOUND)
+    find_package(Python3 REQUIRED)
+
+    set(TARGET_NAME ggml-opencl2)
+
+    add_library(${TARGET_NAME}
+                ggml-opencl2.cpp
+                ../../include/ggml-opencl2.h)
+    target_link_libraries(${TARGET_NAME} PRIVATE ggml-base ${OpenCL_LIBRARIES})
+    target_include_directories(${TARGET_NAME} PRIVATE . .. ${OpenCL_INCLUDE_DIRS})
+
+    # TODO - this is kind of strange. We have been calling this backend OpenCL2,
+    # so everything (function names, folder name, etc.) except macro switches
+    # has been OpenCL2. Now, the backend framework enforces the use of the folder
+    # name as the backend name and switch. So, GGML_USE_OPENCL2 is used in
+    # ggml-backend-reg.cpp, but the rest still uses GGML_USE_OPENCL.
+    add_compile_definitions(GGML_USE_OPENCL)
+
+    if (GGML_OPENCL_PROFILING)
+        message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
+        add_compile_definitions(GGML_OPENCL_PROFILING)
+    endif ()
+
+    add_compile_definitions(GGML_OPENCL_SOA_Q)
+
+    if (GGML_OPENCL_SMALL_ALLOC)
+        message(STATUS "OpenCL will allocate a separate buffer for each tensor. "
+                       "The default behavior allocates a large buffer to hold multiple tensors.")
+        add_compile_definitions(GGML_OPENCL_SMALL_ALLOC)
+    endif ()
+
+    if (GGML_OPENCL_USE_ADRENO_KERNELS)
+        message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
+        add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS)
+    endif ()
+
+    if (GGML_OPENCL_EMBED_KERNELS)
+        add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)
+
+        set(OPENCL2_CL_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2.cl.h")
+        set(OPENCL2_MM_CL_SOURCE_EMBED  "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_mm.cl.h")
+        set(OPENCL2_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_cvt.cl.h")
+
+        set(OPENCL2_GEMV_NOSHUFFLE_SOURCE_EMBED         "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_gemv_noshuffle.cl.h")
+        set(OPENCL2_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_gemv_noshuffle_general.cl.h")
+        set(OPENCL2_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED      "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_mul_mat_Ab_Bi_8x4.cl.h")
+        set(OPENCL2_TRANSPOSE_16_SOURCE_EMBED           "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_transpose_16.cl.h")
+        set(OPENCL2_TRANSPOSE_32_SOURCE_EMBED           "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_transpose_32.cl.h")
+        set(OPENCL2_TRANSPOSE_32_16_SOURCE_EMBED        "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_transpose_32_16.cl.h")
+
+        set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
+        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
+
+        include_directories("${CMAKE_BINARY_DIR}/autogenerated")
+
+        # Python must be accessible from command line
+        add_custom_command(
+            OUTPUT ${OPENCL2_CL_SOURCE_EMBED}
+            COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2.cl
+                    ${OPENCL2_CL_SOURCE_EMBED}
+            DEPENDS kernels/ggml-opencl2.cl ${EMBED_KERNEL_SCRIPT}
+            COMMENT "Generate ggml-opencl2.cl.h"
+        )
+
+        add_custom_command(
+            OUTPUT ${OPENCL2_MM_CL_SOURCE_EMBED}
+            COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_mm.cl
+                    ${OPENCL2_MM_CL_SOURCE_EMBED}
+            DEPENDS kernels/ggml-opencl2_mm.cl ${EMBED_KERNEL_SCRIPT}
+            COMMENT "Generate ggml-opencl2_mm.cl.h"
+        )
+
+        add_custom_command(
+            OUTPUT ${OPENCL2_CVT_CL_SOURCE_EMBED}
+            COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_cvt.cl
+                    ${OPENCL2_CVT_CL_SOURCE_EMBED}
+            DEPENDS kernels/ggml-opencl2_cvt.cl ${EMBED_KERNEL_SCRIPT}
+            COMMENT "Generate ggml-opencl2_cvt.cl.h"
+        )
+
+        add_custom_command(
+            OUTPUT ${OPENCL2_GEMV_NOSHUFFLE_SOURCE_EMBED}
+            COMMAND python ${EMBED_KERNEL_SCRIPT}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_gemv_noshuffle.cl
+                    ${OPENCL2_GEMV_NOSHUFFLE_SOURCE_EMBED}
+            DEPENDS kernels/ggml-opencl2_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
+            COMMENT "Generate ggml-opencl2_gemv_noshuffle.cl.h"
+        )
+
+        add_custom_command(
+            OUTPUT ${OPENCL2_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+            COMMAND python ${EMBED_KERNEL_SCRIPT}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_gemv_noshuffle_general.cl
+                    ${OPENCL2_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+            DEPENDS kernels/ggml-opencl2_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
+            COMMENT "Generate ggml-opencl2_gemv_noshuffle_general.cl.h"
+        )
+
+        add_custom_command(
+            OUTPUT ${OPENCL2_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+            COMMAND python ${EMBED_KERNEL_SCRIPT}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_mul_mat_Ab_Bi_8x4.cl
+                    ${OPENCL2_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+            DEPENDS kernels/ggml-opencl2_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
+            COMMENT "Generate ggml-opencl2_mul_mat_Ab_Bi_8x4.cl.h"
+        )
+
+        add_custom_command(
+            OUTPUT ${OPENCL2_TRANSPOSE_16_SOURCE_EMBED}
+            COMMAND python ${EMBED_KERNEL_SCRIPT}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_transpose_16.cl
+                    ${OPENCL2_TRANSPOSE_16_SOURCE_EMBED}
+            DEPENDS kernels/ggml-opencl2_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
+            COMMENT "Generate ggml-opencl2_transpose_16.cl.h"
+        )
+
+        add_custom_command(
+            OUTPUT ${OPENCL2_TRANSPOSE_32_SOURCE_EMBED}
+            COMMAND python ${EMBED_KERNEL_SCRIPT}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_transpose_32.cl
+                    ${OPENCL2_TRANSPOSE_32_SOURCE_EMBED}
+            DEPENDS kernels/ggml-opencl2_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
+            COMMENT "Generate ggml-opencl2_transpose_32.cl.h"
+        )
+
+        add_custom_command(
+            OUTPUT ${OPENCL2_TRANSPOSE_32_16_SOURCE_EMBED}
+            COMMAND python ${EMBED_KERNEL_SCRIPT}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_transpose_32_16.cl
+                    ${OPENCL2_TRANSPOSE_32_16_SOURCE_EMBED}
+            DEPENDS kernels/ggml-opencl2_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
+            COMMENT "Generate ggml-opencl2_transpose_32_16.cl.h"
+        )
+
+        target_sources(${TARGET_NAME} PRIVATE
+                       ${OPENCL2_CL_SOURCE_EMBED}
+                       ${OPENCL2_MM_CL_SOURCE_EMBED}
+                       ${OPENCL2_CVT_CL_SOURCE_EMBED}
+                       ${OPENCL2_GEMV_NOSHUFFLE_SOURCE_EMBED}
+                       ${OPENCL2_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+                       ${OPENCL2_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+                       ${OPENCL2_TRANSPOSE_16_SOURCE_EMBED}
+                       ${OPENCL2_TRANSPOSE_32_SOURCE_EMBED}
+                       ${OPENCL2_TRANSPOSE_32_16_SOURCE_EMBED})
+    else ()
+        # copy the .cl kernel files to the bin directory
+        configure_file(kernels/ggml-opencl2.cl     ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2.cl     COPYONLY)
+        configure_file(kernels/ggml-opencl2_mm.cl  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2_mm.cl  COPYONLY)
+        configure_file(kernels/ggml-opencl2_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2_cvt.cl COPYONLY)
+    endif ()
+else ()
+    message(WARNING "OpenCL not found")
+endif ()
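
For context on GGML_OPENCL_EMBED_KERNELS: the custom commands above turn each .cl file into a generated header under autogenerated/ so the backend can compile the kernel source at runtime without reading it from disk; when the option is off, the else branch instead copies the .cl files next to the binaries. The fragment below is a generic illustration of how such a kernel string is typically turned into a cl_program, not code from this commit; the function name and its parameters are assumptions.

    // generic illustration: build a cl_program from an (embedded or on-disk) kernel string
    #include <stdio.h>
    #include <stdlib.h>
    #include <CL/cl.h>

    static cl_program build_program_from_source(cl_context ctx, cl_device_id dev,
                                                const char * src, const char * opts) {
        cl_int err;
        cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
        if (err != CL_SUCCESS) {
            fprintf(stderr, "clCreateProgramWithSource failed: %d\n", err);
            exit(1);
        }
        err = clBuildProgram(prog, 1, &dev, opts, NULL, NULL);
        if (err != CL_SUCCESS) {
            fprintf(stderr, "clBuildProgram failed: %d\n", err);
            exit(1);
        }
        return prog;
    }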
