Skip to content

Commit cc8c5d4

Browse files
author
zhouwg
committed
ggml-dsp: split ggml-dsp.c into multiple files and cleanup
1 parent 5a17dc7 commit cc8c5d4

File tree

12 files changed

+602
-540
lines changed

12 files changed

+602
-540
lines changed

ggml/src/ggml-hexagon/CMakeLists.txt

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -49,17 +49,10 @@ endif()
4949
set(OPT_FLAG " ")
5050
if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79")
5151
# works fine on Snapdragon 8 Gen 3 and 8 Elite: 1.5x - 3x performance gains with the default ggml backend
52-
set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -fno-finite-math-only -ffp-model=fast ")
52+
set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only")
5353
endif()
5454
message("OPT_FLAG:${OPT_FLAG}")
5555

56-
#cross compiling for hexagon kernels on cDSP side
57-
set(HEXAGON_CC "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang")
58-
set(HEXAGON_CXX "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang")
59-
set(HEXAGON_TARGET libggmlop_skel${HTP_ARCH_VERSION}.so)
60-
set(HEXAGON_KERNELS_PATH "${CMAKE_CURRENT_LIST_DIR}/kernels")
61-
set(HEXAGON_COMPUTE "compute${HTP_ARCH_VERSION}")
62-
6356
if(CMAKE_SYSTEM_NAME STREQUAL "Android")
6457
find_library(LOG_LIB log)
6558

@@ -91,13 +84,12 @@ else()
9184
message(FATAL_ERROR "ggml-hexagon now only available on Android and Windows(Windows on ARM)")
9285
endif()
9386

94-
9587
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
9688
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
9789
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
9890
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
9991

100-
file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/ggmlop_ap_skel.c")
92+
file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/stub.c")
10193
ggml_add_backend_library(ggml-hexagon ${HEXAGON_SOURCES})
10294

10395
target_include_directories(ggml-hexagon PRIVATE ${QNN_SDK_PATH}/include/QNN ${HEXAGON_SDK_PATH} ${CMAKE_CURRENT_LIST_DIR})
@@ -106,18 +98,18 @@ target_link_libraries(ggml-hexagon PRIVATE ${QNN_LINK_LIBRARIES})
10698
string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}")
10799
target_compile_definitions(ggml-hexagon PRIVATE QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/")
108100

101+
# cross-compile the hexagon kernels that run on the cDSP side
109102
# ggml_hexagon_build_kernel(KNAME)
#   Attaches a POST_BUILD custom command to the ${PROJECT_NAME} target that
#   builds the hexagon kernel library. KNAME is only used in the status message.
#   This listing is a diff view: a COMMAND preceded by an "NNN-" gutter line is
#   the pre-commit form (hexagon-clang invoked directly from CMake), and a
#   COMMAND preceded by an "NNN+" gutter line is the post-commit form, which
#   runs "make clean" and then "make" in ${CMAKE_CURRENT_LIST_DIR}/kernels,
#   forwarding HEXAGON_SDK_PATH, HTP_ARCH_VERSION and DEBUG_FLAG.
#   NOTE(review): the relative ../../../bin paths presumably resolve against the
#   build directory the command runs in - confirm.
function(ggml_hexagon_build_kernel KNAME)
110103
message(STATUS "ggml_hexagon: build hexagon-kernel ${KNAME}")
111104

112105
add_custom_command(
113106
TARGET ${PROJECT_NAME}
114107
POST_BUILD
115108
COMMAND echo "current working path:`pwd`\n"
116-
COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -c ${HEXAGON_KERNELS_PATH}/ggml-dsp.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic ${DEBUG_FLAG} -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -fno-finite-math-only -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/posix -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/qurt/
117-
COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o -c ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -fno-finite-math-only -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc
118-
COMMAND ${HEXAGON_CC} -m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${HEXAGON_TARGET} -o ../../../bin/${HEXAGON_TARGET} -Wl,--start-group ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -Wl,--end-group
119-
COMMAND ls -l ../../../bin/${HEXAGON_TARGET}
120-
COMMAND /bin/cp -fv ../../../bin/${HEXAGON_TARGET} ../../../bin/libggmlop_skel.so
109+
COMMAND echo "${CMAKE_CURRENT_LIST_DIR}/kernels"
110+
COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ clean
111+
COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ HEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} HTP_ARCH_VERSION=${HTP_ARCH_VERSION} DEBUG_FLAG=${DEBUG_FLAG}
112+
COMMAND ls -l ../../../bin/libggmlop_skel.so
121113
COMMENT "build hexagon-kernel"
122114
)
123115
endfunction()

ggml/src/ggml-hexagon/ggml-hexagon.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@
121121
#include "ggml-impl.h"
122122
#include "ggml-backend-impl.h"
123123

124-
#include "kernels/ggmlop_ap_skel.h"
124+
#include "kernels/skel.h"
125125

126126
// =================================================================================================
127127
// section-1: forward/prototype declaration, global vars, macros, data structures
@@ -383,8 +383,8 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = {
383383
#elif defined(_WIN32)
384384
.qnn_runtimelib_path = "C:\\",
385385
#endif
386-
.ggml_hexagon_version = {"1.04"},
387-
.ggml_dsp_version = {"0.61"},
386+
.ggml_hexagon_version = {"1.05"},
387+
.ggml_dsp_version = {"0.62"},
388388
};
389389

390390
//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# The following variables are passed in on the make command line by CMakeLists.txt, e.g.:
2+
#HTP_ARCH_VERSION=v79
3+
#DEBUG_FLAG=-DNDEBUG -Wall
4+
#HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1
5+
6+
# Toolchain and SDK layout. HEXAGON_SDK_PATH and HTP_ARCH_VERSION are not set
# in this file; they are expected to come from the make command line.
HEXAGON_COMPUTE=compute${HTP_ARCH_VERSION}
HEXAGON_CC=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang
HEXAGON_CXX=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang

# Name of the shared library produced by the link step (also used as soname).
TARGET=libggmlop_skel.so

# Print the effective configuration while the makefile is being parsed.
$(info HEXAGON_SDK_PATH:${HEXAGON_SDK_PATH})
$(info HTP_ARCH_VERSION:${HTP_ARCH_VERSION})
$(info DEBUG_FLAG:${DEBUG_FLAG})
$(info HEXAGON_COMPUTE:${HEXAGON_COMPUTE})

# Header search paths inside the Hexagon SDK. Each directory is listed once
# (utils/sim_utils/inc used to be repeated).
INCS=-I${HEXAGON_SDK_PATH}/incs \
 -I${HEXAGON_SDK_PATH}/libs/qprintf/inc \
 -I${HEXAGON_SDK_PATH}/incs/stddef \
 -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs \
 -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc \
 -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc \
 -I${HEXAGON_SDK_PATH}/libs/atomic/inc \
 -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc \
 -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/posix \
 -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/qurt/

# Compile flags shared by every kernel source file.
CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B ${INCS} -fno-finite-math-only

# Link flags for the shared library; the allocator entry points are wrapped
# via --wrap.
LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET}

# Every .c file in this directory is compiled into one object of the library.
SRCS = $(wildcard *.c)
OBJS = $(patsubst %.c, %.o, $(SRCS))
25+
26+
# Default goal: link all kernel objects into ${TARGET}, copy the result into
# ../../../../out/android/bin/ under both its plain and its arch-suffixed name,
# then delete the local copy. Declared phony because the recipe never creates
# a file named ALL.
.PHONY: ALL
ALL:$(OBJS)
	${HEXAGON_CC} ${LDFLAGS} -o ${TARGET} -Wl,--start-group ${OBJS} -Wl,--end-group
	@ls -l ${TARGET}
	/bin/cp -fv ${TARGET} ../../../../out/android/bin/
	/bin/cp -fv ${TARGET} ../../../../out/android/bin/libggmlop_skel${HTP_ARCH_VERSION}.so
	/bin/rm -f *.so
32+
33+
# Compile one kernel source into its object file. ${DEBUG_FLAG} is forwarded to
# the compiler so that the command actually run matches the command echoed
# (it used to be echoed but left out of the real invocation).
# __FILENAME__ is defined as the source file name, quoted as a C string.
%.o:%.c
	@echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<"
	${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<
	@echo "\n"
37+
38+
# Remove the compiled kernel objects. Declared phony so that a stray file
# named "clean" cannot stop the recipe from running.
.PHONY: clean
clean:
	rm -f *.o
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#include "ggml-dsp.h"
2+
3+
inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) {
4+
HVX_Vector * va;
5+
HVX_Vector * vb;
6+
HVX_Vector * vc;
7+
HVX_Vector qf32;
8+
const int FLOATS_PER_VECTOR = 128 / sizeof(float);
9+
const int block = n / FLOATS_PER_VECTOR;
10+
const int left = n % FLOATS_PER_VECTOR;
11+
const int blocks = block * FLOATS_PER_VECTOR;
12+
13+
if (0 == block) {
14+
for (size_t i = 0; i < n; ++i)
15+
z[i] = x[i] + y[i];
16+
17+
return;
18+
}
19+
20+
if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) {
21+
GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y);
22+
for (size_t i = 0; i < n; ++i)
23+
z[i] = x[i] + y[i];
24+
25+
return;
26+
}
27+
28+
va = (HVX_Vector *)x;
29+
vb = (HVX_Vector *)y;
30+
vc = (HVX_Vector *)z;
31+
for (size_t i = 0; i < block; ++i) {
32+
qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++);
33+
*vc = Q6_Vsf_equals_Vqf32(qf32);
34+
vc++;
35+
}
36+
37+
if (left > 0) {
38+
for (size_t i = 0; i < left; ++i)
39+
z[i + blocks] = x[i + blocks] + y[i + blocks];
40+
}
41+
}
42+
43+
static void ggml_compute_forward_add_f32(
44+
const struct ggml_tensor * src0,
45+
const struct ggml_tensor * src1,
46+
struct ggml_tensor * dst) {
47+
GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
48+
uint64_t start_time = ggml_time_us();
49+
50+
memcpy(dst->ne, src1->ne, 16);
51+
memcpy(dst->nb, src1->nb, 16);
52+
ggmlhexagon_dump_tensor(src0, 1);
53+
ggmlhexagon_dump_tensor(src1, 1);
54+
ggmlhexagon_dump_tensor(dst, 1);
55+
56+
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
57+
58+
const int ith = 0;
59+
const int nth = 1;
60+
61+
const int nr = ggml_nrows(src0);
62+
GGML_TENSOR_BINARY_OP_LOCALS
63+
64+
GGML_ASSERT( nb0 == sizeof(float));
65+
GGML_ASSERT(nb00 == sizeof(float));
66+
67+
const int dr = (nr + nth - 1)/nth;
68+
const int ir0 = dr*ith;
69+
const int ir1 = MIN(ir0 + dr, nr);
70+
if (nb10 == sizeof(float)) {
71+
for (int ir = ir0; ir < ir1; ++ir) {
72+
// src1 is broadcastable across src0 and dst in i1, i2, i3
73+
const int32_t i03 = ir/(ne02*ne01);
74+
const int32_t i02 = (ir - i03*ne02*ne01)/ne01;
75+
const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
76+
77+
const int32_t i13 = i03 % ne13;
78+
const int32_t i12 = i02 % ne12;
79+
const int32_t i11 = i01 % ne11;
80+
const int32_t nr0 = ne00 / ne10;
81+
82+
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
83+
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
84+
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
85+
for (int32_t r = 0; r < nr0; ++r) {
86+
ggmlhexagon_dsp_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
87+
}
88+
}
89+
} else {
90+
// src1 is not contiguous
91+
for (int ir = ir0; ir < ir1; ++ir) {
92+
// src1 is broadcastable across src0 and dst in i1, i2, i3
93+
const int32_t i03 = ir/(ne02*ne01);
94+
const int32_t i02 = (ir - i03*ne02*ne01)/ne01;
95+
const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
96+
97+
const int32_t i13 = i03 % ne13;
98+
const int32_t i12 = i02 % ne12;
99+
const int32_t i11 = i01 % ne11;
100+
101+
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
102+
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
103+
104+
for (int32_t i0 = 0; i0 < ne0; ++i0) {
105+
const int32_t i10 = i0 % ne10;
106+
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
107+
108+
dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
109+
}
110+
}
111+
}
112+
113+
uint64_t end_time = ggml_time_us();
114+
uint64_t duration = (end_time - start_time);
115+
GGMLHEXAGON_LOG_DEBUG("duration %llu us", duration);
116+
#if !GGMLHEXAGON_DEBUG
117+
UNUSED(duration);
118+
#endif
119+
120+
GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
121+
}
122+
123+
//FIXME: find out why test-backend-ops fails when the ion rpc mempool is disabled
124+
/*
 * Entry point for the ADD operation.
 *
 * Runs ggml_compute_forward_add_f32 when both src0 and src1 are GGML_TYPE_F32
 * and calls GGML_ABORT("fatal error") for any other type combination.
 * The handle h is not used in this function.
 *
 * Returns 0.
 */
int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst)
{
    GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__);

    // src1's type is only inspected once src0 is known to be F32
    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        ggml_compute_forward_add_f32(src0, src1, dst);
    } else {
        GGML_ABORT("fatal error");
    }

    GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__);
    return 0;
}

0 commit comments

Comments
 (0)