Skip to content

Commit cb304a6

Browse files
author
zhouwg
committed
ggml-hexagon: enable multi-threading feature on cDSP side
1 parent 59454c3 commit cb304a6

File tree

6 files changed

+59
-17
lines changed

6 files changed

+59
-17
lines changed

ggml/src/ggml-hexagon/ggml-hexagon.cpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ struct hexagon_appcfg_t {
325325
int enable_all_q_mulmat; // enable/disable offload all quantized type mulmat to cDSP
326326
int profiler_duration; // threshold of duration in profiler, per seconds
327327
int profiler_counts; // threshold of counts in profiler
328+
int thread_counts; // thread_counts on cDSP side
328329
const char * cfgfilename;
329330
const char * runtime_libpath;
330331
char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN];
@@ -348,6 +349,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = {
348349
.enable_all_q_mulmat = 0,
349350
.profiler_duration = 5,
350351
.profiler_counts = 100,
352+
.thread_counts = 4,
351353
.cfgfilename = "ggml-hexagon.cfg",
352354
#if defined(__ANDROID__)
353355
//Android command line program
@@ -357,8 +359,8 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = {
357359
#elif defined(_WIN32)
358360
.qnn_runtimelib_path = "C:\\",
359361
#endif
360-
.ggml_hexagon_version = {"1.80"},
361-
.ggml_dsp_version = {"0.60"},
362+
.ggml_hexagon_version = {"1.81"},
363+
.ggml_dsp_version = {"0.61"},
362364
};
363365

364366
//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices
@@ -886,10 +888,19 @@ class hexagon_profiler {
886888
//FIXME: hardcoded filename of profiler data
887889
std::string filename = std::string(g_hexagon_appcfg.runtime_libpath) + "/";
888890
if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
889-
if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) {
890-
filename = filename + "hexagon_perf_cdsp.dat";
891+
if (g_hexagon_appcfg.thread_counts > 1) {
892+
//multi-threading feature enabled on cDSP side
893+
if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) {
894+
filename = filename + "hexagon_perf_cdsp_mt.dat";
895+
} else {
896+
filename = filename + "hexagon_perf_cdsp_ion_mt.dat";
897+
}
891898
} else {
892-
filename = filename + "hexagon_perf_cdsp_ion.dat";
899+
if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) {
900+
filename = filename + "hexagon_perf_cdsp.dat";
901+
} else {
902+
filename = filename + "hexagon_perf_cdsp_ion.dat";
903+
}
893904
}
894905
} else {
895906
filename = filename + "hexagon_perf_qnn.dat";
@@ -1782,6 +1793,7 @@ static void ggmlhexagon_load_cfg() {
17821793

17831794
hexagoncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 0);
17841795
hexagoncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0);
1796+
hexagoncfg_instance.get_intvalue("cdsp", "thread_counts", g_hexagon_appcfg.thread_counts, 4);
17851797

17861798
GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version);
17871799
GGMLHEXAGON_LOG_INFO("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version);
@@ -5315,7 +5327,8 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) {
53155327
//FIXME: only support offload fp32 GGML_OP_ADD and fp32 GGML_OP_MUL_MAT to cDSP
53165328
GGMLHEXAGON_LOG_INFO("only support offload fp32 GGML_OP_ADD and fp32 GGML_OP_MUL_MAT to cDSP currently");
53175329
ggmlhexagon_probe_dspinfo(ctx);
5318-
ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1);
5330+
//FIXME: reuse this function to pass thread_counts to the code on cDSP side until the qidl mechanism is fully understood
5331+
ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1, g_hexagon_appcfg.thread_counts);
53195332
ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 100);
53205333
int result = ggmlhexagon_init_rpcmempool(ctx);
53215334
if (0 != result) {

ggml/src/ggml-hexagon/kernels/ggml-dsp.c

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -876,6 +876,7 @@ static int64_t ggml_time_us(void) {
876876
// =================================================================================================
877877
// section-4: ggml-hexagon kernel helper function
878878
// =================================================================================================
879+
static int32 g_thread_counts = 1;
879880
int ggmlop_dsp_open(const char*uri, remote_handle64* handle) {
880881
void *tptr = NULL;
881882
FARF(HIGH, "uri %s", uri);
@@ -897,13 +898,15 @@ int ggmlop_dsp_close(remote_handle64 handle) {
897898
return 0;
898899
}
899900

900-
AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled) {
901+
AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) {
901902
GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
902903
HAP_power_request_t request;
903904
memset(&request, 0, sizeof(HAP_power_request_t));
904905
request.type = HAP_power_set_apptype;
905906
request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
906907

908+
g_thread_counts = thread_counts;
909+
907910
void * ggmop_ctx = (void*)(handle);
908911
int retval = HAP_power_set(ggmop_ctx, &request);
909912
if (retval) {
@@ -1192,7 +1195,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
11921195
}
11931196

11941197
//FIXME: only support fp32 mulmat on cDSP
1195-
int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1198+
static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
11961199
GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
11971200
ggmlhexagon_dump_tensor(src0, 0);
11981201
ggmlhexagon_dump_tensor(src1, 0);
@@ -1353,6 +1356,21 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te
13531356
return 0;
13541357
}
13551358

1359+
int ggmlop_dsp_mulmat_multithread(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) {
1360+
GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
1361+
GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
1362+
return 0;
1363+
}
1364+
1365+
int ggmlop_dsp_mulmat(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) {
1366+
if (g_thread_counts > 1) {
1367+
return ggmlop_dsp_mulmat_multithread(h, src0, src1, dst);
1368+
} else {
1369+
return ggmlop_dsp_mulmat_singlethread(h, src0, src1, dst);
1370+
}
1371+
return 0;
1372+
}
1373+
13561374
int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
13571375

13581376
GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );

ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -296,24 +296,25 @@ __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_open)(const char* uri, remote_hand
296296
__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE {
297297
return __QAIC_REMOTE(remote_handle64_close)(h);
298298
}
299-
static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_t _in0[1], uint32_t _in1[1], uint32_t _in2[1]) {
299+
static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_t _in0[1], uint32_t _in1[1], uint32_t _in2[1], uint32_t _in3[1]) {
300300
remote_arg _pra[1] = {0};
301-
uint32_t _primIn[3]= {0};
301+
uint32_t _primIn[4]= {0};
302302
int _nErr = 0;
303303
_pra[0].buf.pv = (void*)_primIn;
304304
_pra[0].buf.nLen = sizeof(_primIn);
305305
_COPY(_primIn, 0, _in0, 0, 4);
306306
_COPY(_primIn, 4, _in1, 0, 4);
307307
_COPY(_primIn, 8, _in2, 0, 4);
308+
_COPY(_primIn, 12,_in3, 0, 4);
308309
_TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _pra));
309310
_CATCH_FARF(_nErr) {
310311
_QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _mid, __func__);
311312
}
312313
return _nErr;
313314
}
314-
__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 dcvs_enable) __QAIC_STUB_ATTRIBUTE {
315+
__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 dcvs_enable, int32 threads) __QAIC_STUB_ATTRIBUTE {
315316
uint32_t _mid = 2;
316-
return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&dcvs_enable);
317+
return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&dcvs_enable, (uint32_t*)&threads);
317318
}
318319
static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) {
319320
int _nErr = 0;

ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_open)(const char* uri, remote_
272272
* @retval, 0 on success, should always succeed
273273
*/
274274
__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE;
275-
__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 dcvs_enable) __QAIC_HEADER_ATTRIBUTE;
275+
__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 dcvs_enable, int32 threads) __QAIC_HEADER_ATTRIBUTE;
276276
__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE;
277277
__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE;
278278
__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_softmax)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE;

ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -397,11 +397,12 @@ static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*,
397397
_allocator_deinit(_al);
398398
return _nErr;
399399
}
400-
static __inline int _skel_method_1(int (*_pfn)(remote_handle64, int32, int32, int32), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) {
400+
static __inline int _skel_method_1(int (*_pfn)(remote_handle64, int32, int32, int32, int32), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) {
401401
remote_arg* _praEnd = 0;
402402
uint32_t _in0[1] = {0};
403403
uint32_t _in1[1] = {0};
404404
uint32_t _in2[1] = {0};
405+
uint32_t _in3[1] = {0};
405406
uint32_t* _primIn= 0;
406407
int _nErr = 0;
407408
_praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc));
@@ -415,7 +416,8 @@ static __inline int _skel_method_1(int (*_pfn)(remote_handle64, int32, int32, in
415416
_COPY(_in0, 0, _primIn, 0, 4);
416417
_COPY(_in1, 0, _primIn, 4, 4);
417418
_COPY(_in2, 0, _primIn, 8, 4);
418-
_TRY(_nErr, _pfn(_h, (int32)*_in0, (int32)*_in1, (int32)*_in2));
419+
_COPY(_in3, 0, _primIn, 12, 4);
420+
_TRY(_nErr, _pfn(_h, (int32)*_in0, (int32)*_in1, (int32)*_in2, (int32)*_in3));
419421
_QAIC_CATCH(_nErr) {}
420422
return _nErr;
421423
}

scripts/ggml-hexagon.cfg

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@
2323
#
2424
[general]
2525
#version of ggml-hexagon.cpp on ARM-AP side
26-
version = "1.80"
26+
version = "1.81"
2727
#version of ggml-dsp.c on cDSP side
28-
ggmldsp_version = "0.60"
28+
ggmldsp_version = "0.61"
2929

3030
#0: HEXAGON_BACKEND_QNNCPU
3131
#1: HEXAGON_BACKEND_QNNGPU
@@ -100,3 +100,11 @@ enable_rpc_ion_mempool = 0
100100
enable_all_q_mulmat = 0
101101
#attention:
102102
#ensure enable_q_mulmat = 1 when setting enable_all_q_mulmat = 1
103+
104+
#thread count on cDSP side; a value greater than 1 enables multi-threading
105+
# 0 disable multi-threading on cDSP side (treated the same as 1)
106+
# 1 disable multi-threading on cDSP side (single thread)
107+
# 2 set thread_counts to 2 on cDSP side
108+
# 3 set thread_counts to 3 on cDSP side
109+
# 4 set thread_counts to 4 on cDSP side
110+
thread_counts = 1

0 commit comments

Comments
 (0)