  * this is a complicated skeleton, can expand other ggml ops accordingly
  *
  * currently provide following ggml op' implementation through Hexagon DSP:
- * - GGML_OP_ADD:
+ * - GGML_OP_ADD & GGML_OP_MUL_MAT:
  * this is a skeleton, can expand other ggml ops accordingly
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -343,6 +343,7 @@ struct ggml_backend_qnn_context {
     size_t rpc_mempool_len;
     void * rpc_mempool;
     remote_handle64 ggmlop_handle;
+    int domain_id;
 };

 struct qnn_op_caps {
@@ -363,6 +364,8 @@ struct qnn_parameter {
     int enable_dlbc;
     int inference_approach;     // 0: QNN_GENERAL 1: DIRECT_USE_CDSP 2: QNN_SINGELGRAPH
     int qnn_backend;            // 0: QNN-CPU backend 1: QNN-GPU backend 2: QNN-NPU backend
+    int enable_mulmat_cdsp;     // enable/disable offload mulmat to cDSP
+    int enable_q_mulmat;        // enable/disable offload fp32 & all quantized type mulmat to cDSP
     const char * qnn_cfgfilename;
     const char * qnn_runtimelib_path;
 };
@@ -381,6 +384,8 @@ static struct qnn_parameter g_qnn_params = {
     .enable_dlbc        = 1,
     .inference_approach = 0,
     .qnn_backend        = 2,    // default is QNN-NPU backend
+    .enable_mulmat_cdsp = 0,
+    .enable_q_mulmat    = 0,
     .qnn_cfgfilename    = "ggml-qnn.cfg",
 #if defined(__ANDROID__)
 // Android command line program
@@ -1451,7 +1456,7 @@ static int ggmlhexagon_get_dsp_support(int * domain) {
     return hexagon_error;
 }

-static int ggmlhexagon_get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) {
+static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capability) {
     int hexagon_error = AEE_SUCCESS;
     *capability = 0;

@@ -1633,7 +1638,7 @@ static bool ggmlhexagon_is_status_notification_supported(int domain) {
     return false;
 }

-static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) {
+static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t * capability) {
     int hexagon_error = AEE_SUCCESS;
     *capability = 0;

@@ -1679,7 +1684,7 @@ static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t * capability, u
     return hexagon_error;
 }

-static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) {
+static int ggmlhexagon_get_hvx_arch_ver(int domain, uint32_t * capability) {
     int hexagon_error = AEE_SUCCESS;
     *capability = 0;
     if (remote_handle_control) {
@@ -1696,7 +1701,7 @@ static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) {
             hexagon_error = AEE_SUCCESS;
             goto bail;
         } else if (hexagon_error == AEE_SUCCESS) {
-            *capability = dsp_capability_arch_ver.capability;
+            *capability = dsp_capability_arch_ver.capability & 0xFF;
         } else {
             GGMLQNN_LOG_DEBUG("get_hex_arch_ver failed with error 0x%x", hexagon_error);
             goto bail;
@@ -1710,7 +1715,7 @@ static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) {
     return hexagon_error;
 }

-static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr)
+static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t attr, uint32_t * capability)
 {
     int hexagon_error = AEE_SUCCESS;
     *capability = 0;
@@ -1834,6 +1839,58 @@ static AEEResult ggmlhexagon_set_clocks(remote_handle64 handle, int32 power_leve
     return AEE_SUCCESS;
 }

+static void ggmlhexagon_probe_dspinfo(ggml_backend_qnn_context * ctx, size_t * rpcmem_capacity) {
+    size_t candidate_size = 0;
+    uint8_t * rpc_buffer  = nullptr;
+    const int SIZE_IN_MB  = (1 << 20);
+    size_t probe_slots[]  = {1024, 1536, 2048 - 48, 2048};
+    size_t probe_counts   = sizeof(probe_slots) / sizeof(size_t);
+    for (size_t idx = 0; idx < probe_counts; idx++) {
+        rpc_buffer = static_cast<uint8_t *>(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB)));
+        if (nullptr == rpc_buffer) {
+            GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno));
+            break;
+        } else {
+            candidate_size = probe_slots[idx];
+            rpcmem_free(rpc_buffer);
+            rpc_buffer = nullptr;
+        }
+    }
+
+    *rpcmem_capacity = candidate_size;
+    GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", *rpcmem_capacity);
+
+    uint32_t dsp_version = 0;
+    ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version);
+
+    if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) {
+        GGMLQNN_LOG_DEBUG("dsp arch version 0x%x", dsp_version);
+    } else {
+        GGMLQNN_LOG_WARN("error: dsp arch version 0x%x is not supported", dsp_version);
+    }
+
+    uint32_t vtcm_count = 0;
+    uint32_t vtcm_page  = 0;
+    ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_COUNT, &vtcm_count);
+    ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_PAGE, &vtcm_page);
+    GGMLQNN_LOG_DEBUG("vtcm_count %d", vtcm_count);
+    GGMLQNN_LOG_DEBUG("vtcm_page %d", vtcm_page);
+
+    uint32_t hmx_depth   = 0;
+    uint32_t hmx_spatial = 0;
+    ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_DEPTH, &hmx_depth);
+    ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_SPATIAL, &hmx_spatial);
+    GGMLQNN_LOG_DEBUG("hmx_depth %d", hmx_depth);
+    GGMLQNN_LOG_DEBUG("hmx_spatial %d", hmx_spatial);
+
+    uint32_t hvx_support_128b = 0;
+    ggmlhexagon_get_hvx_support_info(ctx->domain_id, HVX_SUPPORT_128B, &hvx_support_128b);
+    GGMLQNN_LOG_DEBUG("hvx_support_128b %d", hvx_support_128b);
+
+    GGMLQNN_LOG_DEBUG("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support());
+    GGMLQNN_LOG_DEBUG("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id));
+}
+
 static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) {
     int hexagon_error = AEE_SUCCESS;

@@ -1931,6 +1988,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) {
         }
     }

+    ctx->domain_id = domain_id;
     GGMLQNN_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id));
     GGMLQNN_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled);
     if (is_unsignedpd_enabled) {
@@ -1966,7 +2024,9 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) {
     hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle);
     if (AEE_SUCCESS == hexagon_error) {
         GGMLQNN_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id));
-        GGMLQNN_LOG_INFO("only support GGML_OP_ADD on cDSP currently\n");
+        GGMLQNN_LOG_INFO("only support GGML_OP_ADD and GGML_OP_MUL_MAT on cDSP currently\n");
+        size_t rpcmem_size = 0;
+        ggmlhexagon_probe_dspinfo(ctx, &rpcmem_size);
         ggmlhexagon_set_clocks(ctx->ggmlop_handle, HAP_DCVS_V2_DUTY_CYCLE_MODE, 40, 1);
         ggmlhexagon_set_rpc_latency(domain_id, RPC_POLL_QOS, 1000);
     } else {
@@ -1983,9 +2043,10 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) {

     if (ctx->rpc_mempool) {
         rpcmem_free(ctx->rpc_mempool);
-        ctx->rpc_mempool = nullptr;
-        ctx->rpc_mempool_len = 0;
-        ctx->ggmlop_handle = -1;
+        ctx->rpc_mempool     = nullptr;
+        ctx->rpc_mempool_len = 0;
+        ctx->ggmlop_handle   = -1;
+        ctx->domain_id       = -1;
     }

     return -1;
@@ -2005,8 +2066,9 @@ static void ggmlhexagon_close_cdsp(ggml_backend_qnn_context * ctx) {

     if (ctx->rpc_mempool) {
         rpcmem_free(ctx->rpc_mempool);
-        ctx->rpc_mempool = nullptr;
-        ctx->rpc_mempool_len = 0;
+        ctx->rpc_mempool     = nullptr;
+        ctx->rpc_mempool_len = 0;
+        ctx->domain_id       = -1;
     }
     GGMLQNN_LOG_DEBUG("leave %s", __func__);
 }
@@ -2019,39 +2081,28 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens

     int hexagon_error = AEE_SUCCESS;
     ggmlhexagon_op_func_t op_func = nullptr;
-    void * wdata = nullptr;
-
     ggml_tensor * src0 = op->src[0];
-    // src1 might-be nullptr for some ggml op
     ggml_tensor * src1 = op->src[1];
     ggml_tensor * dst  = op;
-    ggml_type src0_type = src0->type;

     switch (op->op) {
         case GGML_OP_ADD:
             op_func = ggmlop_dsp_add;
             break;
         case GGML_OP_MUL_MAT: {
-            wdata = ggmlqnn_type_trait(ctx, op);
             op_func = ggmlop_dsp_mulmat;
             break;
         }
         default:
             return;
     }

-    if ((GGML_OP_MUL_MAT == op->op) && (src0_type != GGML_TYPE_F32)) {
-        dsptensor_0.data = wdata;
-        dsptensor_0.data_len = ctx->desired_size;
-    } else {
-        dsptensor_0.data = src0->data;
-        dsptensor_0.data_len = ggml_nbytes(src0);
-    }
+    dsptensor_0.data     = src0->data;
+    dsptensor_0.data_len = ggml_nbytes(src0);

-    dsptensor_1.data = src1->data;
-    dsptensor_2.data = dst->data;
+    dsptensor_1.data     = src1->data;
+    dsptensor_2.data     = dst->data;

-    // make compiler happy
     dsptensor_0.ne[0] = src0->ne[0];
     dsptensor_0.ne[1] = src0->ne[1];
     dsptensor_0.ne[2] = src0->ne[2];
@@ -2086,10 +2137,6 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens
     dsptensor_1.data_len = ggml_nbytes(src1);
     dsptensor_2.data_len = ggml_nbytes(dst);

-    if ((GGML_OP_MUL_MAT == op->op) && (src0_type != GGML_TYPE_F32)) {
-        dsptensor_0.data_len = ctx->desired_size;
-    }
-
     dsptensor_0.type = src0->type;
     dsptensor_1.type = src1->type;
     dsptensor_2.type = dst->type;
@@ -4179,10 +4226,12 @@ static void ggmlqnn_load_cfg() {
     qnncfg_instance.get_intvalue("general", "dump_op_info", g_qnn_params.dump_op_info, 0);
     qnncfg_instance.get_intvalue("general", "inference_approach", g_qnn_params.inference_approach, 0);
     qnncfg_instance.get_intvalue("general", "qnn_backend", g_qnn_params.qnn_backend, 2);
-    qnncfg_instance.get_intvalue("npu", "hvx_threads", g_qnn_params.hvx_threads, 4);
-    qnncfg_instance.get_intvalue("npu", "vtcm_size_in_mb", g_qnn_params.vtcm_size_in_mb, 8);
-    qnncfg_instance.get_intvalue("npu", "enable_dlbc", g_qnn_params.enable_dlbc, 0);
-    qnncfg_instance.get_stringvalue("npu", "precision_mode", precision_mode, "fp32");
+    qnncfg_instance.get_intvalue("qnn", "hvx_threads", g_qnn_params.hvx_threads, 4);
+    qnncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_qnn_params.vtcm_size_in_mb, 8);
+    qnncfg_instance.get_intvalue("qnn", "enable_dlbc", g_qnn_params.enable_dlbc, 0);
+    qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32");
+    qnncfg_instance.get_intvalue("cdsp", "enable_mulmat_cdsp", g_qnn_params.enable_mulmat_cdsp, 0);
+    qnncfg_instance.get_intvalue("cdsp", "enable_q_mulmat", g_qnn_params.enable_q_mulmat, 0);
     GGMLQNN_LOG_INFO("print_qnn_internal_log=%d", g_qnn_params.print_qnn_internal_log);
     GGMLQNN_LOG_INFO("inference_approach=%d(%s)", g_qnn_params.inference_approach,
                      ggmlqnn_get_inference_approach_name(g_qnn_params.inference_approach));
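For reference, a minimal ggml-qnn.cfg sketch that matches the section/key names read above. The INI-style layout, comment syntax, and the concrete values are illustrative assumptions, not part of this commit:

    [general]
    inference_approach = 1    ; 0: QNN_GENERAL  1: DIRECT_USE_CDSP  2: QNN_SINGELGRAPH
    qnn_backend        = 2    ; default QNN-NPU backend

    [qnn]
    hvx_threads        = 4
    vtcm_size_in_mb    = 8
    enable_dlbc        = 0
    precision_mode     = fp32

    [cdsp]
    enable_mulmat_cdsp = 1    ; offload GGML_OP_MUL_MAT to the Hexagon cDSP
    enable_q_mulmat    = 0    ; also accept quantized src0 for mul_mat when enabled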
@@ -4226,39 +4275,48 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, cons
     const int64_t ne00 = op_tensor->src[0]->ne[0];
     uint32_t src0_rank = 0;
     uint32_t src1_rank = 0;
+    bool support = false;
+
     if (nullptr != src0) {
         src0_rank = ggml_n_dims(src0);
     }
     if (nullptr != src1) {
         src1_rank = ggml_n_dims(src1);
     }

-    // TODO: only support offload GGML_OP_ADD and GGML_OP_MUL_MAT to cDSP directly
-    bool support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT));
+    if (g_qnn_params.enable_mulmat_cdsp)
+        support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT));
+    else
+        support = (op_tensor->op == GGML_OP_ADD);
     if (!support)
         return false;

+    ggmlqnn_dump_op_info(op_tensor);
     switch (op_tensor->op) {
         case GGML_OP_ADD:
         {
             if (!ggml_are_same_shape(src0, src1)) {
                 return false;
             }
+
             return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
         }
-
         case GGML_OP_MUL_MAT:
         {
             ggmlqnn_dump_op_info(op_tensor);

-            if (src1_rank != 2)
+            // TODO: 3d & 4d matrix mulmat on cDSP
+            if (src0_rank != 2)
                 return false;

-            return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
-
+
+            if (g_qnn_params.enable_q_mulmat)
+                return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type))
+                       && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
+            else
+                return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
         }
         default:
-            return false;
+            return ggmlqnn_same_types(ctx, op_tensor);
     }
 }

@@ -4597,8 +4655,6 @@ static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(
     if (nullptr == ctx->buffer) {
         GGMLQNN_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / (1 << 20));
         return nullptr;
-    } else {
-        GGMLQNN_LOG_DEBUG("%s: allocate %d MiB\n", __func__, size_aligned / (1 << 20));
     }

     return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size);
@@ -4729,10 +4785,16 @@ static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *
         *total = ggmlqnn_get_system_total_memory_in_bytes();
         *free  = ggmlqnn_get_system_free_memory_in_bytes();
     } else if (QNN_BACKEND_NPU == ctx->device) {
-        size_t rpc_ion_memsize = ctx->instance->get_rpcmem_capacity();
-        size_t rpc_ion_usage = ctx->instance->get_rpcmem_usage();
-        GGMLQNN_LOG_DEBUG("rpc memsize %d", rpc_ion_memsize);
-        GGMLQNN_LOG_DEBUG("rpc usage %d", rpc_ion_usage);
+        size_t rpc_ion_memsize = 0;
+        size_t rpc_ion_usage   = 0;
+        if (DIRECT_USE_CDSP != g_qnn_params.inference_approach) {
+            rpc_ion_memsize = ctx->instance->get_rpcmem_capacity();
+            rpc_ion_usage   = ctx->instance->get_rpcmem_usage();
+        } else {
+            ggmlhexagon_probe_dspinfo(ctx, &rpc_ion_memsize);
+        }
+        GGMLQNN_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize);
+        GGMLQNN_LOG_DEBUG("rpc usage %d M", rpc_ion_usage);
         *total = rpc_ion_memsize * (1 << 20);
         *free  = (rpc_ion_memsize - rpc_ion_usage) * (1 << 20);
     }
@@ -5078,9 +5140,12 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
         return g_qnn_mgr[device].backend;
     }

-    qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path);
-    if (nullptr == instance)
-        return nullptr;
+    // don't initialize QNN when inference approach is offload ggml op to Hexagon cDSP directly
+    if (DIRECT_USE_CDSP != g_qnn_params.inference_approach) {
+        qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path);
+        if (nullptr == instance)
+            return nullptr;
+    }

     ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_general;

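Usage note (not part of the commit): backend creation still goes through the entry point shown in the hunk above; with inference_approach set to DIRECT_USE_CDSP the QNN instance setup is simply skipped. A minimal caller sketch, where the device index and runtime-library path are assumptions for illustration:

    // device 2 == QNN_BACKEND_NPU (Hexagon); the path argument is a placeholder, adjust for the target device
    ggml_backend_t backend = ggml_backend_qnn_init(QNN_BACKEND_NPU, "/data/local/tmp/");
    if (nullptr == backend) {
        // fall back to the default ggml CPU backend
    }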