Commit 97f7874

Author: zhouwg (committed)
Parent commit: 4d7c345

ggml-qnn: make host code (ggml-qnn.cpp) clearer and more stable

File tree: 2 files changed, +126 -53 lines

ggml/src/ggml-qnn/ggml-qnn.cpp

Lines changed: 116 additions & 51 deletions
@@ -23,7 +23,7 @@
  * this is a complicated skeleton, can expand other ggml ops accordingly
  *
  * currently provide following ggml op' implementation through Hexagon DSP:
- * - GGML_OP_ADD:
+ * - GGML_OP_ADD & GGML_OP_MUL_MAT:
  * this is a skeleton, can expand other ggml ops accordingly
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -343,6 +343,7 @@ struct ggml_backend_qnn_context {
     size_t rpc_mempool_len;
     void * rpc_mempool;
     remote_handle64 ggmlop_handle;
+    int domain_id;
 };

 struct qnn_op_caps {
@@ -363,6 +364,8 @@ struct qnn_parameter {
     int enable_dlbc;
     int inference_approach; // 0: QNN_GENERAL 1: DIRECT_USE_CDSP 2: QNN_SINGELGRAPH
     int qnn_backend; // 0: QNN-CPU backend 1: QNN-GPU backend 2: QNN-NPU backend
+    int enable_mulmat_cdsp; // enable/disable offload mulmat to cDSP
+    int enable_q_mulmat; // enable/disable offload fp32 & all quantized type mulmat to cDSP
     const char * qnn_cfgfilename;
     const char * qnn_runtimelib_path;
 };
@@ -381,6 +384,8 @@ static struct qnn_parameter g_qnn_params = {
     .enable_dlbc = 1,
     .inference_approach = 0,
     .qnn_backend = 2, //default is QNN-NPU backend
+    .enable_mulmat_cdsp = 0,
+    .enable_q_mulmat = 0,
     .qnn_cfgfilename = "ggml-qnn.cfg",
 #if defined(__ANDROID__)
     //Android command line program
@@ -1451,7 +1456,7 @@ static int ggmlhexagon_get_dsp_support(int * domain) {
     return hexagon_error;
 }

-static int ggmlhexagon_get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) {
+static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capability) {
     int hexagon_error = AEE_SUCCESS;
     *capability = 0;

@@ -1633,7 +1638,7 @@ static bool ggmlhexagon_is_status_notification_supported(int domain) {
     return false;
 }

-static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) {
+static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t * capability) {
     int hexagon_error = AEE_SUCCESS;
     *capability = 0;

@@ -1679,7 +1684,7 @@ static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t * capability, u
     return hexagon_error;
 }

-static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) {
+static int ggmlhexagon_get_hvx_arch_ver(int domain, uint32_t * capability) {
     int hexagon_error = AEE_SUCCESS;
     *capability = 0;
     if(remote_handle_control) {
@@ -1696,7 +1701,7 @@ static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) {
             hexagon_error = AEE_SUCCESS;
             goto bail;
         } else if (hexagon_error == AEE_SUCCESS) {
-            *capability = dsp_capability_arch_ver.capability;
+            *capability = dsp_capability_arch_ver.capability & 0xFF;
         } else {
             GGMLQNN_LOG_DEBUG("get_hex_arch_ver failed with error 0x%x", hexagon_error);
             goto bail;
@@ -1710,7 +1715,7 @@ static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) {
     return hexagon_error;
 }

-static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr)
+static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t attr, uint32_t * capability)
 {
     int hexagon_error = AEE_SUCCESS;
     *capability = 0;
@@ -1834,6 +1839,58 @@ static AEEResult ggmlhexagon_set_clocks(remote_handle64 handle, int32 power_leve
     return AEE_SUCCESS;
 }

+static void ggmlhexagon_probe_dspinfo(ggml_backend_qnn_context * ctx, size_t * rpcmem_capacity) {
+    size_t candidate_size = 0;
+    uint8_t * rpc_buffer = nullptr;
+    const int SIZE_IN_MB = (1 << 20);
+    size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048};
+    size_t probe_counts = sizeof(probe_slots) / sizeof(size_t);
+    for (size_t idx = 0; idx < probe_counts; idx++) {
+        rpc_buffer = static_cast<uint8_t *>(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB)));
+        if (nullptr == rpc_buffer) {
+            GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno));
+            break;
+        } else {
+            candidate_size = probe_slots[idx];
+            rpcmem_free(rpc_buffer);
+            rpc_buffer = nullptr;
+        }
+    }
+
+    *rpcmem_capacity = candidate_size;
+    GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", *rpcmem_capacity);
+
+    uint32_t dsp_version = 0;
+    ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version);
+
+    if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) {
+        GGMLQNN_LOG_DEBUG("dsp arch version 0x%x", dsp_version);
+    } else {
+        GGMLQNN_LOG_WARN("error: dsp arch version 0x%x is not supported", dsp_version);
+    }
+
+    uint32_t vtcm_count = 0;
+    uint32_t vtcm_page = 0;
+    ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_COUNT, &vtcm_count);
+    ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_PAGE, &vtcm_page);
+    GGMLQNN_LOG_DEBUG("vtcm_count %d", vtcm_count);
+    GGMLQNN_LOG_DEBUG("vtcm_page %d", vtcm_page);
+
+    uint32_t hmx_depth = 0;
+    uint32_t hmx_spatial = 0;
+    ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_DEPTH, &hmx_depth);
+    ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_SPATIAL, &hmx_spatial);
+    GGMLQNN_LOG_DEBUG("hmx_depth %d", hmx_depth);
+    GGMLQNN_LOG_DEBUG("hmx_spatial %d", hmx_spatial);
+
+    uint32_t hvx_support_128b = 0;
+    ggmlhexagon_get_hvx_support_info(ctx->domain_id, HVX_SUPPORT_128B, &hvx_support_128b);
+    GGMLQNN_LOG_DEBUG("hvx_support_128b %d", hvx_support_128b);
+
+    GGMLQNN_LOG_DEBUG("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support());
+    GGMLQNN_LOG_DEBUG("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id));
+}
+
 static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) {
     int hexagon_error = AEE_SUCCESS;

@@ -1931,6 +1988,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) {
         }
     }

+    ctx->domain_id = domain_id;
     GGMLQNN_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id));
     GGMLQNN_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled);
     if (is_unsignedpd_enabled) {
@@ -1966,7 +2024,9 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) {
     hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle);
     if (AEE_SUCCESS == hexagon_error) {
         GGMLQNN_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id));
-        GGMLQNN_LOG_INFO("only support GGML_OP_ADD on cDSP currently\n");
+        GGMLQNN_LOG_INFO("only support GGML_OP_ADD and GGML_OP_MUL_MAT on cDSP currently\n");
+        size_t rpcmem_size = 0;
+        ggmlhexagon_probe_dspinfo(ctx, &rpcmem_size);
         ggmlhexagon_set_clocks(ctx->ggmlop_handle, HAP_DCVS_V2_DUTY_CYCLE_MODE, 40, 1);
         ggmlhexagon_set_rpc_latency(domain_id, RPC_POLL_QOS, 1000);
     } else {
@@ -1983,9 +2043,10 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) {

     if (ctx->rpc_mempool) {
         rpcmem_free(ctx->rpc_mempool);
-        ctx->rpc_mempool = nullptr;
-        ctx->rpc_mempool_len = 0;
-        ctx->ggmlop_handle = -1;
+        ctx->rpc_mempool = nullptr;
+        ctx->rpc_mempool_len = 0;
+        ctx->ggmlop_handle = -1;
+        ctx->domain_id = -1;
     }

     return -1;
@@ -2005,8 +2066,9 @@ static void ggmlhexagon_close_cdsp(ggml_backend_qnn_context * ctx) {

     if (ctx->rpc_mempool) {
         rpcmem_free(ctx->rpc_mempool);
-        ctx->rpc_mempool = nullptr;
-        ctx->rpc_mempool_len = 0;
+        ctx->rpc_mempool = nullptr;
+        ctx->rpc_mempool_len = 0;
+        ctx->domain_id = -1;
     }
     GGMLQNN_LOG_DEBUG("leave %s", __func__);
 }
@@ -2019,39 +2081,28 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens

     int hexagon_error = AEE_SUCCESS;
     ggmlhexagon_op_func_t op_func = nullptr;
-    void * wdata = nullptr;
-
     ggml_tensor * src0 = op->src[0];
-    //src1 might-be nullptr for some ggml op
     ggml_tensor * src1 = op->src[1];
     ggml_tensor * dst = op;
-    ggml_type src0_type = src0->type;

     switch (op->op) {
         case GGML_OP_ADD:
             op_func = ggmlop_dsp_add;
             break;
         case GGML_OP_MUL_MAT: {
-            wdata = ggmlqnn_type_trait(ctx, op);
             op_func = ggmlop_dsp_mulmat;
             break;
         }
         default:
             return;
     }

-    if ((GGML_OP_MUL_MAT == op->op) && (src0_type != GGML_TYPE_F32)) {
-        dsptensor_0.data = wdata;
-        dsptensor_0.data_len = ctx->desired_size;
-    } else {
-        dsptensor_0.data = src0->data;
-        dsptensor_0.data_len = ggml_nbytes(src0);
-    }
+    dsptensor_0.data = src0->data;
+    dsptensor_0.data_len = ggml_nbytes(src0);

-    dsptensor_1.data = src1->data;
-    dsptensor_2.data = dst->data;
+    dsptensor_1.data = src1->data;
+    dsptensor_2.data = dst->data;

-    //make compiler happy
     dsptensor_0.ne[0] = src0->ne[0];
     dsptensor_0.ne[1] = src0->ne[1];
     dsptensor_0.ne[2] = src0->ne[2];
@@ -2086,10 +2137,6 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens
     dsptensor_1.data_len = ggml_nbytes(src1);
     dsptensor_2.data_len = ggml_nbytes(dst);

-    if ((GGML_OP_MUL_MAT == op->op) && (src0_type != GGML_TYPE_F32)) {
-        dsptensor_0.data_len = ctx->desired_size;
-    }
-
     dsptensor_0.type = src0->type;
     dsptensor_1.type = src1->type;
     dsptensor_2.type = dst->type;
@@ -4179,10 +4226,12 @@ static void ggmlqnn_load_cfg() {
     qnncfg_instance.get_intvalue("general", "dump_op_info", g_qnn_params.dump_op_info, 0);
     qnncfg_instance.get_intvalue("general", "inference_approach", g_qnn_params.inference_approach, 0);
     qnncfg_instance.get_intvalue("general", "qnn_backend", g_qnn_params.qnn_backend, 2);
-    qnncfg_instance.get_intvalue("npu", "hvx_threads", g_qnn_params.hvx_threads, 4);
-    qnncfg_instance.get_intvalue("npu", "vtcm_size_in_mb", g_qnn_params.vtcm_size_in_mb, 8);
-    qnncfg_instance.get_intvalue("npu", "enable_dlbc", g_qnn_params.enable_dlbc, 0);
-    qnncfg_instance.get_stringvalue("npu", "precision_mode", precision_mode, "fp32");
+    qnncfg_instance.get_intvalue("qnn", "hvx_threads", g_qnn_params.hvx_threads, 4);
+    qnncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_qnn_params.vtcm_size_in_mb, 8);
+    qnncfg_instance.get_intvalue("qnn", "enable_dlbc", g_qnn_params.enable_dlbc, 0);
+    qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32");
+    qnncfg_instance.get_intvalue("cdsp", "enable_mulmat_cdsp", g_qnn_params.enable_mulmat_cdsp, 0);
+    qnncfg_instance.get_intvalue("cdsp", "enable_q_mulmat", g_qnn_params.enable_q_mulmat, 0);
     GGMLQNN_LOG_INFO("print_qnn_internal_log=%d", g_qnn_params.print_qnn_internal_log);
     GGMLQNN_LOG_INFO("inference_approach=%d(%s)", g_qnn_params.inference_approach,
                      ggmlqnn_get_inference_approach_name(g_qnn_params.inference_approach));
@@ -4226,39 +4275,48 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, cons
     const int64_t ne00 = op_tensor->src[0]->ne[0];
     uint32_t src0_rank = 0;
     uint32_t src1_rank = 0;
+    bool support = false;
+
     if (nullptr != src0) {
         src0_rank = ggml_n_dims(src0);
     }
     if (nullptr != src1) {
         src1_rank = ggml_n_dims(src1);
     }

-    //TODO: only support offload GGML_OP_ADD and GGML_OP_MUL_MAT to cDSP directly
-    bool support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT));
+    if (g_qnn_params.enable_mulmat_cdsp)
+        support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT));
+    else
+        support = (op_tensor->op == GGML_OP_ADD);
     if (!support)
         return false;

+    ggmlqnn_dump_op_info(op_tensor);
     switch (op_tensor->op) {
         case GGML_OP_ADD:
         {
             if (!ggml_are_same_shape(src0, src1)) {
                 return false;
             }
+
             return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
         }
-
         case GGML_OP_MUL_MAT:
         {
             ggmlqnn_dump_op_info(op_tensor);

-            if (src1_rank != 2)
+            //TODO:3d&4d matrix mulmat on cDSP
+            if (src0_rank != 2)
                 return false;

-            return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
-
+            if (g_qnn_params.enable_q_mulmat)
+                return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type))
+                        && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
+            else
+                return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
         }
         default:
-            return false;
+            return ggmlqnn_same_types(ctx, op_tensor);
     }
 }

@@ -4597,8 +4655,6 @@ static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(
     if (nullptr == ctx->buffer) {
         GGMLQNN_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / (1 << 20));
         return nullptr;
-    } else {
-        GGMLQNN_LOG_DEBUG("%s: allocate %d MiB\n", __func__, size_aligned / (1 << 20));
     }

     return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size);
@@ -4729,10 +4785,16 @@ static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *
         *total = ggmlqnn_get_system_total_memory_in_bytes();
         *free = ggmlqnn_get_system_free_memory_in_bytes();
     } else if (QNN_BACKEND_NPU == ctx->device) {
-        size_t rpc_ion_memsize = ctx->instance->get_rpcmem_capacity();
-        size_t rpc_ion_usage = ctx->instance->get_rpcmem_usage();
-        GGMLQNN_LOG_DEBUG("rpc memsize %d", rpc_ion_memsize);
-        GGMLQNN_LOG_DEBUG("rpc usage %d", rpc_ion_usage);
+        size_t rpc_ion_memsize = 0;
+        size_t rpc_ion_usage = 0;
+        if (DIRECT_USE_CDSP != g_qnn_params.inference_approach) {
+            rpc_ion_memsize = ctx->instance->get_rpcmem_capacity();
+            rpc_ion_usage = ctx->instance->get_rpcmem_usage();
+        } else {
+            ggmlhexagon_probe_dspinfo(ctx, &rpc_ion_memsize);
+        }
+        GGMLQNN_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize);
+        GGMLQNN_LOG_DEBUG("rpc usage %d M", rpc_ion_usage);
         *total = rpc_ion_memsize * (1 << 20);
         *free = (rpc_ion_memsize - rpc_ion_usage) * (1 << 20);
     }
@@ -5078,9 +5140,12 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
         return g_qnn_mgr[device].backend;
     }

-    qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path);
-    if (nullptr == instance)
-        return nullptr;
+    //don't initialize QNN when inference approach is offload ggml op to Hexagon cDSP directly
+    if (DIRECT_USE_CDSP != g_qnn_params.inference_approach) {
+        qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path);
+        if (nullptr == instance)
+            return nullptr;
+    }

     ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_general;
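Taken together, the ggml-qnn.cpp changes above gate which ops may be offloaded to the Hexagon cDSP behind the two new config switches. The standalone sketch below condenses that gating policy for reference; cdsp_supports_op is a hypothetical helper name, the g_qnn_params fields are passed in as plain booleans, and the default branch is simplified to false (the real code defers to ggmlqnn_same_types), so treat it as illustrative rather than the actual implementation.

// Illustrative sketch only: condenses the op-gating policy from the
// ggmlhexagon_can_handle_op changes above. Helper name and parameters
// are hypothetical; the real default case calls ggmlqnn_same_types().
#include "ggml.h"

static bool cdsp_supports_op(const ggml_tensor * op,
                             bool enable_mulmat_cdsp,
                             bool enable_q_mulmat) {
    const ggml_tensor * src0 = op->src[0];
    const ggml_tensor * src1 = op->src[1];

    switch (op->op) {
        case GGML_OP_ADD:
            // element-wise add: operands must share a shape and all tensors must be FP32
            return src0 && src1
                && ggml_are_same_shape(src0, src1)
                && src0->type == GGML_TYPE_F32
                && src1->type == GGML_TYPE_F32
                && op->type   == GGML_TYPE_F32;

        case GGML_OP_MUL_MAT:
            // mulmat offload is opt-in via enable_mulmat_cdsp
            if (!enable_mulmat_cdsp || !src0 || !src1)
                return false;
            // only 2D matrices for now (no 3D/4D mulmat on cDSP yet)
            if (ggml_n_dims(src0) != 2)
                return false;
            // quantized src0 is accepted only when enable_q_mulmat is set
            return (src0->type == GGML_TYPE_F32
                        || (enable_q_mulmat && ggml_is_quantized(src0->type)))
                && src1->type == GGML_TYPE_F32
                && op->type   == GGML_TYPE_F32;

        default:
            // simplified: the actual backend defers to ggmlqnn_same_types() here
            return false;
    }
}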

scripts/ggml-qnn.cfg

Lines changed: 10 additions & 2 deletions
@@ -1,7 +1,7 @@
 [general]
 #0: QNN-CPU backend
 #1: QNN-GPU backend
-#2: QNN-NPU backend
+#2: QNN-NPU backend / Hexagon cDSP
 #3: default ggml backend
 qnn_backend = 2

@@ -22,8 +22,16 @@ dump_op_info = 0
 # 2: special approach through QNN: mapping entire ggml cgraph to QNN graph
 inference_approach = 1

-[npu]
+#inference approach through QNN
+[qnn]
 hvx_threads = 4
 vtcm_size_in_mb = 8
 enable_dlbc = 1
 precision_mode = "fp16"
+
+#inference approach through cDSP
+[cdsp]
+#enable/disable offload mulmat to cDSP
+enable_mulmat_cdsp = 0
+#enable/disable offload fp32 & all quantized type mulmat to cDSP
+enable_q_mulmat = 0
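As a usage illustration, a ggml-qnn.cfg that exercises the new direct-cDSP mulmat path would combine the options above roughly as follows; the values are illustrative, the keys and sections are the ones introduced in this commit, and inference_approach = 1 corresponds to DIRECT_USE_CDSP per the qnn_parameter comment.

[general]
qnn_backend = 2
inference_approach = 1

[cdsp]
enable_mulmat_cdsp = 1
enable_q_mulmat = 1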
