Skip to content

Commit 2c22cb1

Browse files
author
zhouwg
committed
ggml-qnn: enable RPC feature for QNN-NPU backend
1 parent 816ebb9 commit 2c22cb1

File tree

1 file changed

+163
-20
lines changed

1 file changed

+163
-20
lines changed

ggml/src/ggml-qnn/ggml-qnn.cpp

Lines changed: 163 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1885,7 +1885,7 @@ class qnn_instance {
18851885
return 0;
18861886
}
18871887

1888-
std::string &get_qnn_graph_name() { return _graph_name; }
1888+
std::string & get_qnn_graph_name() { return _graph_name; }
18891889

18901890
bool is_rpcmem_initialized() {
18911891
return _rpcmem_initialized;
@@ -1906,8 +1906,10 @@ class qnn_instance {
19061906
void unregister_rpcmem(Qnn_MemHandle_t mem_handle);
19071907

19081908
void * alloc_rpcmem(size_t bytes, size_t alignment);
1909+
void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle);
19091910

19101911
void free_rpcmem(void * buf);
1912+
void free_rpcmem();
19111913

19121914
bool is_rpcmem_allocated(void * buf);
19131915

@@ -1975,15 +1977,16 @@ class qnn_instance {
19751977
QNN_INTERFACE_VER_TYPE _qnn_raw_interface;
19761978
QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface;
19771979

1978-
std::unordered_set<Qnn_MemHandle_t> _qnn_mem_set;
1980+
std::unordered_map<void *, Qnn_MemHandle_t> _qnn_mem_set;
19791981
std::unordered_map<void *, Qnn_MemHandle_t> _qnn_rpc_buffer_to_handles;
19801982

1983+
19811984
static std::mutex _init_mutex;
19821985
static std::unordered_map<BackendIdType, void *> _loaded_lib_handle;
19831986
static std::unordered_map<std::string, BackendIdType> _lib_path_to_backend_id;
19841987
static std::unordered_map<BackendIdType, const QnnInterface_t *> _loaded_backend;
19851988

1986-
void *_rpc_lib_handle = nullptr;
1989+
void * _rpc_lib_handle = nullptr;
19871990
std::atomic_bool _rpcmem_initialized{false};
19881991
pfn_rpc_mem_alloc _pfn_rpc_mem_alloc;
19891992
pfn_rpc_mem_free _pfn_rpc_mem_free;
@@ -2032,11 +2035,30 @@ void qnn_instance::free_rpcmem(void * buf) {
20322035
} else if (0 == _rpcmem_store_map.count(buf)) {
20332036
GGMLQNN_LOG_WARN("no allocated tensor\n");
20342037
} else {
2038+
GGMLQNN_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]);
20352039
_pfn_rpc_mem_free(_rpcmem_store_map[buf]);
20362040
_rpcmem_store_map.erase(buf);
20372041
}
20382042
}
20392043

2044+
void qnn_instance::free_rpcmem() {
2045+
Qnn_ErrorHandle_t error = QNN_SUCCESS;
2046+
2047+
if (_rpcmem_store_map.empty()) {
2048+
GGMLQNN_LOG_WARN("no rpcmem allocated\n");
2049+
return;
2050+
}
2051+
2052+
for (std::unordered_map<void *, void *>::iterator it = _rpcmem_store_map.begin();
2053+
it != _qnn_mem_set.end();
2054+
it++) {
2055+
void * rpcbuffer = it->second;
2056+
GGMLQNN_LOG_DEBUG("free rpc buffer %p", rpcbuffer);
2057+
_pfn_rpc_mem_free(rpcbuffer);
2058+
}
2059+
_rpcmem_store_map.clear();
2060+
}
2061+
20402062
int32_t qnn_instance::rpcmem_to_fd(void * buf) {
20412063
int32_t mem_fd = -1;
20422064
if (!is_rpcmem_initialized()) {
@@ -2059,10 +2081,6 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) {
20592081
return 2;
20602082
}
20612083

2062-
if (is_rpcmem_allocated(p_data)) {
2063-
GGMLQNN_LOG_WARN("rpc memory already allocated\n");
2064-
//return 3;
2065-
}
20662084
if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
20672085
GGMLQNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
20682086
return 4;
@@ -2094,7 +2112,7 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) {
20942112
GGMLQNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
20952113
}
20962114
QNN_VER_PTR(*p_tensor)->memHandle = handle;
2097-
_qnn_mem_set.insert(handle);
2115+
_qnn_mem_set.insert((std::pair<void*, Qnn_MemHandle_t>(p_data, handle)));
20982116

20992117
return 0;
21002118
}
@@ -2136,17 +2154,36 @@ Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t ran
21362154
return handle;
21372155
}
21382156

2157+
// Reverse lookup: return the rpc buffer pointer that was registered under the
// given QNN memory handle (see register_rpcmem, which fills _qnn_mem_set with
// {buffer -> handle} pairs), or nullptr if the handle is unknown.
void * qnn_instance::get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) {
    // BUGFIX: the original declared a local `Qnn_MemHandle_t mem_handle = it->second;`
    // inside the loop, shadowing the parameter; the subsequent comparison
    // `it->second == mem_handle` then compared the element against itself and was
    // always true, so the first map entry was returned regardless of the argument.
    for (auto it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); ++it) {
        if (it->second == mem_handle) {
            return it->first;
        }
    }
    GGMLQNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle);
    return nullptr;
}
2169+
21392170
void qnn_instance::unregister_rpcmem() {
21402171
Qnn_ErrorHandle_t error = QNN_SUCCESS;
21412172

21422173
if (_qnn_mem_set.empty()) {
21432174
GGMLQNN_LOG_WARN("no rpcmem registered\n");
21442175
}
21452176

2146-
for (auto &mem_handle : _qnn_mem_set) {
2177+
for (std::unordered_map<void *, Qnn_MemHandle_t>::iterator it = _qnn_mem_set.begin();
2178+
it != _qnn_mem_set.end();
2179+
it++) {
2180+
Qnn_MemHandle_t mem_handle = it->second;
21472181
error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1);
21482182
if (error != QNN_SUCCESS) {
2149-
GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error));
2183+
GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n",
2184+
QNN_GET_ERROR_CODE(error));
2185+
} else {
2186+
GGMLQNN_LOG_DEBUG("unregister shared memory ok");
21502187
}
21512188
}
21522189
_qnn_mem_set.clear();
@@ -2158,14 +2195,14 @@ void qnn_instance::unregister_rpcmem(Qnn_MemHandle_t mem_handle) {
21582195
GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error));
21592196
}
21602197

2161-
auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(),
2198+
auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(),
21622199
[mem_handle](const auto &kv) { return kv.second == mem_handle; });
2163-
if (it == _qnn_rpc_buffer_to_handles.end()) {
2200+
if (it == _qnn_mem_set.end()) {
21642201
GGMLQNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle);
21652202
return;
21662203
}
21672204

2168-
_qnn_rpc_buffer_to_handles.erase(it);
2205+
_qnn_mem_set.erase(it);
21692206
}
21702207

21712208
bool qnn_instance::is_rpcmem_allocated(void * buf) {
@@ -2562,7 +2599,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
25622599
temp_context_config.empty() ? nullptr : temp_context_config.data(),
25632600
&_qnn_context_handle);
25642601
if (nullptr == _qnn_context_handle) {
2565-
GGMLQNN_LOG_WARN("why failed to initialize qnn context\n");
2602+
GGMLQNN_LOG_WARN("why failed to initialize qnn context, error:%s\n", strerror(errno));
25662603
return 8;
25672604
} else {
25682605
GGMLQNN_LOG_DEBUG("initialize qnn context successfully\n");
@@ -2636,9 +2673,13 @@ int qnn_instance::qnn_finalize() {
26362673
int ret_status = 0;
26372674
Qnn_ErrorHandle_t error = QNN_SUCCESS;
26382675

2676+
GGMLQNN_LOG_DEBUG("enter %s\n", __func__);
26392677
//FIXME:should be removed in the future
26402678
reset_idx();
26412679

2680+
free_rpcmem();
2681+
unregister_rpcmem();
2682+
26422683
if (nullptr != _pfn_rpc_mem_deinit)
26432684
_pfn_rpc_mem_deinit();
26442685

@@ -2700,6 +2741,7 @@ int qnn_instance::qnn_finalize() {
27002741
unload_backend();
27012742

27022743
unload_system();
2744+
GGMLQNN_LOG_DEBUG("leave %s\n", __func__);
27032745

27042746
return ret_status;
27052747
}
@@ -2954,6 +2996,10 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) {
29542996
const ggml_tensor * src1 = op->src[1];
29552997
ggml_tensor * dst = op;
29562998

2999+
uint8_t * qnn_rpcbuffer_0 = nullptr;
3000+
uint8_t * qnn_rpcbuffer_1 = nullptr;
3001+
uint8_t * qnn_rpcbuffer_2 = nullptr;
3002+
29573003
GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst);
29583004

29593005
instance = ctx->instance;
@@ -3067,6 +3113,19 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) {
30673113
GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error);
30683114
return;
30693115
}
3116+
3117+
if (ctx->device == QNN_BACKEND_NPU) {
3118+
QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
3119+
QNN_VER_PTR(*tensor_0)->clientBuf = {.data=nullptr, .dataSize=0};
3120+
3121+
QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
3122+
QNN_VER_PTR(*tensor_1)->clientBuf = {.data=nullptr, .dataSize=0};
3123+
3124+
QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
3125+
QNN_VER_PTR(*tensor_2)->clientBuf = {.data=nullptr, .dataSize=0};
3126+
}
3127+
3128+
30703129
error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0);
30713130
if (QNN_SUCCESS != error) {
30723131
GGMLQNN_LOG_INFO("error = %d\n", error);
@@ -3080,9 +3139,43 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) {
30803139
GGMLQNN_LOG_INFO("error = %d\n", error);
30813140
}
30823141

3083-
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
3084-
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
3085-
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};
3142+
if (ctx->device != QNN_BACKEND_NPU) {
3143+
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
3144+
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
3145+
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};
3146+
} else {
3147+
qnn_rpcbuffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(
3148+
ggml_nbytes(src0), 4));
3149+
if (nullptr == qnn_rpcbuffer_0) {
3150+
GGMLQNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
3151+
goto failure;
3152+
} else {
3153+
GGMLQNN_LOG_DEBUG("alloc rpcmem %p successfully\n", qnn_rpcbuffer_0);
3154+
}
3155+
instance->register_rpcmem(qnn_rpcbuffer_0, tensor_0);
3156+
memcpy(qnn_rpcbuffer_0, src0->data, ggml_nbytes(src0));
3157+
3158+
qnn_rpcbuffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
3159+
ggml_nbytes(src1), 4));
3160+
if (nullptr == qnn_rpcbuffer_1) {
3161+
GGMLQNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
3162+
goto failure;
3163+
} else {
3164+
GGMLQNN_LOG_DEBUG("alloc rpcmem %p successfully\n", qnn_rpcbuffer_1);
3165+
}
3166+
instance->register_rpcmem(qnn_rpcbuffer_1, tensor_1);
3167+
memcpy(qnn_rpcbuffer_1, src1->data, ggml_nbytes(src1));
3168+
3169+
qnn_rpcbuffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
3170+
ggml_nbytes(dst), 4));
3171+
if (nullptr == qnn_rpcbuffer_2) {
3172+
GGMLQNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
3173+
goto failure;
3174+
} else {
3175+
GGMLQNN_LOG_DEBUG("alloc rpcmem %p successfully\n", qnn_rpcbuffer_2);
3176+
}
3177+
instance->register_rpcmem(qnn_rpcbuffer_2, tensor_2);
3178+
}
30863179

30873180
Qnn_Tensor_t tensor_inputs[] = {
30883181
*tensor_0,
@@ -3119,6 +3212,15 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) {
31193212
if (QNN_SUCCESS != error) {
31203213
GGMLQNN_LOG_INFO("error = %d\n", error);
31213214
}
3215+
3216+
if (ctx->device == QNN_BACKEND_NPU) {
3217+
uint8_t * qnn_rpcbuffer = static_cast<uint8_t *>(instance->get_rpcmem_from_memhandle(
3218+
QNN_VER_PTR(*tensor_2)->memHandle));
3219+
GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer);
3220+
if (nullptr != qnn_rpcbuffer)
3221+
memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst));
3222+
}
3223+
31223224
auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
31233225
instance->_qnn_graph_map[map_entry] = graph_item;
31243226
} else {
@@ -3138,9 +3240,21 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) {
31383240
QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst);
31393241
QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
31403242

3141-
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
3142-
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
3143-
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};
3243+
if (ctx->device != QNN_BACKEND_NPU) {
3244+
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
3245+
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
3246+
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};
3247+
} else {
3248+
uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(instance->get_rpcmem_from_memhandle(
3249+
QNN_VER_PTR(*tensor_0)->memHandle));
3250+
if (nullptr != qnn_buffer_0)
3251+
memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
3252+
3253+
uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(instance->get_rpcmem_from_memhandle(
3254+
QNN_VER_PTR(*tensor_1)->memHandle));
3255+
if (nullptr != qnn_buffer_1)
3256+
memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
3257+
}
31443258

31453259
Qnn_Tensor_t tensor_inputs[] = {
31463260
*tensor_0,
@@ -3156,6 +3270,35 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) {
31563270
if (QNN_SUCCESS != error) {
31573271
GGMLQNN_LOG_INFO("error = %d\n", error);
31583272
}
3273+
3274+
if (ctx->device == QNN_BACKEND_NPU) {
3275+
uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(instance->get_rpcmem_from_memhandle(
3276+
QNN_VER_PTR(*tensor_2)->memHandle));
3277+
if (nullptr != qnn_buffer_2)
3278+
memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
3279+
}
3280+
}
3281+
3282+
failure:
3283+
if (QNN_SUCCESS != error) {
3284+
GGMLQNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0));
3285+
GGMLQNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1));
3286+
GGMLQNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2));
3287+
GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
3288+
" x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
3289+
src0->name, src0->type, ggml_type_name(src0->type),
3290+
src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
3291+
src0->nb[1], src0->nb[2]);
3292+
GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
3293+
" x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
3294+
src1->name, src1->type, ggml_type_name(src1->type),
3295+
src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
3296+
src1->nb[1], src1->nb[2]);
3297+
GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
3298+
" x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
3299+
dst->name, dst->type, ggml_type_name(dst->type),
3300+
dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
3301+
dst->nb[1], dst->nb[2]);
31593302
}
31603303

31613304
//avoid memory leak in func free_qnn_tensor

0 commit comments

Comments
 (0)