|
14 | 14 | * section-6 QNN helper function |
15 | 15 | * section-7 ggml-qnn backend helper function / class |
16 | 16 | * section-8 implementation of ggml-qnn backend according to ggml's backend subsystem |
17 | | - * section-9 implementation of offload ggml op to QNN backend |
18 | | - * section-10 illustrate why the second approach is actual an fake at the moment |
 | 17 | + * section-9 implementation of the general approach (the first tech approach) |
 | 18 | + * section-10 implementation of the second tech approach: mapping the entire ggml cgraph to a single QNN graph |
19 | 19 | * |
20 | 20 | * currently provide following ggml op' QNN backend implementation: |
21 | | - * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise |
22 | | - * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise |
23 | | - * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex ggml ops accordingly |
| 21 | + * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV: |
| 22 | + * this is a simple skeleton, can expand other ggml ops according to expertise |
| 23 | + * - GGML_OP_LOG/GGML_OP_SQRT: |
| 24 | + * this is a simple skeleton, can expand other ggml ops according to expertise |
| 25 | + * - GGML_OP_MUL_MAT: |
| 26 | + * this is a complicated skeleton, can expand other complex ggml ops accordingly |
24 | 27 | * |
25 | 28 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
26 | 29 | * of this software and associated documentation files (the "Software"), to |
|
80 | 83 | #include <unordered_set> |
81 | 84 | #include <utility> |
82 | 85 | #include <future> |
83 | | -#include <chrono> |
84 | 86 | #if (defined __ANDROID__) || (defined ANDROID) |
85 | 87 | #include "android/log.h" |
86 | 88 | #endif |
@@ -186,7 +188,6 @@ static void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst |
186 | 188 |
|
187 | 189 | #define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) |
188 | 190 | #define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1) |
189 | | -#define TENSOR_DUMP(tensor) ggmlqnn_tensor_dump(tensor, #tensor) |
190 | 191 | #define GQCGT ggmlqnn_create_general_tensor |
191 | 192 | #define QNN_VER_PTR(x) (&((x).v1)) |
192 | 193 | #define RPCMEM_DEFAULT_FLAGS 1 |
@@ -260,7 +261,7 @@ using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>; |
260 | 261 | using qnn_singlenode_res_t = std::tuple<Qnn_GraphHandle_t, qnn_ptensors_t>; |
261 | 262 |
|
262 | 263 | //QNN resource management for the second technical approach(mapping the entire cgraph to a single QNN graph) |
263 | | -using qnn_tensors_t = std::vector< Qnn_Tensor_t * >; |
| 264 | +using qnn_tensors_t = std::vector< Qnn_Tensor_t >; |
264 | 265 | using qnn_cgraph_node_t = std::tuple<std::string, Qnn_GraphHandle_t>; |
265 | 266 | using qnn_cgraph_nodes_t = std::vector<qnn_cgraph_node_t>; |
266 | 267 | using qnn_multinode_res_t = std::tuple<Qnn_GraphHandle_t, qnn_cgraph_nodes_t, qnn_ptensors_t, qnn_tensors_t, qnn_tensors_t>; |
@@ -325,11 +326,6 @@ struct ggml_backend_qnn_context { |
325 | 326 | size_t work_size; |
326 | 327 | size_t desired_size; |
327 | 328 | int n_threads; |
328 | | - |
329 | | -#if 1//ndef NDEBUG |
330 | | - std::atomic_uint32_t supported_op_count = 0; |
331 | | - std::atomic_uint32_t unsupported_op_count = 0; |
332 | | -#endif |
333 | 329 | }; |
334 | 330 |
|
335 | 331 | struct qnn_op_caps { |
@@ -370,8 +366,6 @@ static struct qnn_parameter g_qnn_params = { |
370 | 366 | #if defined(__ANDROID__) |
371 | 367 | //Android command line program |
372 | 368 | .qnn_runtimelib_path = "/data/local/tmp/", |
373 | | -//Android KanTV standard APP |
374 | | -// .qnn_runtimelib_path = "/data/data/com.cdeos.kantv/qnnlib/", |
375 | 369 | #elif defined(__linux__) |
376 | 370 | .qnn_runtimelib_path = "/tmp/", |
377 | 371 | #elif defined(_WIN32) |
@@ -1066,46 +1060,6 @@ static void ggmlqnn_load_cfg() { |
1066 | 1060 | } |
1067 | 1061 | } |
1068 | 1062 |
|
1069 | | -static void ggmlqnn_tensor_dump_elements(const ggml_tensor * tensor) { |
1070 | | - float value = 0; |
1071 | | - std::ostringstream tmposs; |
1072 | | - if (tensor->type == GGML_TYPE_F32) { |
1073 | | - for (int h = 0; h < tensor->ne[3]; h++) { |
1074 | | - for (int i = 0; i < tensor->ne[2]; i++) { |
1075 | | - for (int j = 0; j < tensor->ne[1]; j++) { |
1076 | | - for (int k = 0; k < tensor->ne[0]; k++) { |
1077 | | - value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + |
1078 | | - j * tensor->ne[0] + k]; |
1079 | | - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value |
1080 | | - << " "; |
1081 | | - } |
1082 | | - if (strlen(tmposs.str().c_str()) <= (4096 - 96)) { |
1083 | | - GGMLQNN_LOG_DEBUG("%s\n", tmposs.str().c_str()); |
1084 | | - } |
1085 | | - tmposs.clear(); |
1086 | | - tmposs.str(""); |
1087 | | - } |
1088 | | - } |
1089 | | - } |
1090 | | - } |
1091 | | - |
1092 | | - GGMLQNN_LOG_DEBUG("\n"); |
1093 | | -} |
1094 | | - |
1095 | | - |
1096 | | -static void ggmlqnn_tensor_dump(const ggml_tensor * tensor, const char * name) { |
1097 | | - GGMLQNN_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name); |
1098 | | - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n", |
1099 | | - name, |
1100 | | - tensor->type, ggml_type_name(tensor->type), |
1101 | | - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], |
1102 | | - tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]); |
1103 | | - ggmlqnn_tensor_dump_elements(tensor); |
1104 | | - |
1105 | | - GGMLQNN_LOG_DEBUG("\n"); |
1106 | | -} |
1107 | | - |
1108 | | - |
1109 | 1063 | // ================================================================================================= |
1110 | 1064 | // section-6: QNN helper function |
1111 | 1065 | // ================================================================================================= |
@@ -2698,7 +2652,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { |
2698 | 2652 | std::filesystem::path full_path(std::string(g_qnn_params.qnn_runtimelib_path) + "libcdsprpc.so"); |
2699 | 2653 | full_path /= std::filesystem::path("libcdsprpc.so").filename(); |
2700 | 2654 | _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL); |
2701 | | - if (!_rpc_lib_handle) { |
| 2655 | + if (nullptr == _rpc_lib_handle) { |
2702 | 2656 | GGMLQNN_LOG_WARN("failed to load %s\n", full_path.c_str()); |
2703 | 2657 | _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); |
2704 | 2658 | } |
@@ -5083,9 +5037,56 @@ void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { |
5083 | 5037 | // ================================================================================================= |
5084 | 5038 | // details: https://github.com/ggml-org/llama.cpp/pull/12326#issuecomment-2712838649 |
5085 | 5039 | // ref: https://github.com/kantv-ai/kantv/blob/kantv-poc-with-qnn/core/ggml/jni/Inception_v3.cpp#L20634 |
5086 | | -static enum ggml_status |
5087 | | -ggml_backend_qnn_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph *cgraph) { |
5088 | | - enum ggml_status ggml_result = GGML_STATUS_SUCCESS; |
| 5040 | +static enum ggml_status ggml_backend_qnn_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph * cgraph) { |
| 5041 | + enum ggml_status ggml_result = GGML_STATUS_SUCCESS; |
| 5042 | + Qnn_ErrorHandle_t qnn_error = QNN_SUCCESS; |
| 5043 | + qnn_perf op_perf = qnn_perf("ggml_backend_qnn_graph_compute_special"); |
| 5044 | + qnn_instance * instance = nullptr; |
| 5045 | + Qnn_GraphHandle_t graph_handle = nullptr; |
| 5046 | + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; |
| 5047 | + instance = ctx->instance; |
| 5048 | + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; |
| 5049 | + op_perf.start(); |
| 5050 | + |
 | 5051 | +    //at this point we have the entire ggml cgraph, or a ggml cgraph which contains multiple nodes |
| 5052 | + GGMLQNN_LOG_DEBUG("qnn device %d(%s)", ctx->device, ggml_backend_qnn_get_devname(ctx->device)); |
| 5053 | + GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); |
| 5054 | + int num_nodes = std::min(5, cgraph->n_nodes); |
| 5055 | + //for (int i = 0; i < cgraph->n_nodes; i++) { |
| 5056 | + for (int i = 0; i < num_nodes; i++) { |
| 5057 | + ggml_tensor * node = cgraph->nodes[i]; |
| 5058 | + GGMLQNN_LOG_DEBUG("%s: op %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); |
| 5059 | + } |
| 5060 | + |
| 5061 | + //now we'll offload the ggml cgraph to a single QNN graph |
| 5062 | + std::string graph_name; |
| 5063 | + ggmlqnn_get_graphkey_from_cgraph(cgraph, graph_name); |
| 5064 | + if (graph_name == "") |
| 5065 | + return GGML_STATUS_SUCCESS; |
| 5066 | + if (ctx->qnn_multinode_graph_map.find(graph_name) != ctx->qnn_multinode_graph_map.end()) { |
| 5067 | + GGMLQNN_LOG_DEBUG("graph name %s already create", graph_name.c_str()); |
| 5068 | + //retrieve computational resource from cached QNN graph |
| 5069 | + qnn_multinode_res_t &graph_res = ctx->qnn_multinode_graph_map[graph_name]; |
| 5070 | + graph_handle = std::get<0>(graph_res); |
| 5071 | + } else { |
| 5072 | + //create QNN graph |
| 5073 | + GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); |
| 5074 | + qnn_error = instance->init_qnn_graph(graph_name, static_cast<QNNBackend>(ctx->device), 8, 4); |
| 5075 | + if (QNN_SUCCESS != qnn_error) { |
| 5076 | + GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d(%s)\n", graph_name.c_str(), qnn_error, |
| 5077 | + ggmlqnn_get_qnnerror_string(qnn_error)); |
| 5078 | + return ggml_result; |
| 5079 | + } |
| 5080 | + graph_handle = instance->get_qnn_graph_handle(); |
| 5081 | + //TBD: compose a single opcfg QNN graph |
| 5082 | + |
| 5083 | + //finalize QNN graph |
| 5084 | + CHECK_QNN_API(qnn_error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); |
| 5085 | + |
| 5086 | + //TBD: cache QNN graph |
| 5087 | + } |
| 5088 | + //TBD: exec QNN graph |
| 5089 | + GGMLQNN_LOG_DEBUG("the second inference approach \"mapping cgraph to QNN graph\" will be seen in the future"); |
5089 | 5090 |
|
5090 | 5091 | return ggml_result; |
5091 | 5092 | } |
0 commit comments