 #include <unordered_set>
 #include <utility>
 #include <stdatomic.h>
+#include <future>
 #if (defined __ANDROID__) || (defined ANDROID)
 #include "android/log.h"
 #endif
@@ -815,6 +816,11 @@ struct ggml_backend_qnn_context {
     QNN_INTERFACE_VER_TYPE raw_interface;
     QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface;
     struct qcom_socinfo socinfo;
+
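+    // scratch buffer, worker futures and thread count used by ggml_qnn_mul_mat() to
+    // dequantize non-F32 weights (src0) to float before they are handed to QNN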
+    std::unique_ptr<char[]> work_data;
+    std::vector<std::future<void>> tasks;
+    size_t work_size = 0;
+    int n_threads    = GGML_DEFAULT_N_THREADS;
 };

 // the following helper funcs are used to ensure every QNN tensor name is unique
@@ -2780,7 +2786,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
     const uint32_t src1_rank = ggml_get_tensor_rank(src1);

     if (tensor->op == GGML_OP_ADD) {
-        // dump_tensors_info(tensor);
+        // dump_op_info(tensor);
         if (!ggml_are_same_shape(src0, src1)) {
             return false;
         }
@@ -2791,6 +2797,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
     }

     if (tensor->op == GGML_OP_MUL_MAT) {
+        dump_op_info(tensor);
         if (src0_rank != src1_rank) // make QNN SDK happy
             return false;
         if (src0_rank < 2) // QNN's limitation, make QNN SDK happy
@@ -2800,17 +2807,18 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
         if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy
             return false;

-        // TODO: support more data type in func ggml_qnn_mul_mat(...)
-        // src0: q4_0, q6_k, ...
-        // src1: f32
-        // dst : f32
-        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
-                && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
-                && (src0->type == src1->type) && (src0->type == tensor->type);
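+        // 2D mul_mat accepts F32 weights plus Q8_0/Q4_0/Q6_K weights (dequantized to F32 in
+        // ggml_qnn_mul_mat); 3D/4D mul_mat is currently restricted to pure F32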
+        if (2 != src0_rank) { // TODO: quantize src0 for 3D & 4D matrices
+            return (src0->type == GGML_TYPE_F32)
+                   && (src1->type == GGML_TYPE_F32)
+                   && (tensor->type == GGML_TYPE_F32);
+        } else {
+            return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q6_K)
+                   && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32);
+        }
     }

     if (tensor->op == GGML_OP_MUL) {
-        // dump_tensors_info(tensor);
+        // dump_op_info(tensor);
         if ((src0_rank != 2) || (src1_rank != 2)) // TODO: 3D and 4D matrices
             return false;
         return (src0->type == GGML_TYPE_F32)
@@ -2870,7 +2878,9 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) {
         p_tensor1 = ggml_qnn_create_compute_tensor(src1);
         p_tensor2 = ggml_qnn_create_compute_tensor(dst);
     }
+#if GGMLQNN_PRINT_OP_ADD_LOG
     print_tensors_info(__func__, ctx, src0, src1, dst);
+#endif

     // ensure QNN tensor has correct tensor type
     QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE;
@@ -2966,7 +2976,6 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) {

         auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors);
         instance->_qnn_graph_map[graph_name] = graph_item;
-
     } else {
         Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
         Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
@@ -3039,22 +3048,31 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) {
     QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions;
     QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions;
     QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions;
+
+#if GGMLQNN_PRINT_OP_ADD_LOG
     op_perf.info();
+#endif
 }

 /*
- * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated
- * than ggml_qnn_general_node.
- * matrix transpose and type trait are required for offload mulmat to QNN backend,
- * so it's a standalone function. accordingly, this is another typical skeleton for offload other
- * ggml ops to QNN backend
+ * @brief performs matrix multiplication with FP32 or quantized weights and floating-point inputs
+ *        using the QNN backend. this function multiplies the input tensor `src1` by the weight
+ *        tensor `src0`, handling transposition and dequantization as needed, and stores the
+ *        result in the destination tensor `dst`.
 *
- * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, should focus on MUL_MAT.
+ * @param backend the ggml backend; its context is obtained through
+ *                (ggml_backend_qnn_context *)backend->context and used for the QNN backend operations.
+ * @param op the destination tensor where the result of the matrix multiplication will be stored.
 *
- * have three kinds of MUL_MAT to compute:
- * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend
- * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1
- * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, src0 -> f32 in src0', then src0' * src1
+ * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but considerably more
+ *       complicated, so it is kept as a standalone function; it also serves as another typical
+ *       skeleton for offloading other ggml ops to the QNN backend. MUL_MAT takes most of the
+ *       compute time (about 95%), so speeding up llama inference means focusing on this function.
+ *       there are three kinds of MUL_MAT to compute:
+ *       mul_mat_f32:     both src0 and src1 are F32, naturally handled by the QNN backend
+ *       mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1
+ *       mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, Q6_K, ...) and src1 is F32,
+ *                        src0 -> f32 in src0', then src0' * src1
 */
 static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -3077,10 +3095,72 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     op_perf.start();

-    uint32_t src0_rank = ggml_get_tensor_rank(src0);
-    uint32_t src1_rank = ggml_get_tensor_rank(src1);
+    const enum ggml_type type = src0->type;
+    const uint32_t src0_rank  = ggml_get_tensor_rank(src0);
+    const uint32_t src1_rank  = ggml_get_tensor_rank(src1);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
     GGML_ASSERT(src0_rank == src1_rank);
-    GGML_ASSERT(src0_rank >= 2); // QNN SDK's limitation
+    GGML_ASSERT(src0_rank >= 2); // QNN SDK's limitation, make QNN SDK happy
+
+    // broadcast factors
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
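+    // the scratch buffer must be able to hold all of src0 dequantized to F32
+    // (ne03 * ne02 * ne01 * ne00 floats); it is grown lazily and reused across calls
+    // through ctx->work_data / ctx->work_size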
+    const int64_t ne_plane = ne01 * ne00;
+    const size_t desired_size = ((GGML_TYPE_F32 == type) ? 0 : ne03 * ne02 * ne_plane * sizeof(float));
+    if (ctx->work_size < desired_size) {
+        ctx->work_data.reset(new char[desired_size]);
+        ctx->work_size = desired_size;
+    }
+    void * wdata = ctx->work_data.get();
+    // convert src0 to float
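+    // for quantized weights (e.g. Q8_0 / Q4_0 / Q6_K) each (i02, i03) plane of src0 is dequantized
+    // into the scratch buffer `wdata`; the rows of a plane are split across up to ctx->n_threads
+    // std::async tasks, and the first chunk of rows is processed on the calling thread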
+    if (type != GGML_TYPE_F32) {
+        const auto * type_traits = ggml_get_type_traits(type);
+        ggml_to_float_t const to_float = type_traits->to_float;
+
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03;
+                float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane;
+
+                const int min_cols_per_thread = 4096;
+                const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1);
+                const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1);
+                for (int i = 1; i < n_threads; i++) {
+                    const int64_t start = i * ne01 / n_threads;
+                    const int64_t end   = (i + 1) * ne01 / n_threads;
+                    if (start < end) {
+                        ctx->tasks.push_back(std::async(std::launch::async, [=]() {
+                            for (int64_t i01 = start; i01 < end; i01++) {
+                                to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00);
+                            }
+                        }));
+                    }
+                }
+                {
+                    // reuse the current thread for the first task
+                    const int64_t start = 0;
+                    const int64_t end   = ne01 / n_threads;
+                    for (int64_t i01 = start; i01 < end; i01++) {
+                        to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00);
+                    }
+                }
+            }
+        }
+
+        // wait for all tasks to finish
+        for (auto & task : ctx->tasks) {
+            task.get();
+        }
+        ctx->tasks.clear();
+    }

     std::string graph_name;
     get_graph_key_from_op(op, graph_name);
@@ -3133,9 +3213,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {

        2. QNN's MatMul can only support input tensors with rank >= 2

-       there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose operation when offloading mulmat to QNN backend.
+       all in all, there is a gap between ggml's mulmat and QNN's mulmat; we need to perform a
+       transpose operation when offloading mulmat to the QNN backend. this concise implementation
+       handles the transpose in func ggml_qnn_create_general_tensor()
     */
-
        // step-1: create qnn graph
        error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(),
                                              graph_name.c_str(), nullptr, &graph_handle);
@@ -3158,8 +3239,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0));
         CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1));
         CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2));
-
-        QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
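+        // bind the QNN input tensor for src0 to the dequantized scratch buffer when src0 is
+        // quantized, otherwise directly to the original src0 data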
+        if (type != GGML_TYPE_F32) {
+            QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast<uint32_t>(desired_size)};
+        } else {
+            QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+        }
         QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
         QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

@@ -3170,14 +3254,14 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         // step-5: compose qnn graph: add mat_mul node
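+        // QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1 = 1 transposes the second MatMul input inside QNN,
+        // bridging the layout gap between ggml mulmat and QNN MatMul described in the note above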
         Qnn_Param_t out_0_params[] = {
             {QNN_PARAMTYPE_SCALAR,
-                    QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1,
-                    .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}
+              QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1,
+              .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}
             }
         };

         Qnn_Tensor_t out_0_inputs[]  = {*p_tensor0, *p_tensor1};
         Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose};
-#if 0
+#if 0 // kept here to make the code easier to understand; can be removed in the future
         Qnn_OpConfig_t out_0 = {
             QNN_OPCONFIG_VERSION_1, .v1 =
             {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
@@ -3202,7 +3286,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         };
         Qnn_Tensor_t out_trans1_0_inputs[]  = {*p_tensor2_transpose};
         Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2};
-#if 0
+#if 0 // kept here to make the code easier to understand; can be removed in the future
         Qnn_OpConfig_t out_trans1_0 = {
             QNN_OPCONFIG_VERSION_1,
             .v1 = {"ggmlqnn_mulmat_transpose_opconfig",
@@ -3216,7 +3300,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         };
 #else
         Qnn_OpConfig_t out_trans1_0 = create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE,
-                                                out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1);
+                                                       out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1);
 #endif
         CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_trans1_0));

@@ -3225,9 +3309,9 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         Qnn_Tensor_t input_tensors_0[]  = {*p_tensor0, *p_tensor1};
         Qnn_Tensor_t output_tensors_0[] = {*p_tensor2};
         CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
-                                    input_tensors_0, 2,
-                                    output_tensors_0, 1,
-                                    nullptr, nullptr));
+                                                            input_tensors_0, 2,
+                                                            output_tensors_0, 1,
+                                                            nullptr, nullptr));

         qnn_tensors_t ggml_op_mulmat_tensors;
         ggml_op_mulmat_tensors.reserve(5);
@@ -3239,7 +3323,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors);
         instance->_qnn_graph_map[graph_name] = graph_item;
     } else {
-        QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
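+        // same choice as in the graph-build path: point the src0 QNN tensor at the dequantized
+        // scratch buffer for quantized weights, or at src0->data for F32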
+        if (type != GGML_TYPE_F32) {
+            QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast<uint32_t>(desired_size)};
+        } else {
+            QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+        }
         QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
         QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

@@ -3250,13 +3338,13 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         Qnn_Tensor_t tensor_outputs[] = {
             *p_tensor2
         };
-        // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through
-        // QNN SDK, details could be found at
-        // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph
+        // this is the second technical approach (another pipeline) for utilizing the Hexagon NPU
+        // maximally through the QNN SDK; details can be found at
+        // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360
         CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
-                                    tensor_inputs, 2,
-                                    tensor_outputs, 1,
-                                    nullptr, nullptr));
+                                                            tensor_inputs, 2,
+                                                            tensor_outputs, 1,
+                                                            nullptr, nullptr));
     }

     // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor