@@ -200,71 +200,25 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
200200
201201 auto graph_item = std::make_tuple (graph_handle, ggml_op_add_tensors);
202202 instance->_qnn_graph_map [graph_name] = graph_item;
203- } else {
204- Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
205- Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
206- Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
207-
208- src0_qnn_type = ggmlqnn_datatype_from_ggml_datatype (src0->type );
209- src1_qnn_type = ggmlqnn_datatype_from_ggml_datatype (src1->type );
210- dst_qnn_type = ggmlqnn_datatype_from_ggml_datatype (dst->type );
211-
212- uint32_t dimensions_input_0[] = {(uint32_t ) src0->ne [0 ], (uint32_t ) src0->ne [1 ],
213- (uint32_t ) src0->ne [2 ], (uint32_t ) src0->ne [3 ]};
214- uint32_t dimensions_input_1[] = {(uint32_t ) src1->ne [0 ], (uint32_t ) src1->ne [1 ],
215- (uint32_t ) src1->ne [2 ], (uint32_t ) src1->ne [3 ]};
216- uint32_t dimensions_output[] = {(uint32_t ) dst->ne [0 ], (uint32_t ) dst->ne [1 ],
217- (uint32_t ) dst->ne [2 ], (uint32_t ) dst->ne [3 ]};
218-
219- QNN_VER_PTR (*p_tensor0)->dimensions = dimensions_input_0;
220- QNN_VER_PTR (*p_tensor0)->rank = ggml_n_dims (src0);
221- QNN_VER_PTR (*p_tensor0)->dataType = src0_qnn_type;
222-
223- QNN_VER_PTR (*p_tensor1)->dimensions = dimensions_input_1;
224- QNN_VER_PTR (*p_tensor1)->rank = ggml_n_dims (src1);
225- QNN_VER_PTR (*p_tensor1)->dataType = src1_qnn_type;
226-
227- QNN_VER_PTR (*p_tensor2)->dimensions = dimensions_output;
228- QNN_VER_PTR (*p_tensor2)->rank = ggml_n_dims (dst);
229- QNN_VER_PTR (*p_tensor2)->dataType = dst_qnn_type;
230-
231- if (enable_npu_rpc) {
232- // TODO: NPU RPC feature will failed with test-backend-ops
233- uint8_t * qnn_buffer_0 = static_cast <uint8_t *>(instance->get_rpcmem_from_memhandle (QNN_VER_PTR (*p_tensor0)->memHandle ));
234- GGMLQNN_LOG_INFO (" qnn_rpcbuffer_0 = %p\n " , qnn_buffer_0);
235- if (nullptr != qnn_buffer_0) {
236- memcpy (qnn_buffer_0, src0->data , ggml_nbytes (src0));
237- }
238-
239- uint8_t * qnn_buffer_1 = static_cast <uint8_t *>(instance->get_rpcmem_from_memhandle (QNN_VER_PTR (*p_tensor1)->memHandle ));
240- GGMLQNN_LOG_INFO (" qnn_rpcbuffer_1 = %p\n " , qnn_buffer_1);
241- if (nullptr != qnn_buffer_1) {
242- memcpy (qnn_buffer_1, src1->data , ggml_nbytes (src1));
243- }
244- } else {
245- QNN_VER_PTR (*p_tensor0)->clientBuf = {src0->data , ggmlqnn_get_tensor_data_size (src0)};
246- QNN_VER_PTR (*p_tensor1)->clientBuf = {src1->data , ggmlqnn_get_tensor_data_size (src1)};
247- QNN_VER_PTR (*p_tensor2)->clientBuf = {dst->data , ggmlqnn_get_tensor_data_size (dst)};
248- }
249-
250- Qnn_Tensor_t tensor_inputs[] = {
251- *p_tensor0,
252- *p_tensor1
253- };
254- Qnn_Tensor_t tensor_outputs[] = {
255- *p_tensor2
256- };
257- CHECK_QNN_API (error, qnn_raw_interface.graphExecute (graph_handle,
258- tensor_inputs, 2 ,
259- tensor_outputs, 1 ,
260- nullptr , nullptr ));
203+ }
261204
262- if (enable_npu_rpc) {
263- // TODO:NPU RPC feature will failed with test-backend-ops
264- uint8_t * qnn_buffer_2 = static_cast <uint8_t *>(instance->get_rpcmem_from_memhandle (QNN_VER_PTR (*p_tensor2)->memHandle ));
265- if (nullptr != qnn_buffer_2) {
266- memcpy (dst->data , qnn_buffer_2, ggml_nbytes (dst));
267- }
205+ Qnn_Tensor_t tensor_inputs[] = {
206+ *p_tensor0,
207+ *p_tensor1
208+ };
209+ Qnn_Tensor_t tensor_outputs[] = {
210+ *p_tensor2
211+ };
212+ CHECK_QNN_API (error, qnn_raw_interface.graphExecute (graph_handle,
213+ tensor_inputs, 2 ,
214+ tensor_outputs, 1 ,
215+ nullptr , nullptr ));
216+
217+ if (enable_npu_rpc) {
218+ // TODO: the NPU RPC feature currently fails with test-backend-ops
219+ uint8_t * qnn_buffer_2 = static_cast <uint8_t *>(instance->get_rpcmem_from_memhandle (QNN_VER_PTR (*p_tensor2)->memHandle ));
220+ if (nullptr != qnn_buffer_2) {
221+ memcpy (dst->data , qnn_buffer_2, ggml_nbytes (dst));
268222 }
269223 }
270224
@@ -461,12 +415,14 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op)
461415 CHECK_QNN_API (error, qnn_raw_interface.graphExecute (graph_handle, input_tensors, 2 ,
462416 output_tensors, 1 , NULL , NULL ));
463417
418+ #if 0
464419 // Log dst for debugging
465420 float *dst_data = (float *)dst->data;
466421 GGMLQNN_LOG_DEBUG("dst shape: [%d, %d, %d, %d]\n", dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
467422 for (int i = 0; i < dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3]; i++) {
468423 GGMLQNN_LOG_DEBUG("dst[%d] = %f\n", i, dst_data[i]);
469424 }
425+ #endif
470426
471427 op_perf.info ();
472428}
@@ -665,14 +621,8 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
665621#endif
666622 CHECK_QNN_API (error, qnn_raw_interface.graphAddNode (graph_handle,out_trans1_0));
667623
668- // step-6: finalize qnn graph and execute qnn graph
624+ // step-6: finalize qnn graph
669625 CHECK_QNN_API (error, qnn_raw_interface.graphFinalize (graph_handle, nullptr , nullptr ));
670- Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1};
671- Qnn_Tensor_t output_tensors_0[] = {*p_tensor2};
672- CHECK_QNN_API (error, qnn_raw_interface.graphExecute (graph_handle,
673- input_tensors_0, 2 ,
674- output_tensors_0, 1 ,
675- nullptr , nullptr ));
676626
677627 qnn_tensors_t ggml_op_mulmat_tensors;
678628 ggml_op_mulmat_tensors.reserve (5 );
@@ -683,30 +633,30 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
683633 ggml_op_mulmat_tensors.push_back (p_tensor2_transpose);
684634 auto graph_item = std::make_tuple (graph_handle, ggml_op_mulmat_tensors);
685635 instance->_qnn_graph_map [graph_name] = graph_item;
686- } else {
687- if (src0_type != GGML_TYPE_F32) {
688- QNN_VER_PTR (*p_tensor0)->clientBuf = {wdata, static_cast <uint32_t >(desired_size)};
689- } else {
690- QNN_VER_PTR (*p_tensor0)->clientBuf = {src0->data , ggmlqnn_get_tensor_data_size (src0)};
691- }
692- QNN_VER_PTR (*p_tensor1)->clientBuf = {src1->data , ggmlqnn_get_tensor_data_size (src1)};
693- QNN_VER_PTR (*p_tensor2)->clientBuf = {dst->data , ggmlqnn_get_tensor_data_size (dst)};
636+ }
694637
695- Qnn_Tensor_t tensor_inputs[] = {
696- *p_tensor0,
697- *p_tensor1
698- };
699- Qnn_Tensor_t tensor_outputs[] = {
700- *p_tensor2
701- };
702- // this is the second technical approach or another pipeline of "how to utilize the Hexagon
703- // NPU maximally" through QNN SDK, details could be found at
704- // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360
705- CHECK_QNN_API (error, qnn_raw_interface.graphExecute (graph_handle,
706- tensor_inputs, 2 ,
707- tensor_outputs, 1 ,
708- nullptr , nullptr ));
638+ if (src0_type != GGML_TYPE_F32) {
639+ QNN_VER_PTR (*p_tensor0)->clientBuf = {wdata, static_cast <uint32_t >(desired_size)};
640+ } else {
641+ QNN_VER_PTR (*p_tensor0)->clientBuf = {src0->data , ggmlqnn_get_tensor_data_size (src0)};
709642 }
643+ QNN_VER_PTR (*p_tensor1)->clientBuf = {src1->data , ggmlqnn_get_tensor_data_size (src1)};
644+ QNN_VER_PTR (*p_tensor2)->clientBuf = {dst->data , ggmlqnn_get_tensor_data_size (dst)};
645+
646+ Qnn_Tensor_t tensor_inputs[] = {
647+ *p_tensor0,
648+ *p_tensor1
649+ };
650+ Qnn_Tensor_t tensor_outputs[] = {
651+ *p_tensor2
652+ };
653+ // This is the second technical approach (an alternative pipeline) for "how to utilize the
654+ // Hexagon NPU maximally" through the QNN SDK; details can be found at
655+ // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360
656+ CHECK_QNN_API (error, qnn_raw_interface.graphExecute (graph_handle,
657+ tensor_inputs, 2 ,
658+ tensor_outputs, 1 ,
659+ nullptr , nullptr ));
710660
711661 // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor
712662 QNN_VER_PTR (*p_tensor0)->dimensions = tensor_0_dimensions;
0 commit comments