Commit 7b2d3bc

Author: zhouwg (committed)
ggml-qnn: refine ggml_qnn_mul_mat and ggml_qnn_general_node according to Grok 3's style
1 parent 4cae702 · commit 7b2d3bc

3 files changed: +69 additions, −120 deletions

ggml/src/ggml-qnn/ggml-qnn-impl.h

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char
 #else
 #define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend
 #define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info
-#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 1 // enable/disable QNN's internal log
+#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log
 #define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU
 #define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1
 #endif
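
The only functional change in this header is flipping GGMLQNN_PRINT_QNN_INTERNAL_LOG from 1 to 0, which silences the QNN SDK's own log stream while leaving the backend's GGMLQNN_DEBUG logging on. A minimal, self-contained sketch of how such a compile-time switch is typically consumed; the function name below is hypothetical, not the backend's actual symbol:

    #include <cstdarg>
    #include <cstdio>

    #ifndef GGMLQNN_PRINT_QNN_INTERNAL_LOG
    #define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0   // in the real backend this comes from ggml-qnn-impl.h
    #endif

    // example_qnn_internal_log is a stand-in for whatever callback the backend
    // hands to the QNN SDK; with the flag set to 0 the messages are dropped.
    static void example_qnn_internal_log(const char * fmt, ...) {
    #if GGMLQNN_PRINT_QNN_INTERNAL_LOG
        va_list args;
        va_start(args, fmt);
        vfprintf(stderr, fmt, args);    // forward QNN's internal messages to stderr
        va_end(args);
    #else
        (void) fmt;                     // flag disabled: swallow the message
    #endif
    }

    int main() {
        example_qnn_internal_log("qnn internal: graph finalized\n");
        return 0;
    }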

ggml/src/ggml-qnn/ggml-qnn-ops.cpp

Lines changed: 43 additions & 93 deletions
@@ -200,71 +200,25 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
 
         auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors);
         instance->_qnn_graph_map[graph_name] = graph_item;
-    } else {
-        Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
-        Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
-        Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
-
-        src0_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src0->type);
-        src1_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src1->type);
-        dst_qnn_type = ggmlqnn_datatype_from_ggml_datatype(dst->type);
-
-        uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-                                         (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]};
-        uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-                                         (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
-        uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-                                        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]};
-
-        QNN_VER_PTR(*p_tensor0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*p_tensor0)->rank = ggml_n_dims(src0);
-        QNN_VER_PTR(*p_tensor0)->dataType = src0_qnn_type;
-
-        QNN_VER_PTR(*p_tensor1)->dimensions = dimensions_input_1;
-        QNN_VER_PTR(*p_tensor1)->rank = ggml_n_dims(src1);
-        QNN_VER_PTR(*p_tensor1)->dataType = src1_qnn_type;
-
-        QNN_VER_PTR(*p_tensor2)->dimensions = dimensions_output;
-        QNN_VER_PTR(*p_tensor2)->rank = ggml_n_dims(dst);
-        QNN_VER_PTR(*p_tensor2)->dataType = dst_qnn_type;
-
-        if (enable_npu_rpc) {
-            //TODO: NPU RPC feature will failed with test-backend-ops
-            uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle));
-            GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0);
-            if (nullptr != qnn_buffer_0) {
-                memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-            }
-
-            uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle));
-            GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1);
-            if (nullptr != qnn_buffer_1) {
-                memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
-            }
-        } else {
-            QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)};
-            QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)};
-            QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)};
-        }
-
-        Qnn_Tensor_t tensor_inputs[] = {
-            *p_tensor0,
-            *p_tensor1
-        };
-        Qnn_Tensor_t tensor_outputs[] = {
-            *p_tensor2
-        };
-        CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
-                                                            tensor_inputs, 2,
-                                                            tensor_outputs, 1,
-                                                            nullptr, nullptr));
+    }
 
-        if (enable_npu_rpc) {
-            //TODO:NPU RPC feature will failed with test-backend-ops
-            uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle));
-            if (nullptr != qnn_buffer_2) {
-                memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
-            }
+    Qnn_Tensor_t tensor_inputs[] = {
+        *p_tensor0,
+        *p_tensor1
+    };
+    Qnn_Tensor_t tensor_outputs[] = {
+        *p_tensor2
+    };
+    CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
+                                                        tensor_inputs, 2,
+                                                        tensor_outputs, 1,
+                                                        nullptr, nullptr));
+
+    if (enable_npu_rpc) {
+        //TODO:NPU RPC feature will failed with test-backend-ops
+        uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle));
+        if (nullptr != qnn_buffer_2) {
+            memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
         }
     }
 
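The net effect of this hunk: ggml_qnn_general_node no longer duplicates the tensor binding and graphExecute call inside the cache-miss branch. The if/else now only builds or looks up the cached QNN graph, and a single execute path (plus the optional NPU-RPC copy-back) runs afterwards on every call, which avoids the drift the two copies had accumulated. A self-contained toy sketch of that build-once / execute-always pattern, with made-up types standing in for the QNN graph handle and tensor wrappers:

    #include <cstdio>
    #include <map>
    #include <string>

    // Toy stand-ins for the QNN graph handle and its tensors; the real backend
    // caches Qnn_GraphHandle_t plus Qnn_Tensor_t wrappers in _qnn_graph_map.
    struct toy_graph { std::string name; };

    static std::map<std::string, toy_graph> graph_cache;

    // Build the graph only on a cache miss, then run every call through the
    // same execute path -- the shape ggml_qnn_general_node takes after this commit.
    static void run_op(const std::string & graph_name) {
        auto it = graph_cache.find(graph_name);
        if (it == graph_cache.end()) {
            // expensive one-time setup (graph compose + finalize in the real code)
            it = graph_cache.emplace(graph_name, toy_graph{graph_name}).first;
            std::printf("built graph %s\n", graph_name.c_str());
        }
        // single shared execute path (graphExecute in the real code)
        std::printf("execute graph %s\n", it->second.name.c_str());
    }

    int main() {
        run_op("add_2x3x4");   // first call: build + execute
        run_op("add_2x3x4");   // subsequent calls: execute only
        return 0;
    }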

@@ -461,12 +415,14 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op)
     CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2,
                                                         output_tensors, 1, NULL, NULL));
 
+#if 0
     // Log dst for debugging
     float *dst_data = (float *)dst->data;
     GGMLQNN_LOG_DEBUG("dst shape: [%d, %d, %d, %d]\n", dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
     for (int i = 0; i < dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3]; i++) {
         GGMLQNN_LOG_DEBUG("dst[%d] = %f\n", i, dst_data[i]);
     }
+#endif
 
     op_perf.info();
 }
@@ -665,14 +621,8 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
 #endif
         CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0));
 
-        //step-6: finalize qnn graph and execute qnn graph
+        //step-6: finalize qnn graph
        CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr));
-        Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1};
-        Qnn_Tensor_t output_tensors_0[] = {*p_tensor2};
-        CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
-                                                            input_tensors_0, 2,
-                                                            output_tensors_0, 1,
-                                                            nullptr, nullptr));
 
         qnn_tensors_t ggml_op_mulmat_tensors;
         ggml_op_mulmat_tensors.reserve(5);
@@ -683,30 +633,30 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
         ggml_op_mulmat_tensors.push_back(p_tensor2_transpose);
         auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors);
         instance->_qnn_graph_map[graph_name] = graph_item;
-    } else {
-        if (src0_type != GGML_TYPE_F32) {
-            QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast<uint32_t>(desired_size)};
-        } else {
-            QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)};
-        }
-        QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)};
-        QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)};
+    }
 
-        Qnn_Tensor_t tensor_inputs[] = {
-            *p_tensor0,
-            *p_tensor1
-        };
-        Qnn_Tensor_t tensor_outputs[] = {
-            *p_tensor2
-        };
-        // this is the second technical approach or another pipeline of "how to utilize the Hexagon
-        // NPU maximally" through QNN SDK, details could be found at
-        // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360
-        CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
-                                                            tensor_inputs, 2,
-                                                            tensor_outputs, 1,
-                                                            nullptr, nullptr));
+    if (src0_type != GGML_TYPE_F32) {
+        QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast<uint32_t>(desired_size)};
+    } else {
+        QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)};
     }
+    QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)};
+    QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)};
+
+    Qnn_Tensor_t tensor_inputs[] = {
+        *p_tensor0,
+        *p_tensor1
+    };
+    Qnn_Tensor_t tensor_outputs[] = {
+        *p_tensor2
+    };
+    // this is the second technical approach or another pipeline of "how to utilize the Hexagon
+    // NPU maximally" through QNN SDK, details could be found at
+    // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360
+    CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
+                                                        tensor_inputs, 2,
+                                                        tensor_outputs, 1,
+                                                        nullptr, nullptr));
 
     // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor
     QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions;
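
ggml_qnn_mul_mat gets the same treatment: step-6 now only finalizes the graph that gets cached, and the per-call work is reduced to rebinding clientBuf (the dequantized scratch buffer wdata when src0 is quantized, the raw tensor data when it is already F32) followed by a single graphExecute. A toy sketch of that per-call buffer selection, with stand-in types for the ggml/QNN structures:

    #include <cstdint>
    #include <cstdio>

    // Toy illustration of the input selection done before graphExecute in
    // ggml_qnn_mul_mat: quantized src0 is fed from a dequantized scratch
    // buffer, f32 src0 is fed directly. Names here are stand-ins only.
    struct toy_buf { void * data; uint32_t size; };

    static toy_buf pick_src0_buffer(bool src0_is_f32,
                                    void * src0_data, uint32_t src0_size,
                                    void * wdata,     uint32_t desired_size) {
        if (!src0_is_f32) {
            return { wdata, desired_size };   // dequantized scratch buffer
        }
        return { src0_data, src0_size };      // raw f32 tensor data
    }

    int main() {
        float   f32_src[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        float   scratch[4] = { 0 };           // would hold dequantized weights
        toy_buf chosen     = pick_src0_buffer(true, f32_src, sizeof(f32_src),
                                              scratch, sizeof(scratch));
        std::printf("feeding %u bytes to the QNN graph\n", chosen.size);
        return 0;
    }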

tests/ggml-qnn-ut.cpp

Lines changed: 25 additions & 26 deletions
@@ -332,37 +332,36 @@ int main(int argc, char * argv[]) {
     std::vector<ggml_backend_ptr> backends;
     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
     printf("Testing %zu devices\n\n", ggml_backend_dev_count());
-    //for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
-    for (size_t i = 0; i < 2; i++) {
-            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
 
-            printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(),
-                   ggml_backend_dev_name(dev));
+        printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(),
+               ggml_backend_dev_name(dev));
 
-            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
-                printf("  Skipping CPU backend\n");
-                continue;
-            }
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+            printf("  Skipping CPU backend\n");
+            continue;
+        }
 
-            backend = ggml_backend_dev_init(dev, reinterpret_cast<const char *>(i));
-            GGML_ASSERT(backend != NULL);
-            if (backend != nullptr) {
-                printf("%s: initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
-            }
-            backends.emplace_back(backend);
+        backend = ggml_backend_dev_init(dev, reinterpret_cast<const char *>(i));
+        GGML_ASSERT(backend != NULL);
+        if (backend != nullptr) {
+            printf("%s: initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+        }
+        backends.emplace_back(backend);
 
-            ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
-            auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(
-                    reg, "ggml_backend_set_n_threads");
-            if (ggml_backend_set_n_threads_fn) {
-                ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
-            }
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(
+                reg, "ggml_backend_set_n_threads");
+        if (ggml_backend_set_n_threads_fn) {
+            ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
+        }
 
-            printf("  Device description: %s\n", ggml_backend_dev_description(dev));
-            size_t free, total;
-            ggml_backend_dev_memory(dev, &free, &total);
-            printf("  Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
-            printf("\n");
+        printf("  Device description: %s\n", ggml_backend_dev_description(dev));
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+        printf("  Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
+        printf("\n");
     }
 
     ggml_backend_t backend_cpu = nullptr;
