
Commit c2b6fec

feat: perf opt part2 (#39)
* add qurt_thread
* add thread pool
* add thread_pool obj at device ctx
* wip
* small refactoring to fit the thread pool structure
* set start/end threads for add
* init thread pool
* fix thread creation
* split complete and pending signals
* opt mulmat
* wip
* 2 threads
* back to 4 threads
* use barrier
* remove some unnecessary package
* add multi thread support for mul mat
* wip
* use qurt_barrier_t instead of qurt_signal_t
* wip
* wip
* add log
* split qnn cmake config
* create function to calculate the start and end func
* wip
* fix comment
* fix comment
* fix comment
* wip
* fix typo
1 parent a0e54cf commit c2b6fec

17 files changed: +400 −1138 lines changed

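Two helpers referenced throughout the hunks below are worth keeping in mind: hexagon::get_thread_work_slice() (the "function to calculate the start and end" from the commit message) and default_thread_pool::sync_execute(). Neither body is part of this commit's hunks. A minimal sketch of what an even-split work-slice helper looks like, assuming the usual divide-rows-with-remainder scheme (the exact name and remainder policy in thread_pool.hpp may differ):

#include <cstdint>
#include <utility>

// Sketch only: split `total` rows into contiguous [start, end) slices, one per
// thread; the first `total % tcnt` threads take one extra row each.
inline std::pair<int64_t, int64_t> get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) {
    const int64_t count = static_cast<int64_t>(tcnt);
    const int64_t idx   = static_cast<int64_t>(tidx);
    const int64_t base  = total / count;
    const int64_t rem   = total % count;
    const int64_t start = idx * base + (idx < rem ? idx : rem);
    return { start, start + base + (idx < rem ? 1 : 0) };  // empty slice when start == end
}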

ggml/src/ggml-qnn/CMakeLists.txt

Lines changed: 11 additions & 29 deletions
@@ -5,11 +5,9 @@ option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package"
 
 if(CMAKE_SYSTEM_NAME STREQUAL "Android")
     find_library(LOG_LIB log)
-    set(QNN_LINK_LIBRARIES ${LOG_LIB})
-    set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
-    add_compile_options(-g -O0)
+    set(COMMON_LINK_LIBRARIES ${LOG_LIB})
 elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
-    set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend")
+    message("Building for Linux or Windows")
 else()
     message(FATAL_ERROR "QNN now only available on Android, Windows and Linux")
 endif()
@@ -29,33 +27,15 @@ message("CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}")
 message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}")
 message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}")
 
-file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/qnn/*.cpp")
-file(GLOB COMMON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
-ggml_add_backend_library(ggml-qnn
-    ${QNN_SOURCES}
-    ${COMMON_SOURCES}
-)
+message("GGML_QNN: ${GGML_QNN}")
+message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING: ${GGML_QNN_ENABLE_PERFORMANCE_TRACKING}")
+message("GGML_QNN_ENABLE_HEXAGON_BACKEND: ${GGML_QNN_ENABLE_HEXAGON_BACKEND}")
+message("GGML_HEXAGON_NPU_ONLY: ${GGML_HEXAGON_NPU_ONLY}")
 
-target_include_directories(ggml-qnn PRIVATE
-    ${GGML_QNN_SDK_PATH}/include/QNN
-    ${CMAKE_CURRENT_LIST_DIR}/qnn
-    ${CMAKE_CURRENT_LIST_DIR}
+ggml_add_backend_library(ggml-qnn
+    ../../include/ggml-qnn.h
 )
-target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES})
-
-if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "")
-    string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}")
-endif()
-
-message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}")
-target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}")
-
-if(GGML_QNN_ENABLE_CPU_BACKEND)
-    message("GGML_QNN_ENABLE_CPU_BACKEND is enabled")
-    target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND)
-else()
-    message("GGML_QNN_ENABLE_CPU_BACKEND is disabled")
-endif()
+target_link_libraries(ggml-qnn PRIVATE ${COMMON_LINK_LIBRARIES})
 
 if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
     message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled")
@@ -72,6 +52,8 @@ if(GGML_HEXAGON_NPU_ONLY)
     set(GGML_QNN_ENABLE_HEXAGON_BACKEND ON)
 else()
     message("GGML_HEXAGON_NPU_ONLY is disabled")
+    add_subdirectory(qnn)
+    target_link_libraries(ggml-qnn PRIVATE qnn-backend)
 endif()
 
 if(GGML_QNN_ENABLE_HEXAGON_BACKEND)

ggml/src/ggml-qnn/npu/device/device.cpp

Lines changed: 39 additions & 7 deletions
@@ -3,22 +3,38 @@
 #include <HAP_compute_res.h>
 #include <hexagon_types.h>
 
+#include <memory>
 #include <new>
 
 #include "graph.hpp"
 #include "hexagon_npu.h"
 #include "op_impl.hpp"
 #include "remote.h"
 #include "tensor.hpp"
+#include "thread_pool.hpp"
 #include "util.hpp"
 
-#define NPU_UNUSED(x) (void) (x)
-
 namespace {
 
 struct npu_device_context {
-    int unused = 0;
-    // TODO: should we add tensor context here?
+    std::unique_ptr<hexagon::default_thread_pool> thread_pool;
+
+    bool init_thread_pool() {
+        if (thread_pool) {
+            DEVICE_LOG_DEBUG("Thread pool already initialized");
+            return true;
+        }
+
+        auto pool = std::make_unique<hexagon::default_thread_pool>();
+        if (!pool) {
+            DEVICE_LOG_ERROR("Failed to create thread pool");
+            return false;
+        }
+
+        thread_pool = std::move(pool);
+        DEVICE_LOG_DEBUG("Thread pool initialized");
+        return true;
+    }
 };
 
 inline hexagon::tensor * tensor_from_handle(npu_device_graph_handle_t h) {
@@ -37,6 +53,10 @@ inline npu_device_tensor_handle_t graph_to_handle(hexagon::graph * graph) {
     return reinterpret_cast<npu_device_tensor_handle_t>(graph);
 }
 
+inline npu_device_context * device_context_from_handle(remote_handle64 h) {
+    return reinterpret_cast<npu_device_context *>(h);
+}
+
 } // namespace
 
 int npu_device_open(const char * uri, remote_handle64 * h) {
@@ -47,12 +67,18 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
         return AEE_ENOMEMORY;
     }
 
+    if (!context->init_thread_pool()) {
+        DEVICE_LOG_ERROR("Failed to initialize thread pool");
+        delete context;
+        return AEE_EFAILED;
+    }
+
     *h = reinterpret_cast<remote_handle64>(context);
     return AEE_SUCCESS;
 }
 
 int npu_device_close(remote_handle64 h) {
-    auto * context = reinterpret_cast<npu_device_context *>(h);
+    auto * context = device_context_from_handle(h);
     if (!context) {
         DEVICE_LOG_ERROR("Invalid npu_device_context handle");
         return AEE_EINVHANDLE;
@@ -149,13 +175,19 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl
 }
 
 AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
-    NPU_UNUSED(_h);
+    auto dev_ctx = device_context_from_handle(_h);
+    if (!dev_ctx) {
+        DEVICE_LOG_DEBUG("Invalid npu_device_context handle");
+        return AEE_EINVHANDLE;
+    }
+
     auto * graph = graph_from_handle(graph_handle);
     if (!graph) {
+        DEVICE_LOG_ERROR("Invalid graph handle");
        return AEE_EINVHANDLE;
     }
 
-    if (!graph->compute()) {
+    if (!graph->compute(dev_ctx->thread_pool.get())) {
         return AEE_EFAILED;
     }
 
ggml/src/ggml-qnn/npu/device/graph.cpp

Lines changed: 28 additions & 17 deletions
@@ -8,24 +8,23 @@
 
 namespace hexagon {
 
+graph::graph() noexcept {
+    DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
+}
+
 graph::~graph() noexcept {
-    if (_tensors) {
-        delete[] _tensors;
-    }
+    _tensors.reset();
+    DEVICE_LOG_DEBUG("graph(%p) destroyed\n", (void *) this);
 }
 
 void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count) {
-    if (_tensor_count > 0) {
-        delete[] _tensors;
-    }
-
     if (tensor_count <= 0) {
-        _tensors = nullptr;
+        _tensors.reset();
         _tensor_count = 0;
         return;
     }
 
-    _tensors = new (std::nothrow) tensor *[tensor_count];
+    _tensors = std::make_unique<tensor *[]>(size_t(tensor_count));
     for (int i = 0; i < tensor_count; ++i) {
         auto * tensor_obj = reinterpret_cast<tensor *>(tensors[i]);
         _tensors[i] = tensor_obj;
@@ -37,31 +36,43 @@ void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_co
     DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count);
 }
 
-bool graph::compute() {
+bool graph::compute(default_thread_pool * thread_pool) {
     if (!_tensors || !_tensor_count) {
         DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this);
         return true; // return success if no tensors to compute
     }
 
     DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);
+    thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);
+
+    for (size_t i = 0; i < _tensor_count; ++i) {
+        auto * dst = _tensors[i];
+        dst->flush(); // TODO: optimize this
+    }
+
+    return true;
+}
+
+void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph) {
+    NPU_UNUSED(pool);
+    graph->compute_impl(thread_idx, thread_count);
+}
+
+void graph::compute_impl(size_t thread_idx, size_t thread_count) {
     for (size_t i = 0; i < _tensor_count; ++i) {
         auto * dst = _tensors[i];
         auto op = dst->get_op();
         auto * func = get_compute_func(op);
         if (!func) {
             DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
-            return false;
+            return;
         }
 
-        if (!func(dst)) {
+        if (!func(dst, thread_idx, thread_count)) {
             DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
-            return false;
+            return;
         }
-
-        dst->flush(); // TODO: optimize this
     }
-
-    return true;
 }
 
 } // namespace hexagon
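graph::compute() above fans a single task out to every pool thread and blocks until all of them have returned; per the commit message the real pool (thread_pool.hpp, not shown in this commit) is built on qurt_thread and qurt_barrier_t with 4 threads. The sketch below reproduces the same synchronous fan-out contract with std::barrier (C++20) so the sync_execute() call is easier to follow; everything except the sync_execute/task_type shape is illustrative, not the actual implementation:

#include <atomic>
#include <barrier>
#include <cstddef>
#include <thread>
#include <vector>

class sketch_thread_pool {
  public:
    // Matches the shape of default_thread_pool::task_type as used in graph.cpp:
    // every thread receives its index and the total thread count.
    using task_type = void (*)(sketch_thread_pool * pool, size_t thread_idx, size_t thread_count, void * arg);

    static constexpr size_t kThreadCount = 4;  // the commit settles on 4 threads

    sketch_thread_pool() : _pending(kThreadCount), _done(kThreadCount) {
        for (size_t i = 1; i < kThreadCount; ++i) {  // the caller acts as thread 0
            _workers.emplace_back([this, i] { worker_loop(i); });
        }
    }

    ~sketch_thread_pool() {
        _exit.store(true);
        _pending.arrive_and_wait();  // release the workers one last time
        for (auto & w : _workers) {
            w.join();
        }
    }

    // Publish `task`, run slice 0 on the calling thread, and return only after
    // every thread has finished: the "complete and pending signals" of the
    // commit message, expressed as two barriers.
    void sync_execute(task_type task, void * arg) {
        _task = task;
        _arg  = arg;
        _pending.arrive_and_wait();          // wake the workers
        _task(this, 0, kThreadCount, _arg);  // this thread computes slice 0
        _done.arrive_and_wait();             // wait for the other slices
    }

  private:
    void worker_loop(size_t idx) {
        for (;;) {
            _pending.arrive_and_wait();
            if (_exit.load()) {
                return;
            }
            _task(this, idx, kThreadCount, _arg);
            _done.arrive_and_wait();
        }
    }

    std::barrier<>           _pending;
    std::barrier<>           _done;
    std::vector<std::thread> _workers;
    task_type                _task = nullptr;
    void *                   _arg  = nullptr;
    std::atomic<bool>        _exit{false};
};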
ggml/src/ggml-qnn/npu/device/graph.hpp

Lines changed: 11 additions & 8 deletions
@@ -1,29 +1,32 @@
 #pragma once
 
+#include <memory>
+
 #include "hexagon_npu.h"
 #include "tensor.hpp"
+#include "thread_pool.hpp"
 
 namespace hexagon {
 
 class graph {
   public:
     // TODO: add execute direction here
-    explicit graph() noexcept {}
+    explicit graph() noexcept;
 
     ~graph() noexcept;
 
     void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count);
 
-    bool compute();
+    bool compute(default_thread_pool * thread_pool);
 
   private:
-    tensor ** _tensors = nullptr;
-    size_t _tensor_count = 0;
+    static void thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph);
+    void compute_impl(size_t thread_idx, size_t thread_count);
+
+    std::unique_ptr<tensor *[]> _tensors;
+    size_t _tensor_count = 0;
 
-    graph(const graph &) = delete;
-    void operator=(const graph &) = delete;
-    graph(graph &&) = delete;
-    void operator=(graph &&) = delete;
+    DISABLE_COPY_AND_MOVE(graph);
 };
 
 } // namespace hexagon
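The DISABLE_COPY_AND_MOVE(graph) line packages the four deleted special-member declarations into one macro. Its definition is not part of this commit's hunks (it presumably lives in a shared header such as util.hpp); the expansion implied by the deleted lines would be:

// Presumed expansion; the actual definition is not shown in this commit:
#define DISABLE_COPY_AND_MOVE(class_name)          \
    class_name(const class_name &)     = delete;   \
    void operator=(const class_name &) = delete;   \
    class_name(class_name &&)          = delete;   \
    void operator=(class_name &&)      = delete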

ggml/src/ggml-qnn/npu/device/op_impl.cpp

Lines changed: 20 additions & 23 deletions
@@ -76,11 +76,12 @@ inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) {
 }
 
 template <typename _TySrc, typename _TyDst, void (*_RowFunc)(const _TySrc *, const _TySrc *, size_t, _TyDst *)>
-bool element_wise_op(hexagon::tensor * out) {
+bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) {
     if (!out) {
         return false;
     }
 
+    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
     auto * src0 = out->get_src(0);
     auto * src1 = out->get_src(1);
     if (!src0 || !src1) {
@@ -93,28 +94,24 @@ bool element_wise_op(hexagon::tensor * out) {
         return false;
     }
 
-    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
-
-    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
-    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
-    auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_data());
-    for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) {
-        const auto * src0_cube = src0_ptr + i3 * src0->get_nb(3);
-        const auto * src1_cube = src1_ptr + (i3 % src1->get_ne(3)) * src1->get_nb(3);
-        auto * dst_cube = dst_ptr + i3 * out->get_nb(3);
-        for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) {
-            const auto * src0_plane = src0_cube + i2 * src0->get_nb(2);
-            const auto * src1_plane = src1_cube + (i2 % src1->get_ne(2)) * src1->get_nb(2);
-            auto * dst_plane = dst_cube + i2 * out->get_nb(2);
-            for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) {
-                // TODO: prefetch row?
-                auto * src0_row = src0_plane + i1 * src0->get_nb(1);
-                auto * src1_row = src1_plane + (i1 % src1->get_ne(1)) * src1->get_nb(1);
-                auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * out->get_nb(1));
-                _RowFunc(reinterpret_cast<const _TySrc *>(src0_row), reinterpret_cast<const _TySrc *>(src1_row),
-                         static_cast<size_t>(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row));
-            }
-        }
+    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
+    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
+    auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_data());
+    auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
+    const auto rows_per_box = out->get_ne(2) * out->get_ne(1);
+    const auto start_end = hexagon::get_thread_work_slice(total_rows, tidx, tcnt);
+    for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
+        const auto i03 = ir / rows_per_box;
+        const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
+        const auto i01 = ir % out->get_ne(1);
+        const auto i13 = i03 % src1->get_ne(3);
+        const auto i12 = i02 % src1->get_ne(2);
+        const auto i11 = i01 % src1->get_ne(1);
+        auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
+        auto * src1_row = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2) + i11 * src1->get_nb(1);
+        auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
+        _RowFunc(reinterpret_cast<const _TySrc *>(src0_row), reinterpret_cast<const _TySrc *>(src1_row),
+                 static_cast<size_t>(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row));
    }
 
     return true;

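The rewrite above is what enables the multi-threading: the old (i3, i2, i1) loop nest is flattened into a single row index ir so get_thread_work_slice() can hand each thread a contiguous run of rows, and the coordinates are recovered with div/mod (i02 uses ir / ne(1) - i03 * ne(2) rather than a second modulo). A standalone check of that arithmetic, with illustrative shape values:

#include <cassert>
#include <cstdint>

int main() {
    // Illustrative shape: ne(3)=2, ne(2)=3, ne(1)=4, i.e. 24 rows in total.
    const int64_t ne1 = 4, ne2 = 3, ne3 = 2;
    const int64_t rows_per_box = ne2 * ne1;
    int64_t ir = 0;
    for (int64_t i3 = 0; i3 < ne3; ++i3) {
        for (int64_t i2 = 0; i2 < ne2; ++i2) {
            for (int64_t i1 = 0; i1 < ne1; ++i1, ++ir) {
                // Same recovery expressions as element_wise_op above.
                assert(ir / rows_per_box == i3);
                assert(ir / ne1 - i3 * ne2 == i2);
                assert(ir % ne1 == i1);
            }
        }
    }
    return 0;
}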
ggml/src/ggml-qnn/npu/device/op_impl.hpp

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 
 namespace hexagon {
 
-typedef bool (*compute_func_type)(tensor * dst);
+typedef bool (*compute_func_type)(tensor * dst, size_t tidx, size_t tcnt);
 typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                                           const npu_device_tensor_spec & dst, npu_device_tensor_op op);
 