Skip to content

Commit c23ab46

Browse files
authored
feat: perf opt part4 (#43)
* wip * refactor: rewrite dequantize_row_q4_0 by intrinsic * log for debug * fix q4 intrinsic * small opt * wip * wip * add vtcm_quota_size * add perf log for hexagon-npu backend * wip * add log * sync after a specific op * increase worker thread priority * fix unbalanced thread slice * small slice to fit in vtcm cache * limit the supported row element size * opt 4_0 dequant * fix q4 dequant * add power_utils * add rms_norm * wip * enable rms_norm f32 * fix rms_norm with param * fix compiling flags * use float * fix small row size * vectorized rms norm * wip * read 2 vectors * rename * add perf log on update * set empty tensors handle also * merge some rpc functions * opt param update * wip * print more log * add struct for update param config * add npu_device_graph_set_tensor_with_param * merge tensor and params update * wip * wip * make as template to reuse * vectorize dequantize_row_q8_0 * opt * avoid using union to store q data * wip * wip * wip
1 parent 2306f82 commit c23ab46

32 files changed

+1020
-403
lines changed

ggml/src/ggml-qnn/npu/CMakeLists.txt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,11 @@ else()
231231

232232
build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS)
233233

234+
add_subdirectory(${HEXAGON_SDK_ROOT}/libs/qprintf qprintf_dir)
235+
target_include_directories(hexagon_npu_skel_OBJS PUBLIC
236+
${HEXAGON_SDK_ROOT}/libs/qprintf/inc/
237+
)
238+
234239
# disable warnings for the skel
235240
set_source_files_properties(
236241
${skel_srcs}
@@ -239,12 +244,12 @@ else()
239244
)
240245

241246
add_library(hexagon_npu_skel SHARED $<TARGET_OBJECTS:hexagon_npu_skel_OBJS>)
242-
243247
target_link_libraries(hexagon_npu_skel
244248
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a
245249
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a
246250
)
247251
set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}")
252+
target_link_libraries(hexagon_npu_skel qprintf_static)
248253

249254
copy_binaries(hexagon_npu_skel)
250255
endif()

ggml/src/ggml-qnn/npu/device/device.cpp

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
106106
}
107107

108108
*h = reinterpret_cast<remote_handle64>(context);
109+
DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h);
109110
return AEE_SUCCESS;
110111
}
111112

@@ -117,6 +118,7 @@ int npu_device_close(remote_handle64 h) {
117118
}
118119

119120
delete context;
121+
DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h);
120122
return AEE_SUCCESS;
121123
}
122124

@@ -130,6 +132,12 @@ AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tens
130132
const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst,
131133
npu_device_tensor_op op, boolean * is_supported) {
132134
NPU_UNUSED(_h);
135+
136+
if (!src0 || !src1 || !dst || !is_supported) {
137+
DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
138+
return AEE_EINVARGS;
139+
}
140+
133141
*is_supported = hexagon::support_op(*src0, *src1, *dst, op);
134142
return AEE_SUCCESS;
135143
}
@@ -147,28 +155,15 @@ AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_con
147155
return AEE_SUCCESS;
148156
}
149157

150-
AEEResult npu_device_tensor_set_src(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, uint64_t index,
151-
npu_device_tensor_handle_t src) {
158+
AEEResult npu_device_tensor_update_params(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
159+
const npu_device_tensor_update_config * config) {
152160
NPU_UNUSED(_h);
153161
auto * tensor = tensor_from_handle(tensor_handle);
154-
if (!tensor) {
155-
return AEE_EINVHANDLE;
156-
}
157-
158-
auto * src_tensor = tensor_from_handle(src);
159-
tensor->set_src(index, src_tensor);
160-
return AEE_SUCCESS;
161-
}
162-
163-
AEEResult npu_device_tensor_set_op(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
164-
npu_device_tensor_op op) {
165-
NPU_UNUSED(_h);
166-
auto * tensor = tensor_from_handle(tensor_handle);
167-
if (!tensor) {
162+
if (!tensor || !config) {
168163
return AEE_EINVHANDLE;
169164
}
170165

171-
tensor->set_op(op);
166+
tensor->update_config(*config);
172167
return AEE_SUCCESS;
173168
}
174169

@@ -206,6 +201,29 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl
206201
return AEE_SUCCESS;
207202
}
208203

204+
AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
205+
const npu_device_tensor_handle_t * tensor_handles,
206+
int tensor_handlesLen,
207+
const npu_device_tensor_update_config * tensor_params,
208+
int tensor_paramsLen) {
209+
NPU_UNUSED(_h);
210+
auto * graph = graph_from_handle(graph_handle);
211+
if (!graph || !tensor_handles || tensor_handlesLen <= 0 || !tensor_params ||
212+
tensor_handlesLen != tensor_paramsLen) {
213+
return AEE_EINVHANDLE;
214+
}
215+
216+
graph->set_tensor(tensor_handles, tensor_handlesLen);
217+
for (int i = 0; i < tensor_handlesLen; ++i) {
218+
auto * tensor = tensor_from_handle(tensor_handles[i]);
219+
if (tensor) {
220+
tensor->update_config(tensor_params[i]);
221+
}
222+
}
223+
224+
return AEE_SUCCESS;
225+
}
226+
209227
AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
210228
auto dev_ctx = device_context_from_handle(_h);
211229
if (!dev_ctx) {

ggml/src/ggml-qnn/npu/device/graph.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
namespace hexagon {
1111

1212
graph::graph() noexcept {
13-
DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
13+
_vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size(); // TODO: move to device init?
14+
DEVICE_LOG_DEBUG("graph(%p) created: vtcm quota size: %zu\n", (void *) this, _vtcm_quota_size);
1415
}
1516

1617
graph::~graph() noexcept {
@@ -45,6 +46,8 @@ bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_
4546
}
4647

4748
DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);
49+
50+
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]compute", (void *) this);
4851
_f16_to_f32_table = f16_to_f32_table;
4952
if (thread_pool) {
5053
thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);
@@ -61,6 +64,8 @@ void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size
6164
}
6265

6366
void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) {
67+
hexagon::compute_params params = { thread_idx, thread_count, _vtcm_quota_size / thread_count, _f16_to_f32_table };
68+
6469
for (size_t i = 0; i < _tensor_count; ++i) {
6570
auto * dst = _tensors[i];
6671
auto op = dst->get_op();
@@ -69,14 +74,14 @@ void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t t
6974
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
7075
return;
7176
}
72-
73-
hexagon::compute_params params = { thread_idx, thread_count, _f16_to_f32_table };
7477
if (!func(dst, &params)) {
7578
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
7679
}
7780

78-
// TODO: figure out which ops need to sync
79-
if (pool) {
81+
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu", (void *) this, thread_idx);
82+
83+
const bool should_sync = requires_thread_barrier(op);
84+
if (pool && should_sync && i < _tensor_count - 1) {
8085
pool->sync_thread();
8186
}
8287
dst->invalidate();

ggml/src/ggml-qnn/npu/device/graph.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class graph {
2525

2626
std::unique_ptr<tensor *[]> _tensors;
2727
size_t _tensor_count = 0;
28+
size_t _vtcm_quota_size = 0;
2829
const float * _f16_to_f32_table = nullptr;
2930

3031
DISABLE_COPY_AND_MOVE(graph);

0 commit comments

Comments
 (0)