
Commit c2b6fec

feat: perf opt part2 (#39)
* add qurt_thread
* add thread pool
* add thread_pool obj at device ctx
* wip
* small refactoring to fit the thread pool structure
* set start/end threads for add
* init thread pool
* fix thread creation
* split complete and pending signals
* opt mulmat
* wip
* 2 threads
* back to 4 threads
* use barrier
* remove some unnecessary package
* add multi thread support for mul mat
* wip
* use qurt_barrier_t instead of qurt_signal_t
* wip
* wip
* add log
* split qnn cmake config
* create function to calculate the start and end func
* wip
* fix comment
* fix comment
* fix comment
* wip
* fix typo
1 parent a0e54cf commit c2b6fec

17 files changed: +400 −1138 lines changed

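Two helpers referenced throughout the hunks below are worth keeping in mind: hexagon::get_thread_work_slice() (the "function to calculate the start and end" from the commit message) and default_thread_pool::sync_execute(). Neither body is part of this commit's hunks. A minimal sketch of what an even-split work-slice helper looks like, assuming the usual divide-rows-with-remainder scheme (the exact name and remainder policy in thread_pool.hpp may differ):

#include <cstdint>
#include <utility>

// Sketch only: split `total` rows into contiguous [start, end) slices, one per
// thread; the first `total % tcnt` threads take one extra row each.
inline std::pair<int64_t, int64_t> get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) {
    const int64_t count = static_cast<int64_t>(tcnt);
    const int64_t idx   = static_cast<int64_t>(tidx);
    const int64_t base  = total / count;
    const int64_t rem   = total % count;
    const int64_t start = idx * base + (idx < rem ? idx : rem);
    return { start, start + base + (idx < rem ? 1 : 0) };  // empty slice when start == end
}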

ggml/src/ggml-qnn/CMakeLists.txt

Lines changed: 11 additions & 29 deletions
@@ -5,11 +5,9 @@ option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package"
 
 if(CMAKE_SYSTEM_NAME STREQUAL "Android")
     find_library(LOG_LIB log)
-    set(QNN_LINK_LIBRARIES ${LOG_LIB})
-    set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
-    add_compile_options(-g -O0)
+    set(COMMON_LINK_LIBRARIES ${LOG_LIB})
 elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
-    set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend")
+    message("Building for Linux or Windows")
 else()
     message(FATAL_ERROR "QNN now only available on Android, Windows and Linux")
 endif()
@@ -29,33 +27,15 @@ message("CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}")
 message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}")
 message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}")
 
-file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/qnn/*.cpp")
-file(GLOB COMMON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
-ggml_add_backend_library(ggml-qnn
-    ${QNN_SOURCES}
-    ${COMMON_SOURCES}
-)
+message("GGML_QNN: ${GGML_QNN}")
+message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING: ${GGML_QNN_ENABLE_PERFORMANCE_TRACKING}")
+message("GGML_QNN_ENABLE_HEXAGON_BACKEND: ${GGML_QNN_ENABLE_HEXAGON_BACKEND}")
+message("GGML_HEXAGON_NPU_ONLY: ${GGML_HEXAGON_NPU_ONLY}")
 
-target_include_directories(ggml-qnn PRIVATE
-    ${GGML_QNN_SDK_PATH}/include/QNN
-    ${CMAKE_CURRENT_LIST_DIR}/qnn
-    ${CMAKE_CURRENT_LIST_DIR}
+ggml_add_backend_library(ggml-qnn
+    ../../include/ggml-qnn.h
 )
-target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES})
-
-if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "")
-    string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}")
-endif()
-
-message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}")
-target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}")
-
-if(GGML_QNN_ENABLE_CPU_BACKEND)
-    message("GGML_QNN_ENABLE_CPU_BACKEND is enabled")
-    target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND)
-else()
-    message("GGML_QNN_ENABLE_CPU_BACKEND is disabled")
-endif()
+target_link_libraries(ggml-qnn PRIVATE ${COMMON_LINK_LIBRARIES})
 
 if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
     message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled")
@@ -72,6 +52,8 @@ if(GGML_HEXAGON_NPU_ONLY)
     set(GGML_QNN_ENABLE_HEXAGON_BACKEND ON)
 else()
     message("GGML_HEXAGON_NPU_ONLY is disabled")
+    add_subdirectory(qnn)
+    target_link_libraries(ggml-qnn PRIVATE qnn-backend)
 endif()
 
 if(GGML_QNN_ENABLE_HEXAGON_BACKEND)

ggml/src/ggml-qnn/npu/device/device.cpp

Lines changed: 39 additions & 7 deletions
@@ -3,22 +3,38 @@
 #include <HAP_compute_res.h>
 #include <hexagon_types.h>
 
+#include <memory>
 #include <new>
 
 #include "graph.hpp"
 #include "hexagon_npu.h"
 #include "op_impl.hpp"
 #include "remote.h"
 #include "tensor.hpp"
+#include "thread_pool.hpp"
 #include "util.hpp"
 
-#define NPU_UNUSED(x) (void) (x)
-
 namespace {
 
 struct npu_device_context {
-    int unused = 0;
-    // TODO: should we add tensor context here?
+    std::unique_ptr<hexagon::default_thread_pool> thread_pool;
+
+    bool init_thread_pool() {
+        if (thread_pool) {
+            DEVICE_LOG_DEBUG("Thread pool already initialized");
+            return true;
+        }
+
+        auto pool = std::make_unique<hexagon::default_thread_pool>();
+        if (!pool) {
+            DEVICE_LOG_ERROR("Failed to create thread pool");
+            return false;
+        }
+
+        thread_pool = std::move(pool);
+        DEVICE_LOG_DEBUG("Thread pool initialized");
+        return true;
+    }
 };
 
 inline hexagon::tensor * tensor_from_handle(npu_device_graph_handle_t h) {
@@ -37,6 +53,10 @@ inline npu_device_tensor_handle_t graph_to_handle(hexagon::graph * graph) {
     return reinterpret_cast<npu_device_tensor_handle_t>(graph);
 }
 
+inline npu_device_context * device_context_from_handle(remote_handle64 h) {
+    return reinterpret_cast<npu_device_context *>(h);
+}
+
 } // namespace
 
 int npu_device_open(const char * uri, remote_handle64 * h) {
@@ -47,12 +67,18 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
         return AEE_ENOMEMORY;
     }
 
+    if (!context->init_thread_pool()) {
+        DEVICE_LOG_ERROR("Failed to initialize thread pool");
+        delete context;
+        return AEE_EFAILED;
+    }
+
     *h = reinterpret_cast<remote_handle64>(context);
     return AEE_SUCCESS;
 }
 
 int npu_device_close(remote_handle64 h) {
-    auto * context = reinterpret_cast<npu_device_context *>(h);
+    auto * context = device_context_from_handle(h);
     if (!context) {
         DEVICE_LOG_ERROR("Invalid npu_device_context handle");
         return AEE_EINVHANDLE;
@@ -149,13 +175,19 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl
 }
 
 AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
-    NPU_UNUSED(_h);
+    auto dev_ctx = device_context_from_handle(_h);
+    if (!dev_ctx) {
+        DEVICE_LOG_DEBUG("Invalid npu_device_context handle");
+        return AEE_EINVHANDLE;
+    }
+
     auto * graph = graph_from_handle(graph_handle);
     if (!graph) {
+        DEVICE_LOG_ERROR("Invalid graph handle");
        return AEE_EINVHANDLE;
     }
 
-    if (!graph->compute()) {
+    if (!graph->compute(dev_ctx->thread_pool.get())) {
         return AEE_EFAILED;
     }
 
ggml/src/ggml-qnn/npu/device/graph.cpp

Lines changed: 28 additions & 17 deletions
@@ -8,24 +8,23 @@
 
 namespace hexagon {
 
+graph::graph() noexcept {
+    DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
+}
+
 graph::~graph() noexcept {
-    if (_tensors) {
-        delete[] _tensors;
-    }
+    _tensors.reset();
+    DEVICE_LOG_DEBUG("graph(%p) destroyed\n", (void *) this);
 }
 
 void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count) {
-    if (_tensor_count > 0) {
-        delete[] _tensors;
-    }
-
     if (tensor_count <= 0) {
-        _tensors = nullptr;
+        _tensors.reset();
         _tensor_count = 0;
         return;
     }
 
-    _tensors = new (std::nothrow) tensor *[tensor_count];
+    _tensors = std::make_unique<tensor *[]>(size_t(tensor_count));
     for (int i = 0; i < tensor_count; ++i) {
         auto * tensor_obj = reinterpret_cast<tensor *>(tensors[i]);
         _tensors[i] = tensor_obj;
@@ -37,31 +36,43 @@ void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_co
     DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count);
 }
 
-bool graph::compute() {
+bool graph::compute(default_thread_pool * thread_pool) {
     if (!_tensors || !_tensor_count) {
         DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this);
         return true; // return success if no tensors to compute
     }
 
     DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);
+    thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);
+
+    for (size_t i = 0; i < _tensor_count; ++i) {
+        auto * dst = _tensors[i];
+        dst->flush(); // TODO: optimize this
+    }
+
+    return true;
+}
+
+void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph) {
+    NPU_UNUSED(pool);
+    graph->compute_impl(thread_idx, thread_count);
+}
+
+void graph::compute_impl(size_t thread_idx, size_t thread_count) {
     for (size_t i = 0; i < _tensor_count; ++i) {
         auto * dst = _tensors[i];
         auto op = dst->get_op();
         auto * func = get_compute_func(op);
         if (!func) {
             DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
-            return false;
+            return;
         }
 
-        if (!func(dst)) {
+        if (!func(dst, thread_idx, thread_count)) {
             DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
-            return false;
+            return;
         }
-
-        dst->flush(); // TODO: optimize this
     }
-
-    return true;
 }
 
 } // namespace hexagon
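graph::compute() above fans a single task out to every pool thread and blocks until all of them have returned; per the commit message the real pool (thread_pool.hpp, not shown in this commit) is built on qurt_thread and qurt_barrier_t with 4 threads. The sketch below reproduces the same synchronous fan-out contract with std::barrier (C++20) so the sync_execute() call is easier to follow; everything except the sync_execute/task_type shape is illustrative, not the actual implementation:

#include <atomic>
#include <barrier>
#include <cstddef>
#include <thread>
#include <vector>

class sketch_thread_pool {
  public:
    // Matches the shape of default_thread_pool::task_type as used in graph.cpp:
    // every thread receives its index and the total thread count.
    using task_type = void (*)(sketch_thread_pool * pool, size_t thread_idx, size_t thread_count, void * arg);

    static constexpr size_t kThreadCount = 4;  // the commit settles on 4 threads

    sketch_thread_pool() : _pending(kThreadCount), _done(kThreadCount) {
        for (size_t i = 1; i < kThreadCount; ++i) {  // the caller acts as thread 0
            _workers.emplace_back([this, i] { worker_loop(i); });
        }
    }

    ~sketch_thread_pool() {
        _exit.store(true);
        _pending.arrive_and_wait();  // release the workers one last time
        for (auto & w : _workers) {
            w.join();
        }
    }

    // Publish `task`, run slice 0 on the calling thread, and return only after
    // every thread has finished: the "complete and pending signals" of the
    // commit message, expressed as two barriers.
    void sync_execute(task_type task, void * arg) {
        _task = task;
        _arg  = arg;
        _pending.arrive_and_wait();          // wake the workers
        _task(this, 0, kThreadCount, _arg);  // this thread computes slice 0
        _done.arrive_and_wait();             // wait for the other slices
    }

  private:
    void worker_loop(size_t idx) {
        for (;;) {
            _pending.arrive_and_wait();
            if (_exit.load()) {
                return;
            }
            _task(this, idx, kThreadCount, _arg);
            _done.arrive_and_wait();
        }
    }

    std::barrier<>           _pending;
    std::barrier<>           _done;
    std::vector<std::thread> _workers;
    task_type                _task = nullptr;
    void *                   _arg  = nullptr;
    std::atomic<bool>        _exit{false};
};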
ggml/src/ggml-qnn/npu/device/graph.hpp

Lines changed: 11 additions & 8 deletions
@@ -1,29 +1,32 @@
 #pragma once
 
+#include <memory>
+
 #include "hexagon_npu.h"
 #include "tensor.hpp"
+#include "thread_pool.hpp"
 
 namespace hexagon {
 
 class graph {
   public:
     // TODO: add execute direction here
-    explicit graph() noexcept {}
+    explicit graph() noexcept;
 
     ~graph() noexcept;
 
     void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count);
 
-    bool compute();
+    bool compute(default_thread_pool * thread_pool);
 
   private:
-    tensor ** _tensors = nullptr;
-    size_t _tensor_count = 0;
+    static void thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph);
+    void compute_impl(size_t thread_idx, size_t thread_count);
+
+    std::unique_ptr<tensor *[]> _tensors;
+    size_t _tensor_count = 0;
 
-    graph(const graph &) = delete;
-    void operator=(const graph &) = delete;
-    graph(graph &&) = delete;
-    void operator=(graph &&) = delete;
+    DISABLE_COPY_AND_MOVE(graph);
 };
 
 } // namespace hexagon
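The DISABLE_COPY_AND_MOVE(graph) line packages the four deleted special-member declarations into one macro. Its definition is not part of this commit's hunks (it presumably lives in a shared header such as util.hpp); the expansion implied by the deleted lines would be:

// Presumed expansion; the actual definition is not shown in this commit:
#define DISABLE_COPY_AND_MOVE(class_name)          \
    class_name(const class_name &)     = delete;   \
    void operator=(const class_name &) = delete;   \
    class_name(class_name &&)          = delete;   \
    void operator=(class_name &&)      = delete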

ggml/src/ggml-qnn/npu/device/op_impl.cpp

Lines changed: 20 additions & 23 deletions
@@ -76,11 +76,12 @@ inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) {
 }
 
 template <typename _TySrc, typename _TyDst, void (*_RowFunc)(const _TySrc *, const _TySrc *, size_t, _TyDst *)>
-bool element_wise_op(hexagon::tensor * out) {
+bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) {
     if (!out) {
         return false;
     }
 
+    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
     auto * src0 = out->get_src(0);
     auto * src1 = out->get_src(1);
     if (!src0 || !src1) {
@@ -93,28 +94,24 @@ bool element_wise_op(hexagon::tensor * out) {
         return false;
     }
 
-    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
-
-    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
-    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
-    auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_data());
-    for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) {
-        const auto * src0_cube = src0_ptr + i3 * src0->get_nb(3);
-        const auto * src1_cube = src1_ptr + (i3 % src1->get_ne(3)) * src1->get_nb(3);
-        auto * dst_cube = dst_ptr + i3 * out->get_nb(3);
-        for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) {
-            const auto * src0_plane = src0_cube + i2 * src0->get_nb(2);
-            const auto * src1_plane = src1_cube + (i2 % src1->get_ne(2)) * src1->get_nb(2);
-            auto * dst_plane = dst_cube + i2 * out->get_nb(2);
-            for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) {
-                // TODO: prefetch row?
-                auto * src0_row = src0_plane + i1 * src0->get_nb(1);
-                auto * src1_row = src1_plane + (i1 % src1->get_ne(1)) * src1->get_nb(1);
-                auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * out->get_nb(1));
-                _RowFunc(reinterpret_cast<const _TySrc *>(src0_row), reinterpret_cast<const _TySrc *>(src1_row),
-                         static_cast<size_t>(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row));
-            }
-        }
+    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
+    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
+    auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_data());
+    auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
+    const auto rows_per_box = out->get_ne(2) * out->get_ne(1);
+    const auto start_end = hexagon::get_thread_work_slice(total_rows, tidx, tcnt);
+    for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
+        const auto i03 = ir / rows_per_box;
+        const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
+        const auto i01 = ir % out->get_ne(1);
+        const auto i13 = i03 % src1->get_ne(3);
+        const auto i12 = i02 % src1->get_ne(2);
+        const auto i11 = i01 % src1->get_ne(1);
+        auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
+        auto * src1_row = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2) + i11 * src1->get_nb(1);
+        auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
+        _RowFunc(reinterpret_cast<const _TySrc *>(src0_row), reinterpret_cast<const _TySrc *>(src1_row),
+                 static_cast<size_t>(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row));
    }
 
     return true;

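The rewrite above is what enables the multi-threading: the old (i3, i2, i1) loop nest is flattened into a single row index ir so get_thread_work_slice() can hand each thread a contiguous run of rows, and the coordinates are recovered with div/mod (i02 uses ir / ne(1) - i03 * ne(2) rather than a second modulo). A standalone check of that arithmetic, with illustrative shape values:

#include <cassert>
#include <cstdint>

int main() {
    // Illustrative shape: ne(3)=2, ne(2)=3, ne(1)=4, i.e. 24 rows in total.
    const int64_t ne1 = 4, ne2 = 3, ne3 = 2;
    const int64_t rows_per_box = ne2 * ne1;
    int64_t ir = 0;
    for (int64_t i3 = 0; i3 < ne3; ++i3) {
        for (int64_t i2 = 0; i2 < ne2; ++i2) {
            for (int64_t i1 = 0; i1 < ne1; ++i1, ++ir) {
                // Same recovery expressions as element_wise_op above.
                assert(ir / rows_per_box == i3);
                assert(ir / ne1 - i3 * ne2 == i2);
                assert(ir % ne1 == i1);
            }
        }
    }
    return 0;
}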
ggml/src/ggml-qnn/npu/device/op_impl.hpp

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 
 namespace hexagon {
 
-typedef bool (*compute_func_type)(tensor * dst);
+typedef bool (*compute_func_type)(tensor * dst, size_t tidx, size_t tcnt);
 typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                                           const npu_device_tensor_spec & dst, npu_device_tensor_op op);
 