
Commit 379bdeb

feat: perf opt gemv (#54)
* add GEMV implementation for matrix multiplication in hexagon
* refactor: optimize GEMV implementation for matrix multiplication in hexagon
* wip
* refactor: enhance caching mechanism in GEMV implementation for matrix multiplication
* wip
* refactor: streamline caching logic in GEMV implementation for matrix multiplication
* wip
* wip
* fix broadcast in flash_attn
* format
* refactor: optimize memory fetching in matrix multiplication implementations
* wip
* fix aligned gemv
* rename
* refactor: remove unused memory cache functions and initialize VTCM cache
* wip
* feat: add vector math functions for IEEE float and half float operations
* feat: add vec_silu_f32 and vec_silu_f16 functions for SiLU activation
* feat: implement GLU operation support in tensor processing
* feat: add GLU operation support and related enhancements in tensor processing
* wip
* wip
* wip
* feat: add qhmath_hvx_div_vf functions for f32 vector operations
* feat: add qhmath_hvx_div_vhf functions for f16 vector operations
* fix: reorder parameters in vector operation functions for consistency
* wip
* feat: enhance vector operations with parameterized transformations and improved GLU implementations
* wip
* fix: increase default stack size and correct thread parameter indexing in thread pool
* fix f16 div
* fix f32 div
* fix: update GLU vector operations to use explicit denominator calculation
* wip
* wip
* Refactor cacheability check for matrix multiplication to handle multiple source tensors
* Revert "fix: increase default stack size and correct thread parameter indexing in thread pool"; this reverts commit 40e3f09.
* wip
* fix comments
* replace copy with memcpy
1 parent 6260c31 commit 379bdeb

24 files changed: +2334, -440 lines
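
For context, the GEMV path this commit optimizes computes a matrix-vector product, and the new vec_silu_f32/vec_silu_f16 helpers implement the SiLU activation. Below is a plain scalar reference of both operations, added here for illustration only: it is not the HVX/VTCM code introduced by this commit, and the function names are placeholders.

#include <cmath>
#include <cstddef>

// Scalar reference of GEMV (y = A * x) for a row-major rows x cols matrix.
// The hexagon implementation in this commit vectorizes this with HVX and
// caches operands in VTCM; none of that is shown here.
static void gemv_f32_ref(const float * a, const float * x, float * y, size_t rows, size_t cols) {
    for (size_t r = 0; r < rows; ++r) {
        float acc = 0.0f;
        const float * row = a + r * cols;
        for (size_t c = 0; c < cols; ++c) {
            acc += row[c] * x[c];  // dot product of row r with x
        }
        y[r] = acc;
    }
}

// Scalar reference of the SiLU activation behind vec_silu_f32/vec_silu_f16:
// silu(x) = x * sigmoid(x) = x / (1 + exp(-x)).
static void vec_silu_f32_ref(const float * src, float * dst, size_t count) {
    for (size_t i = 0; i < count; ++i) {
        dst[i] = src[i] / (1.0f + std::exp(-src[i]));
    }
}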

ggml/src/ggml-qnn/npu/device/device.cpp

Lines changed: 27 additions & 17 deletions
@@ -1,10 +1,4 @@
 
-#include <AEEStdErr.h>
-#include <HAP_compute_res.h>
-#include <hexagon_types.h>
-
-#include <memory>
-
 #include "graph.hpp"
 #include "hexagon_npu.h"
 #include "op_impl.hpp"
@@ -14,6 +8,12 @@
 #include "type_traits.hpp"
 #include "util.hpp"
 
+#include <AEEStdErr.h>
+#include <HAP_compute_res.h>
+#include <hexagon_types.h>
+
+#include <memory>
+
 namespace {
 
 struct npu_device_context {
@@ -130,28 +130,34 @@ AEEResult npu_device_device_get_alignment(remote_handle64 _h, uint32_t * alignme
 return AEE_SUCCESS;
 }
 
-AEEResult npu_device_device_support_op(remote_handle64 _h, npu_device_tensor_op op, const npu_device_tensor_spec * dst,
-const npu_device_tensor_spec * srcs, int srcsLen, boolean * is_supported) {
+AEEResult npu_device_device_support_op(remote_handle64 _h,
+const npu_device_tensor_op_spec * op_spec,
+const npu_device_tensor_spec * dst,
+const npu_device_tensor_spec * srcs,
+int srcsLen,
+boolean * is_supported) {
 NPU_UNUSED(_h);
 
 if (!srcs || srcsLen <= 0 || !dst || !is_supported) {
 DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
 return AEE_EINVARGS;
 }
 
-*is_supported = hexagon::support_op(op, dst, srcs, srcsLen);
+*is_supported = hexagon::support_op(op_spec, dst, srcs, srcsLen);
 return AEE_SUCCESS;
 }
 
-AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_config * info,
-npu_device_tensor_handle_t * tensor_handle) {
+AEEResult npu_device_tensor_init(remote_handle64 _h,
+const npu_device_tensor_config * info,
+npu_device_tensor_handle_t * tensor_handle) {
 NPU_UNUSED(_h);
 auto * tensor = new hexagon::tensor(*info);
 *tensor_handle = tensor_to_handle(tensor);
 return AEE_SUCCESS;
 }
 
-AEEResult npu_device_tensor_update_params(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
+AEEResult npu_device_tensor_update_params(remote_handle64 _h,
+npu_device_tensor_handle_t tensor_handle,
 const npu_device_tensor_update_config * config) {
 NPU_UNUSED(_h);
 auto * tensor = tensor_from_handle(tensor_handle);
@@ -174,8 +180,9 @@ AEEResult npu_device_tensor_free(remote_handle64 _h, npu_device_tensor_handle_t
 return AEE_SUCCESS;
 }
 
-AEEResult npu_device_tensors_free(remote_handle64 _h, const npu_device_tensor_handle_t * tensor_handles,
-int tensor_handlesLen) {
+AEEResult npu_device_tensors_free(remote_handle64 _h,
+const npu_device_tensor_handle_t * tensor_handles,
+int tensor_handlesLen) {
 NPU_UNUSED(_h);
 if (!tensor_handles || tensor_handlesLen < 0) {
 DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments");
@@ -201,8 +208,10 @@ AEEResult npu_device_graph_init(remote_handle64 _h, npu_device_graph_handle_t *
 return AEE_SUCCESS;
 }
 
-AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
-const npu_device_tensor_handle_t * tensor_handles, int tensor_handlesLen) {
+AEEResult npu_device_graph_set_tensor(remote_handle64 _h,
+npu_device_graph_handle_t graph_handle,
+const npu_device_tensor_handle_t * tensor_handles,
+int tensor_handlesLen) {
 NPU_UNUSED(_h);
 auto * graph = graph_from_handle(graph_handle);
 if (!graph || !tensor_handles || tensor_handlesLen <= 0) {
@@ -213,7 +222,8 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl
 return AEE_SUCCESS;
 }
 
-AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
+AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h,
+npu_device_graph_handle_t graph_handle,
 const npu_device_tensor_handle_t * tensor_handles,
 int tensor_handlesLen,
 const npu_device_tensor_update_config * tensor_params,
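
The substantive interface change in this file is that npu_device_device_support_op now receives a pointer to npu_device_tensor_op_spec instead of a bare npu_device_tensor_op enum. A hedged caller-side sketch follows; only the op field of the new struct is visible in this diff (it is read as op_spec->op in op_flash_attn.cpp), so the helper name and everything else about how the handle and tensor specs are obtained are assumptions for illustration, not code from this commit.

#include "hexagon_npu.h"  // assumed to declare the IDL-generated npu_device_* stubs and spec types

// Hypothetical host-side helper: ask the NPU whether it supports flash-attention
// for the given tensor specs via the new op_spec-based API.
static bool can_offload_flash_attn(remote_handle64 handle,
                                   const npu_device_tensor_spec * dst,
                                   const npu_device_tensor_spec * srcs,
                                   int srcs_len) {
    npu_device_tensor_op_spec op_spec = {};
    op_spec.op = NPU_OP_FLASH_ATTN;  // only the `op` member is confirmed by this diff

    boolean supported = 0;
    const AEEResult res = npu_device_device_support_op(handle, &op_spec, dst, srcs, srcs_len, &supported);
    return res == AEE_SUCCESS && supported != 0;
}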

ggml/src/ggml-qnn/npu/device/graph.cpp

Lines changed: 16 additions & 8 deletions
@@ -1,12 +1,12 @@
 
 #include "graph.hpp"
 
-#include <new>
-
 #include "op_impl.hpp"
 #include "util.hpp"
 #include "vtcm_mem.hpp"
 
+#include <new>
+
 namespace hexagon {
 
 graph::graph() noexcept {
@@ -30,8 +30,12 @@ void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_co
 for (int i = 0; i < tensor_count; ++i) {
 auto * tensor_obj = reinterpret_cast<tensor *>(tensors[i]);
 _tensors[i] = tensor_obj;
-DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %s\n", (void *) this, i, (void *) tensor_obj,
-(void *) tensor_obj->get_src(0), (void *) tensor_obj->get_src(1),
+DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %s\n",
+(void *) this,
+i,
+(void *) tensor_obj,
+(void *) tensor_obj->get_src(0),
+(void *) tensor_obj->get_src(1),
 op_get_name(tensor_obj->get_op()));
 }
 
@@ -64,8 +68,9 @@ bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_
 return true;
 }
 
-void graph::thread_pool_task(default_thread_pool * pool, default_thread_pool::thread_params * thread_params,
-void * graph) {
+void graph::thread_pool_task(default_thread_pool * pool,
+default_thread_pool::thread_params * thread_params,
+void * graph) {
 reinterpret_cast<hexagon::graph *>(graph)->compute_impl(pool, thread_params);
 }
 
@@ -86,8 +91,11 @@ void graph::compute_impl(default_thread_pool * pool, default_thread_pool::thread
 
 const bool should_sync = requires_thread_barrier(op);
 if (pool && should_sync && i < _tensor_count - 1) {
-DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu, tensor[%zu/%zu]", (void *) this,
-params.get_thread_index(), i, _tensor_count);
+DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu, tensor[%zu/%zu]",
+(void *) this,
+params.get_thread_index(),
+i,
+_tensor_count);
 pool->sync_thread();
 }
 }
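
The compute_impl hunk above shows the synchronization rule: after each op that requires_thread_barrier, every worker waits before moving on to the next op, and the barrier is skipped after the last op because graph::compute joins the workers anyway. The implementation of default_thread_pool::sync_thread is not part of this diff, so the sketch below illustrates the same pattern with std::barrier (C++20) as a stand-in; the loop bounds and the should_sync flag are placeholders.

#include <barrier>
#include <cstddef>
#include <thread>
#include <vector>

int main() {
    constexpr std::size_t kThreads = 4;  // stand-in for the pool's worker count
    constexpr std::size_t kOps     = 3;  // stand-in for _tensor_count

    std::barrier sync_point(kThreads);
    std::vector<std::thread> workers;

    for (std::size_t t = 0; t < kThreads; ++t) {
        workers.emplace_back([&] {
            for (std::size_t i = 0; i < kOps; ++i) {
                // ... each worker computes its slice of op i here ...
                const bool should_sync = true;  // plays the role of requires_thread_barrier(op)
                if (should_sync && i < kOps - 1) {
                    sync_point.arrive_and_wait();  // all slices of op i finish before op i+1 reads them
                }
            }
        });
    }
    for (auto & w : workers) {
        w.join();  // the final join takes the place of a barrier after the last op
    }
    return 0;
}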

ggml/src/ggml-qnn/npu/device/graph.hpp

Lines changed: 5 additions & 4 deletions
@@ -1,11 +1,11 @@
 #pragma once
 
-#include <memory>
-
 #include "hexagon_npu.h"
 #include "tensor.hpp"
 #include "thread_pool.hpp"
 
+#include <memory>
+
 namespace hexagon {
 
 class graph {
@@ -20,8 +20,9 @@ class graph {
 bool compute(default_thread_pool * thread_pool, const float * f16_to_f32_table);
 
 private:
-static void thread_pool_task(default_thread_pool * pool, default_thread_pool::thread_params * thread_params,
-void * graph);
+static void thread_pool_task(default_thread_pool * pool,
+default_thread_pool::thread_params * thread_params,
+void * graph);
 void compute_impl(default_thread_pool * pool, default_thread_pool::thread_params * thread_params);
 
 std::unique_ptr<tensor *[]> _tensors;

ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp

Lines changed: 49 additions & 23 deletions
@@ -14,15 +14,20 @@ inline float f16_to_f32(const npu_device_fp16_t src) {
 
 // From: ggml/src/ggml-cpu/ops.cpp
 template <bool _IsKvF16>
-void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hexagon::tensor * k,
-const hexagon::tensor * v, const hexagon::tensor * mask, hexagon::compute_params * params) {
+void flash_attn_impl(hexagon::tensor * out,
+const hexagon::tensor * q,
+const hexagon::tensor * k,
+const hexagon::tensor * v,
+const hexagon::tensor * mask,
+hexagon::compute_params * params) {
 static_assert(3 <= hexagon::kMaxParamsCount, "flash_attn op params count exceeds max params count");
 
 constexpr const npu_device_tensor_data_type kKvDataType = _IsKvF16 ? NPU_DATA_TYPE_F16 : NPU_DATA_TYPE_F32;
 
 if (k->get_type() != kKvDataType || v->get_type() != k->get_type()) {
 DEVICE_LOG_ERROR("flash_attn_impl: k and v must have same type, got k: %s, v: %s\n",
-hexagon::get_type_name(k->get_type()), hexagon::get_type_name(v->get_type()));
+hexagon::get_type_name(k->get_type()),
+hexagon::get_type_name(v->get_type()));
 return;
 }
 
@@ -80,7 +85,8 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex
 const auto out_rows_per_batch = out->get_ne(2) * out->get_ne(1);
 uint8_t * dst_ptr = out->get_write_buffer();
 if (!dst_ptr) {
-DEVICE_LOG_ERROR("flash_attn_impl: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) out,
+DEVICE_LOG_ERROR("flash_attn_impl: dst_ptr is not writable, tensor: %p, type: %s\n",
+(void *) out,
 hexagon::get_type_name(out->get_type()));
 return;
 }
@@ -118,7 +124,8 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex
 
 const npu_device_fp16_t * mp =
 mask_ptr ? reinterpret_cast<const npu_device_fp16_t *>(mask_ptr + iq1 * mask->get_nb(1) +
-(iq3 % mask->get_ne(2)) * mask->get_nb(2)) :
+(iq2 % mask->get_ne(2)) * mask->get_nb(2) +
+(iq3 % mask->get_ne(3)) * mask->get_nb(3)) :
 nullptr;
 
 // k indices
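
The hunk above is the behavioral flash-attention fix in this commit ("fix broadcast in flash_attn"): the mask row pointer previously folded iq3 into the mask's dimension 2, whereas it now takes iq2 modulo ne(2) and iq3 modulo ne(3), matching the usual broadcast rule. A generic illustration of that addressing is sketched below; it is not code from this commit, and the helper name is made up.

#include <cstddef>
#include <cstdint>

// Generic broadcast addressing: when the broadcast tensor (here, the mask) has
// extent 1 along a dimension, the modulo maps every destination index back to
// that single plane, so one mask is reused across q's batch/head dimensions.
static inline const uint8_t * broadcast_row(const uint8_t * base,
                                            int64_t i1, int64_t i2, int64_t i3,
                                            const int64_t ne[4],   // extents of the broadcast tensor
                                            const size_t  nb[4]) { // strides in bytes
    return base + i1 * nb[1] + (i2 % ne[2]) * nb[2] + (i3 % ne[3]) * nb[3];
}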
@@ -251,8 +258,8 @@ bool flash_attn_f32(tensor * out, compute_params * params) {
 const auto * v = out->get_src(2);
 const auto * mask = out->get_src(3);
 if (!q || !k || !v || !mask) {
-DEVICE_LOG_DEBUG("invalid src tensors: q: %p, k: %p, v: %p, mask: %p\n", (void *) q, (void *) k, (void *) v,
-(void *) mask);
+DEVICE_LOG_DEBUG(
+"invalid src tensors: q: %p, k: %p, v: %p, mask: %p\n", (void *) q, (void *) k, (void *) v, (void *) mask);
 return false;
 }
 
@@ -264,8 +271,11 @@ bool flash_attn_f32(tensor * out, compute_params * params) {
 return true;
 }
 
-bool is_flash_attn_supported(npu_device_tensor_op op, const npu_device_tensor_spec * dst,
-const npu_device_tensor_spec * srcs, size_t src_len) {
+bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec,
+const npu_device_tensor_spec * dst,
+const npu_device_tensor_spec * srcs,
+size_t src_len) {
+const auto op = op_spec->op;
 if (op != NPU_OP_FLASH_ATTN) {
 DEVICE_LOG_DEBUG("op is not NPU_OP_FLASH_ATTN: %d\n", op);
 return false;
@@ -295,7 +305,9 @@ bool is_flash_attn_supported(npu_device_tensor_sp
 
 const auto * v = &srcs[2];
 if (v->type != k->type) { // TODO: support more v types
-DEVICE_LOG_DEBUG("[%s]v type is not the same as k: %s vs %s\n", op_get_name(op), get_type_name(v->type),
+DEVICE_LOG_DEBUG("[%s]v type is not the same as k: %s vs %s\n",
+op_get_name(op),
+get_type_name(v->type),
 get_type_name(k->type));
 return false;
 }
@@ -310,28 +322,42 @@ bool is_flash_attn_supported(npu_device_tensor_sp
 DEVICE_LOG_DEBUG(
 "[%s]dst shape does not match q and v: dst ne: %ld, %ld, %ld, %ld, q ne: %ld, %ld, %ld, %ld, "
 "v ne: %ld, %ld, %ld, %ld\n",
-op_get_name(op), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], q->ne[0], q->ne[1], q->ne[2], q->ne[3],
-v->ne[0], v->ne[1], v->ne[2], v->ne[3]);
+op_get_name(op),
+dst->ne[0],
+dst->ne[1],
+dst->ne[2],
+dst->ne[3],
+q->ne[0],
+q->ne[1],
+q->ne[2],
+q->ne[3],
+v->ne[0],
+v->ne[1],
+v->ne[2],
+v->ne[3]);
 return false;
 }
 
 if (is_transposed_or_permuted(dst->nb)) {
-DEVICE_LOG_DEBUG("[%s]dst cannot be transposed or permuted, nb: %zu, %zu, %zu, %zu\n", op_get_name(op),
-dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
+DEVICE_LOG_DEBUG("[%s]dst cannot be transposed or permuted, nb: %zu, %zu, %zu, %zu\n",
+op_get_name(op),
+dst->nb[0],
+dst->nb[1],
+dst->nb[2],
+dst->nb[3]);
 return false;
 }
 
 if (q->ne[0] != k->ne[0]) {
 DEVICE_LOG_DEBUG("[%s]q and k shapes do not match: q ne: %ld, %ld, %ld, %ld, k ne: %ld, %ld, %ld, %ld\n",
-op_get_name(op), q->ne[0], q->ne[1], q->ne[2], q->ne[3], k->ne[0], k->ne[1], k->ne[2],
-k->ne[3]);
-return false;
-}
-
-if (q->ne[2] != k->ne[2] || q->ne[3] != k->ne[3] || q->ne[3] != 1) {
-// TODO: add broadcast support
-DEVICE_LOG_DEBUG("[%s]q and k shapes do not match: q ne: %ld, %ld, %ld, %ld, k ne: %ld, %ld, %ld, %ld\n",
-op_get_name(op), q->ne[0], q->ne[1], q->ne[2], q->ne[3], k->ne[0], k->ne[1], k->ne[2],
+op_get_name(op),
+q->ne[0],
+q->ne[1],
+q->ne[2],
+q->ne[3],
+k->ne[0],
+k->ne[1],
+k->ne[2],
 k->ne[3]);
 return false;
 }

ggml/src/ggml-qnn/npu/device/op_flash_attn.hpp

Lines changed: 4 additions & 2 deletions
@@ -5,7 +5,9 @@
 namespace hexagon {
 
 bool flash_attn_f32(tensor * out, compute_params * params);
-bool is_flash_attn_supported(npu_device_tensor_op op, const npu_device_tensor_spec * dst,
-const npu_device_tensor_spec * srcs, size_t src_len);
+bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec,
+const npu_device_tensor_spec * dst,
+const npu_device_tensor_spec * srcs,
+size_t src_len);
 
 } // namespace hexagon
