diff --git a/.gitignore b/.gitignore index d571806dbfd95..83da95a2c0953 100644 --- a/.gitignore +++ b/.gitignore @@ -145,4 +145,13 @@ poetry.toml # Local scripts /run-vim.sh /run-chat.sh + +HEXAGON_Tools/ +prebuilts/QNN_SDK/qairt/2.35.0.250530/ +prebuilts/QNN_SDK/qairt/2.36.0.250627/ +prebuilts/QNN_SDK/v2.35.0.250530.zip +prebuilts/QNN_SDK/v2.36.0.250627.zip +prebuilts/Hexagon_SDK/minimal-hexagon-sdk-6.2.0.1.xz +prebuilts/OpenCL_SDK/ +prebuilts/Vulkan_SDK/ HEXAGON_Tools/
diff --git a/ggml/include/ggml-hexagon.h b/ggml/include/ggml-hexagon.h index 0d41a955f6715..fe9d4d8e588ba 100644 --- a/ggml/include/ggml-hexagon.h +++ b/ggml/include/ggml-hexagon.h @@ -21,15 +21,30 @@ enum HEXAGONBackend { HEXAGON_BACKEND_GGML = 4, //"fake" HEXAGON backend for comparing performance between the HEXAGON backend and the ggml backend }; -GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(size_t dev_num, const char * qnn_lib_path); +//0: general approach through QNN: offload ggmlop to QNN (QNNCPU, QNNGPU, QNNNPU) +//1: special approach through QNN-SINGLEGRAPH: mapping entire ggml cgraph to a single QNN graph +//2: general approach through Hexagon cDSP: offload ggmlop to Hexagon cDSP directly +enum hwaccel_approach_type { + HWACCEL_QNN = 0, + HWACCEL_QNN_SINGLEGRAPH = 1, + HWACCEL_CDSP = 2, +}; + +GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(size_t dev_num, const char * qnn_lib_path); -GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend); -GGML_BACKEND_API int ggml_backend_hexagon_get_device_count(void); +GGML_BACKEND_API int ggml_backend_hexagon_get_device_count(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void); -const char * ggml_backend_hexagon_get_devname(size_t dev_num); +GGML_BACKEND_API const char * ggml_backend_hexagon_get_devname(size_t dev_num); + +GGML_BACKEND_API void ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach); + +GGML_BACKEND_API int ggml_backend_hexagon_get_mulmat_algotype(void); + +GGML_BACKEND_API void ggml_backend_hexagon_set_mulmat_algotype(int new_mulmat_algotype); #ifdef __cplusplus }
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index ffd83931add5c..c25485536b5a5 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -196,7 +196,7 @@ struct ggml_backend_registry { #ifdef GGML_USE_HEXAGON register_backend(ggml_backend_hexagon_reg()); #endif - + #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); #endif
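[Reviewer note] The new public API above (enum hwaccel_approach_type plus the ggml_backend_hexagon_set_cfg / get_mulmat_algotype / set_mulmat_algotype entry points) is easiest to understand from the caller side. A minimal sketch, assuming HEXAGON_BACKEND_CDSP is one of the HEXAGONBackend enum values (it is referenced throughout ggml-hexagon.cpp but not visible in this hunk); the wrapper function itself is hypothetical:

```cpp
#include "ggml-hexagon.h"

// hypothetical helper: persist the desired device/approach pair into
// ggml-hexagon.cfg, then create the backend for that device
static ggml_backend_t init_hexagon_cdsp(const char * qnn_lib_path) {
    ggml_backend_hexagon_set_cfg(HEXAGON_BACKEND_CDSP, HWACCEL_CDSP); // rewrites ggml-hexagon.cfg
    int algo = ggml_backend_hexagon_get_mulmat_algotype();            // read back current mulmat algo
    (void) algo;
    return ggml_backend_hexagon_init(HEXAGON_BACKEND_CDSP, qnn_lib_path);
}
```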
diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index 0dcd7e5e2a168..3515106cc23da 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -39,13 +39,19 @@ endif() #check whether user's specified htp arch is valid set(CHECK_HTP_ARCH "WRONG") -foreach (feat v68 v69 v73 v75 v79) +#ref: https://github.com/quic/ai-hub-apps/tree/main/tutorials/llm_on_genie +#foreach (feat v68 v69 v73 v75 v79) +#foreach (feat v73 v75 v79) +#to simplify the workflow, only v75 and v79 are supported, i.e. only 8 Gen 3 and 8 Elite +foreach (feat v75 v79) if (${feat} STREQUAL ${HTP_ARCH_VERSION}) set(CHECK_HTP_ARCH "GOOD") endif() endforeach() if (${CHECK_HTP_ARCH} STREQUAL "WRONG") - message(FATAL_ERROR "ggml-hexagon backend only support htp arch v68,v69,v73,v75,v79") + #message(FATAL_ERROR "ggml-hexagon backend only support htp arch v68,v69,v73,v75,v79") + #to simplify the workflow, only v75 and v79 are supported, i.e. only 8 Gen 3 and 8 Elite + message(FATAL_ERROR "ggml-hexagon backend only supports htp arch v75,v79") endif() #check optimization flags @@ -92,10 +98,10 @@ else() message(FATAL_ERROR "ggml-hexagon now only available on Android and Windows(Windows on ARM)") endif() -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${DEBUG_FLAG} ${OPT_FLAG}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DEBUG_FLAG} ${OPT_FLAG}") -set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} ${DEBUG_FLAG} ${OPT_FLAG}") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/stub.c") ggml_add_backend_library(ggml-hexagon ${HEXAGON_SOURCES}) @@ -118,7 +124,7 @@ function(ggml_hexagon_build_kernel KNAME) COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ clean COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ HEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} HTP_ARCH_VERSION=${HTP_ARCH_VERSION} DEBUG_FLAG=${DEBUG_FLAG} COMMAND echo "current working path:`pwd`\n" - COMMAND ls -l ../../../bin/libggmlop-skel.so + COMMAND ls -l ../../../bin/libggmldsp-skel.so COMMENT "build hexagon-kernel" ) endfunction()
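[Reviewer note] The v75/v79 allow-list above mirrors the runtime check in ggml-hexagon.cpp, where the cDSP reports its arch as a hex value (0x75, 0x79) that is then converted to decimal. A small self-contained sketch of that mapping (helper names are illustrative; the real code uses ggmlhexagon_htparch_hex_to_decimal):

```cpp
#include <cstddef>

// 0x75 -> 75, 0x79 -> 79: the arch byte is BCD-like, so convert per nibble
static size_t htparch_hex_to_decimal(int dsp_version) {
    return static_cast<size_t>((dsp_version >> 4) * 10 + (dsp_version & 0xF));
}

// matches the CMake allow-list: v75 (Snapdragon 8 Gen 3) and v79 (8 Elite)
static bool is_supported_htp_arch(size_t arch) {
    return arch == 75 || arch == 79;
}
```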
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 251dcc586c51e..74f3a2461a10b 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1,4 +1,5 @@ /* + * Copyright (c) zhouwg(https://github.com/zhouwg) * Copyright (c) 2024-2025 The ggml authors * * Qualcomm QNN SDK and reference tech guides could be found at: @@ -59,6 +60,7 @@ #include #include #include +#include #if defined(__ANDROID__) || defined(__linux__) #include @@ -154,10 +156,9 @@ struct ggml_backend_hexagon_context; #if !defined (_WINDOWS) #pragma weak remote_system_request +#pragma weak remote_session_control #endif -#define MAX_DOMAIN_NAMELEN 12 - #define CHECK_QNN_API(error, result) \ do { \ error = (result); \ @@ -179,6 +180,9 @@ struct ggml_backend_hexagon_context; } \ } while (0) \ +#ifndef ggmlop_URI +#define ggmlop_URI "file:///libggmldsp-skel.so?ggmldsp_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" +#endif // ================================================================================================= // section-1: data type, data structure, global vars // ================================================================================================= using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); using pfn_rpc_mem_free = void (*)(void *); using pfn_rpc_mem_to_fd = int (*)(void *); +using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); using pfn_rpc_remote_handle_control = int (*)(uint32_t, void*, uint32_t); using pfn_rpc_remote_register_buf = int (*)(void*, int, int); using pfn_rpc_remote_session_control = int (*)(uint32_t, void *, uint32_t); @@ -195,9 +202,6 @@ using pfn_rpc_remote_handle64_close = int (*)(remote_handle64); using pfn_rpc_remote_handle64_invoke = int (*)(remote_handle64, uint32_t, remote_arg *); using pfn_rpc_remote_handle64_control = int (*)(remote_handle64, uint32_t, void*, uint32_t); -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); //QNN resource management for the general approach through QNN using qnn_tensors_t = std::vector< Qnn_Tensor_t >; @@ -219,15 +223,6 @@ enum qnn_profile_level { PROFILE_DETAIL = 2, }; -//0: general approach through QNN:offload ggmlop to QNN -//1: special approach through QNN-SINGLEGRAPH:mapping entire ggml cgraph to a single QNN graph -//2: general approach through Hexagon cDSP:offload ggmlop to Hexagon cDSP directly -enum hwaccel_approach_type { - HWACCEL_QNN = 0, - HWACCEL_QNN_SINGLEGRAPH = 1, - HWACCEL_CDSP = 2, -}; - enum hexagon_dsp_type { HEXAGON_ADSP = 0, HEXAGON_MDSP = 1, @@ -253,7 +248,7 @@ enum qcom_chipset_soc_model { SM8475 = 42, // v69, SD 8+ Gen 1 SM8550 = 43, // v73, SD 8 Gen 2 SM8650 = 57, // v75, SD 8 Gen 3 - SM8750 = 69, // v79, SD 8 Elite(aka 8 Gen 4) + SM8750 = 69, // v79, SD 8 Elite #if !defined(__ANDROID__) && !defined(__linux__) SC7280X = 44, SC8280X = 37, @@ -355,6 +350,7 @@ struct hexagon_appcfg_t { int profiler_duration; // threshold of duration in profiler, per seconds int profiler_counts; // threshold of counts in profiler int thread_counts; // thread_counts on cDSP side + int mulmat_algotype; // algorithm type of mulmat on cDSP side const char * cfgfilename; const char * runtime_libpath; char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN]; @@ -377,12 +373,23 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .hexagon_backend = HEXAGON_BACKEND_CDSP, .enable_rpc_ion_mempool = 0, .enable_all_q_mulmat = 0, - .profiler_duration = 5, + .profiler_duration = 5, //seconds .profiler_counts = 100, .thread_counts = 4, + .mulmat_algotype = 0, .cfgfilename = "ggml-hexagon.cfg", +#if defined(__ANDROID__) + #if defined(STANDARD_ANDROID_APP) + .runtime_libpath = "/data/data/com.kantvai.kantvplayer/", + #else .runtime_libpath = "/data/data/com.layla/files/app-data/qnn-inference/", - .ggml_hexagon_version = {"1.08"}, + #endif +#elif defined(__linux__) + .runtime_libpath = "/tmp/", +#elif defined(_WIN32) + .runtime_libpath = "C:\\", +#endif + .ggml_hexagon_version = {"1.13"}, .ggml_dsp_version = {"0.63"}, }; @@ -435,7 +442,7 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { .soc_model = SM8750, .htp_arch = V79, .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Elite(aka 8 Gen 4)"}, + .soc_desc = "Qualcomm SnapDragon 8 Elite"}, #if !defined(__ANDROID__) && !defined(__linux__) /* Qualcomm SnapDragon 7c Gen 2 */ @@ -627,6 +634,7 @@ static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {false, GGML_OP_TRANSPOSE, 0, nullptr}, {false, GGML_OP_GET_ROWS, 0, nullptr}, {false, GGML_OP_GET_ROWS_BACK, 0, nullptr}, + {false, GGML_OP_SET_ROWS, 0, nullptr}, {false, GGML_OP_DIAG, 0, nullptr}, {false, GGML_OP_DIAG_MASK_INF, 0, nullptr}, {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr}, @@ -638,6 +646,7 @@ static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr}, {false, GGML_OP_IM2COL, 0, nullptr}, {false, GGML_OP_IM2COL_BACK, 0, nullptr}, + {false, GGML_OP_CONV_2D, 0, nullptr}, {false, GGML_OP_CONV_2D_DW, 0, nullptr}, {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr}, {false, GGML_OP_POOL_1D, 0, nullptr},
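[Reviewer note] The rows added in these hunks (GGML_OP_SET_ROWS, GGML_OP_CONV_2D, GGML_OP_ROLL) keep the capability table aligned with the GGML_OP enum: each op's entry sits at the index of its enum value, which the static_asserts after the tables enforce. A distilled sketch of that lookup invariant (types and helper are illustrative, not the patch's exact declarations):

```cpp
#include <cstddef>

struct op_caps_row { bool supported; int op; };

// the table is keyed by the GGML op enum itself: O(1) direct indexing,
// valid only while the row count tracks GGML_OP_COUNT exactly
static bool op_supported(const op_caps_row * table, size_t count, int op) {
    return static_cast<size_t>(op) < count && table[op].supported;
}
```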
@@ -646,6 +655,7 @@ static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {false, GGML_OP_UPSCALE, 0, nullptr}, {false, GGML_OP_PAD, 0, nullptr}, {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr}, + {false, GGML_OP_ROLL, 0, nullptr}, {false, GGML_OP_ARANGE, 0, nullptr}, {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr}, {false, GGML_OP_ARGSORT, 0, nullptr}, @@ -669,28 +679,14 @@ static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr}, {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr}, {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_ABS), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SGN), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_NEG), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_STEP), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_TANH), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_ELU), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_RELU), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SIGMOID), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU_ERF), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU_QUICK), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SILU), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_HARDSWISH), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_EXP), 0, nullptr} + {false, GGML_OP_GLU, 0, nullptr}, }; static_assert(ggmlqnn_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); static_assert(ggmlqnn_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL].supported, "GGML_OP_MUL is not true"); static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); -static_assert(std::size(ggmlqnn_k_op_caps) == (static_cast<size_t>(GGML_OP_COUNT) + static_cast<size_t>(GGML_UNARY_OP_COUNT)), +static_assert(std::size(ggmlqnn_k_op_caps) == (static_cast<size_t>(GGML_OP_COUNT)), "pls check ggmlqnn_k_op_caps and ensure it corresponds to latest ggml.h"); //supported ggml op by HWACCEL_CDSP @@ -700,12 +696,12 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {true, GGML_OP_ADD, 2, "ggmlop_dsp_add", ggmlop_dsp_add}, {false, GGML_OP_ADD1, 0, nullptr, nullptr}, {false, GGML_OP_ACC, 0, nullptr, nullptr}, - {false, GGML_OP_SUB, 2, nullptr, nullptr}, - {false, GGML_OP_MUL, 2, nullptr, nullptr}, - {false, GGML_OP_DIV, 2, nullptr, nullptr}, + {false, GGML_OP_SUB, 2, nullptr, nullptr}, + {false, GGML_OP_MUL, 2, nullptr, nullptr}, + {false, GGML_OP_DIV, 2, nullptr, nullptr}, {false, GGML_OP_SQR, 0, nullptr, nullptr}, - {false, GGML_OP_SQRT, 0, nullptr, nullptr}, - {false, GGML_OP_LOG, 0, nullptr, nullptr}, + {false, GGML_OP_SQRT, 0, nullptr, nullptr}, + {false, GGML_OP_LOG, 0, nullptr, nullptr}, {false, GGML_OP_SIN, 0, nullptr, nullptr}, {false, GGML_OP_COS, 0, nullptr, nullptr}, {false, GGML_OP_SUM, 0, nullptr, nullptr}, @@ -718,7 +714,7 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_CONCAT, 0, nullptr, nullptr}, {false, GGML_OP_SILU_BACK, 0, nullptr, nullptr}, {false, GGML_OP_NORM, 0, nullptr, nullptr}, - {true, GGML_OP_RMS_NORM, 1, "ggmlop_dsp_rmsnorm", ggmlop_dsp_rmsnorm}, + {true, GGML_OP_RMS_NORM, 1, "ggmlop_dsp_rmsnorm", ggmlop_dsp_rmsnorm}, {false, GGML_OP_RMS_NORM_BACK, 0, nullptr, nullptr}, {false, GGML_OP_GROUP_NORM, 0, nullptr, nullptr}, {false, GGML_OP_L2_NORM, 0, nullptr, nullptr}, @@ -735,10 +731,11 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_TRANSPOSE, 0, nullptr, nullptr}, {false, GGML_OP_GET_ROWS, 0, nullptr, nullptr}, {false, GGML_OP_GET_ROWS_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_SET_ROWS, 0, nullptr, nullptr}, {false, GGML_OP_DIAG, 0, nullptr, nullptr}, {false, GGML_OP_DIAG_MASK_INF, 0, nullptr, nullptr}, {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr, nullptr}, - {true, GGML_OP_SOFT_MAX, 1, "ggmlop_dsp_softmax", ggmlop_dsp_softmax}, + {true, GGML_OP_SOFT_MAX, 1, "ggmlop_dsp_softmax", ggmlop_dsp_softmax}, {false, GGML_OP_SOFT_MAX_BACK, 0, nullptr, nullptr}, {false, GGML_OP_ROPE, 0, nullptr, nullptr}, {false, GGML_OP_ROPE_BACK, 0, nullptr, nullptr}, @@ -746,14 +743,16 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr, nullptr}, {false, GGML_OP_IM2COL, 0, nullptr, nullptr}, {false, GGML_OP_IM2COL_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_2D, 0, nullptr, nullptr}, {false, GGML_OP_CONV_2D_DW, 0, nullptr, nullptr}, {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr, nullptr}, {false, GGML_OP_POOL_1D, 0, nullptr, nullptr}, - {true, GGML_OP_POOL_2D, 1, "ggmlop_dsp_pool2d", ggmlop_dsp_pool2d}, + {true, GGML_OP_POOL_2D, 1, "ggmlop_dsp_pool2d", ggmlop_dsp_pool2d}, {false, GGML_OP_POOL_2D_BACK, 0, nullptr, nullptr}, {false, GGML_OP_UPSCALE, 0, nullptr, nullptr}, {false, GGML_OP_PAD, 0, nullptr, nullptr}, {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr, nullptr}, + {false, GGML_OP_ROLL, 0, nullptr, nullptr}, {false, GGML_OP_ARANGE, 0, nullptr, nullptr}, {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr, nullptr}, {false, GGML_OP_ARGSORT, 0, nullptr, nullptr}, @@ -777,28 +776,14 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr, nullptr}, {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr, nullptr}, {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_ABS), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SGN), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_NEG), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_STEP), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_TANH), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_ELU), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_RELU), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SIGMOID), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU_ERF), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU_QUICK), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SILU), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_HARDSWISH), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_EXP), 0, nullptr, nullptr} + {false, GGML_OP_GLU, 0, nullptr, nullptr}, }; static_assert(ggmlhexagon_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); static_assert(ggmlhexagon_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); static_assert(ggmlhexagon_k_op_caps[GGML_OP_SOFT_MAX].supported, "GGML_OP_SOFT_MAX is not true"); -static_assert(std::size(ggmlhexagon_k_op_caps) == (static_cast<size_t>(GGML_OP_COUNT) + static_cast<size_t>(GGML_UNARY_OP_COUNT)), +static_assert(std::size(ggmlhexagon_k_op_caps) == (static_cast<size_t>(GGML_OP_COUNT)), "pls check ggmlhexagon_k_op_caps and ensure it corresponds to
latest ggml.h"); static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor name is unique @@ -887,6 +872,41 @@ static void ggmlhexagon_log_internal(ggml_log_level level, const char * file, co } } +static void ggmlhexagon_get_processname(char * p_name) { + if (nullptr == p_name) + return; + + char tmpbuf[GGMLHEXAGON_TMPBUF_LEN]; + memset(tmpbuf, 0, GGMLHEXAGON_TMPBUF_LEN); +#if defined(__ANDROID__) || defined(__linux__) + int result = readlink("/proc/self/exe", tmpbuf, GGMLHEXAGON_TMPBUF_LEN - 1); + if (result < 0) { + GGMLHEXAGON_LOG_WARN("failed to get process name, reason:%s", strerror(errno)); + return; + } + GGMLHEXAGON_LOG_DEBUG("process name %s", tmpbuf); + const char * realname = strrchr(tmpbuf, '/') + 1; + GGMLHEXAGON_LOG_DEBUG("process name %s", realname); + snprintf(p_name, GGMLHEXAGON_TMPBUF_LEN, "%s", realname); +#endif +} + +static bool ggmlhexagon_is_llamabench_running() { + char processname[GGMLHEXAGON_TMPBUF_LEN]; + memset(processname, 0, GGMLHEXAGON_TMPBUF_LEN); + + ggmlhexagon_get_processname(processname); + if (0 != processname[0] && 0 != processname[1] && 0 != processname[10]) { + if (0 == memcmp(processname, "llama-bench", strlen("llama-bench"))) { + return true; + } + if (0 == memcmp(processname, "test-thread-safety", strlen("test-thread-safety"))) { + return true; + } + } + return false; +} + static void ggmlhexagon_print_tensors_info(const char * func_name, const ggml_backend_hexagon_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) { //skip sanity check of params because of performance concern @@ -1124,7 +1144,7 @@ class hexagon_profiler { ); } - //print/compare NPU's I/O performance between 8Gen3 and 8Elite(aka 8Gen4) , removed in the future + //print/compare NPU's I/O performance between 8Gen3 and 8Elite , removed in the future char bps_string[GGMLHEXAGON_TMPBUF_LEN]; memset(bps_string, 0, GGMLHEXAGON_TMPBUF_LEN); profiler_get_bpsstring(_total_inputsize + _total_outputsize, elapse_time, bps_string); @@ -1292,6 +1312,7 @@ class hexagon_perf { _begin_time = ggml_time_us(); } + //use explicit function calls rather than scoped feature void info() { if (0 == g_hexagon_appcfg.enable_perf) { return; @@ -1304,8 +1325,13 @@ class hexagon_perf { // it's not mandatory // had to expose two public function in hexagon_profiler class if (g_hexagon_profiler.profiler_get_frame_index() <= g_hexagon_profiler.profiler_get_threshold_count()) { + const char * devname = ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend); + //the logic here is make sense because already checked in ggml_backend_hexagon_device_init_backend + if (g_hexagon_appcfg.hexagon_backend != HEXAGON_BACKEND_GGML) { + devname += 16; + } GGMLHEXAGON_LOG_VERBOSE("inference duration of %s through %s: %lld microseconds", - _perf_name.c_str(), ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach), _duration); + _perf_name.c_str(), devname, _duration); } //update profiler data @@ -1328,7 +1354,7 @@ class hexagon_perf { int _output_size = 0; }; -//a simple class to load configurations from ggml-hexagon.cfg +//a simple class to load/set running configurations in ggml-hexagon.cfg class hexagon_appcfg { public: hexagon_appcfg() {} @@ -1394,6 +1420,103 @@ class hexagon_appcfg { value = atol(_hexagon_appcfg[section][key].c_str()); } + bool modify_hexagon_config(std::string & cfg_filename, int new_hexagon_backend, int new_hwaccel_approach, int new_mulmat_algotype) { + std::ifstream inputfile(cfg_filename); + if (!inputfile.is_open()) { + 
GGMLHEXAGON_LOG_WARN("can't open file %s", cfg_filename.c_str()); + return false; + } + + std::string filedata = ""; + + std::string line; + std::string backupline; + bool is_rewrite = false; + bool is_founded = false; + bool is_key = true; + std::string key; + std::string value; + std::string newvalue; + while (std::getline(inputfile, line)) { + is_founded = false; + backupline = line; + trim(line); + if (0 == line.rfind("#", 0)) { + filedata += backupline; + filedata += "\n"; + continue; + } + + newvalue = ""; + if (line.rfind("hexagon_backend", 0) != std::string::npos) { + if (new_hexagon_backend >= 0) { + is_founded = true; + is_rewrite = true; + newvalue = std::to_string(new_hexagon_backend); + } + } + + if (line.rfind("hwaccel_approach", 0) != std::string::npos) { + //compatiable with previous logic + if (new_hwaccel_approach >= 0) { + is_founded = true; + is_rewrite = true; + newvalue = std::to_string(new_hwaccel_approach); + } + } + + if (line.rfind("mulmat_algotype", 0) != std::string::npos) { + //compatiable with previous logic + if (new_mulmat_algotype >= 0) { + is_founded = true; + is_rewrite = true; + newvalue = std::to_string(new_mulmat_algotype); + } + } + + + if (is_founded) { + is_key = true; + key = ""; + value = ""; + + for (size_t i = 0; i < line.size(); ++i) { + if (line[i] == '=') { + is_key = false; + continue; + } + if (is_key) { + key += line[i]; + } else { + value += line[i]; + } + } + trim(key); + trim(value); + GGMLHEXAGON_LOG_VERBOSE("key %s value %s\n", key.c_str(), value.c_str()); + GGMLHEXAGON_LOG_VERBOSE("key %s new value %s\n", key.c_str(), newvalue.c_str()); + backupline = key + " = " + newvalue; + } + filedata += backupline; + filedata += "\n"; + } + inputfile.close(); + + if (is_rewrite) { + std::ofstream outputfile; + outputfile.open(cfg_filename); + outputfile.flush(); + outputfile << filedata; + outputfile.close(); + } + return true; + } + + //compatiable with previous codes + bool modify_hexagon_config(std::string & cfg_filename, int new_hexagon_backend, int new_hwaccel_approach) { + return modify_hexagon_config(cfg_filename, new_hexagon_backend, new_hwaccel_approach, -1); + } + private: void ltrim(std::string & str) { if (str.empty()) return; @@ -1734,10 +1857,6 @@ static void ggmlhexagon_append_tensor_dimensions(const ggml_tensor * tensor, std } static size_t ggmlhexagon_get_op_index(const ggml_tensor * tensor) { - if (tensor->op == GGML_OP_UNARY) { - return static_cast(GGML_OP_COUNT) + static_cast(ggml_get_unary_op(tensor)); - } - return tensor->op; } @@ -1879,13 +1998,12 @@ static void ggmlhexagon_load_cfg() { ggmlhexagon_get_timestring(time_string); GGMLHEXAGON_LOG_DEBUG("program running start time:%s", time_string); std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); - GGMLHEXAGON_LOG_INFO("load hexagon appcfg from %s", cfg_filename.c_str()); hexagon_appcfg hexagoncfg_instance; hexagoncfg_instance.load(cfg_filename); hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { std::ostringstream tmposs; tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]"; - GGMLHEXAGON_LOG_INFO("%s", tmposs.str().c_str()); + GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str()); }); std::string precision_mode; std::string version; //version of ggml-hexagon.cpp @@ -1912,19 +2030,22 @@ static void ggmlhexagon_load_cfg() { hexagoncfg_instance.get_intvalue("cdsp", 
"enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 0); hexagoncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0); hexagoncfg_instance.get_intvalue("cdsp", "thread_counts", g_hexagon_appcfg.thread_counts, 4); + hexagoncfg_instance.get_intvalue("cdsp", "mulmat_algotype", g_hexagon_appcfg.mulmat_algotype, 0); - GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version); - GGMLHEXAGON_LOG_INFO("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version); - GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", version.c_str()); - GGMLHEXAGON_LOG_INFO("external ggml_dsp_version=%s", ggmldsp_version.c_str()); memcpy(g_hexagon_appcfg.ggml_dsp_version, ggmldsp_version.c_str(), strlen(ggmldsp_version.c_str())); - GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, + + GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str()); + GGMLHEXAGON_LOG_VERBOSE("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version); + GGMLHEXAGON_LOG_VERBOSE("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version); + GGMLHEXAGON_LOG_VERBOSE("external ggml_hexagon_version=%s", version.c_str()); + GGMLHEXAGON_LOG_VERBOSE("external ggml_dsp_version=%s", ggmldsp_version.c_str()); + GGMLHEXAGON_LOG_VERBOSE("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); - GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, + GGMLHEXAGON_LOG_VERBOSE("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); - GGMLHEXAGON_LOG_INFO("runtime libpath=%s", g_hexagon_appcfg.runtime_libpath); - GGMLHEXAGON_LOG_INFO("enable_perf=%d", g_hexagon_appcfg.enable_perf); - GGMLHEXAGON_LOG_INFO("enable_profiler=%d", g_hexagon_appcfg.enable_profiler); + GGMLHEXAGON_LOG_VERBOSE("runtime libpath=%s", g_hexagon_appcfg.runtime_libpath); + GGMLHEXAGON_LOG_VERBOSE("enable_perf=%d", g_hexagon_appcfg.enable_perf); + GGMLHEXAGON_LOG_VERBOSE("enable_profiler=%d", g_hexagon_appcfg.enable_profiler); if (precision_mode.find("fp16") != std::string::npos) { g_hexagon_appcfg.precision_mode = 1; @@ -1942,6 +2063,58 @@ static void ggmlhexagon_load_cfg() { initialized = true; } +void ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach) { + if (new_hexagon_backend < 0 || new_hexagon_backend > HEXAGON_BACKEND_GGML) { + GGMLHEXAGON_LOG_WARN("invalid new_hexagon_backend"); + return; + } + if (new_hwaccel_approach < 0 || new_hwaccel_approach > HWACCEL_CDSP) { + GGMLHEXAGON_LOG_WARN("invalid new_hwaccel_approach"); + return; + } + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str()); + hexagon_appcfg hexagoncfg_instance; + GGMLHEXAGON_LOG_VERBOSE("set_hexagon_cfg with new_hexagon_backend %d, new_hwaccel_approach %d", new_hexagon_backend, new_hwaccel_approach); + hexagoncfg_instance.modify_hexagon_config(cfg_filename, new_hexagon_backend, new_hwaccel_approach); + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] 
= [" << value << "]"; + GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str()); + }); +} + +int ggml_backend_hexagon_get_mulmat_algotype() { + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + hexagon_appcfg hexagoncfg_instance; + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.get_intvalue("cdsp", "mulmat_algotype", g_hexagon_appcfg.mulmat_algotype, 0); + return g_hexagon_appcfg.mulmat_algotype; +} + +/** + * troubleshooting peformance of mulmat on cDSP during development stage + */ +void ggml_backend_hexagon_set_mulmat_algotype(int new_mulmat_algotype) { + //the logic here is different with logic in the ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach) + if (new_mulmat_algotype < 0) { + GGMLHEXAGON_LOG_WARN("invalid new_mulmat_algotype"); + return; + } + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str()); + hexagon_appcfg hexagoncfg_instance; + GGMLHEXAGON_LOG_VERBOSE("set_hexagon_cfg with new_mulmat_algotype %d", new_mulmat_algotype); + hexagoncfg_instance.modify_hexagon_config(cfg_filename, -1, -1, new_mulmat_algotype); + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]"; + GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str()); + }); +} + static bool ggmlhexagon_check_valid_appcfg() { bool is_valid_appcfg = true; @@ -1949,38 +2122,38 @@ static bool ggmlhexagon_check_valid_appcfg() { ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); GGMLHEXAGON_LOG_DEBUG("user's specified hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend); if (g_hexagon_appcfg.hexagon_backend >= GGML_HEXAGON_MAX_DEVICES) { - GGMLHEXAGON_LOG_INFO("using default ggml backend"); + GGMLHEXAGON_LOG_VERBOSE("using default ggml backend"); is_valid_appcfg = false; } if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { - GGMLHEXAGON_LOG_INFO("HWACCEL_QNN_SINGLEGRAPH not supported"); + GGMLHEXAGON_LOG_VERBOSE("HWACCEL_QNN_SINGLEGRAPH not supported"); is_valid_appcfg = false; } if (HWACCEL_QNN == g_hexagon_appcfg.hwaccel_approach) { if (HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend) { - GGMLHEXAGON_LOG_INFO("hexagon_backend HEXAGON_BACKEND_CDSP must match with hwaccel_approach HWACCEL_CDSP"); + GGMLHEXAGON_LOG_VERBOSE("hexagon_backend HEXAGON_BACKEND_CDSP must match with hwaccel_approach HWACCEL_CDSP"); is_valid_appcfg = false; } } if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { if ((HEXAGON_BACKEND_CDSP != g_hexagon_appcfg.hexagon_backend) && (HEXAGON_BACKEND_GGML != g_hexagon_appcfg.hexagon_backend)) { - GGMLHEXAGON_LOG_INFO("hwaccel_approach HWACCEL_CDSP must match with hexagon_backend HEXAGON_BACKEND_CDSP"); + GGMLHEXAGON_LOG_VERBOSE("hwaccel_approach HWACCEL_CDSP must match with hexagon_backend HEXAGON_BACKEND_CDSP"); is_valid_appcfg = false; } if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { if (0 == g_hexagon_appcfg.enable_q_mulmat) { - GGMLHEXAGON_LOG_INFO("ensure set enable_q_mulmat to 1 firstly when set enable_all_q_mulmat to 1"); - is_valid_appcfg = false; + GGMLHEXAGON_LOG_DEBUG("ensure set enable_q_mulmat to 1 firstly when set 
enable_all_q_mulmat to 1 if you are not currently comparing the performance of GGML_OP_ADD between QNNCPU, QNNGPU, QNNNPU, cDSP, ggml"); + //is_valid_appcfg = false; } } } if (!is_valid_appcfg) { - GGMLHEXAGON_LOG_INFO("it seems there is wrong configuration in ggml-hexagon.cfg, will using the default ggml backend accordingly"); + GGMLHEXAGON_LOG_VERBOSE("it seems there is non-default configuration in ggml-hexagon.cfg, will using the default ggml backend accordingly"); } return is_valid_appcfg; } @@ -1990,6 +2163,11 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c char timestamp[GGMLHEXAGON_TMPBUF_LEN]; memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN); + if (ggmlhexagon_is_llamabench_running()) { + //make llama-bench happy + return; + } + GGMLHEXAGON_LOG_INFO("ggml_hexagon_version: %s", g_hexagon_appcfg.ggml_hexagon_version); GGMLHEXAGON_LOG_INFO("ggml_dsp_version: %s", g_hexagon_appcfg.ggml_dsp_version); GGMLHEXAGON_LOG_INFO("hwaccel approach: %d(%s)", g_hexagon_appcfg.hwaccel_approach, @@ -2001,10 +2179,11 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO"); - GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_CDSP: %d", g_hexagon_appcfg.thread_counts); + GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_CDSP: %d", g_hexagon_appcfg.thread_counts); + GGMLHEXAGON_LOG_INFO("mulmat algo type on cDSP: %d", g_hexagon_appcfg.mulmat_algotype); ggmlhexagon_probe_dspinfo(ctx); } else { - GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_QNN: %d", g_hexagon_appcfg.hvx_threads); + GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_QNN: %d", g_hexagon_appcfg.hvx_threads); GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? 
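[Reviewer note] The backend/approach pairing rules enforced by ggmlhexagon_check_valid_appcfg() above distill to a few lines; note that after this patch an enable_all_q_mulmat/enable_q_mulmat mismatch only logs a debug message instead of invalidating the config. A condensed sketch, assuming the enum values from ggml-hexagon.h (HEXAGON_BACKEND_CDSP is not shown in this excerpt; the helper is illustrative):

```cpp
static bool cfg_pair_is_valid(int backend, int approach) {
    if (approach == HWACCEL_QNN_SINGLEGRAPH) return false;           // not supported
    if (approach == HWACCEL_QNN && backend == HEXAGON_BACKEND_CDSP) return false;
    if (approach == HWACCEL_CDSP && backend != HEXAGON_BACKEND_CDSP
                                 && backend != HEXAGON_BACKEND_GGML) return false;
    return true;
}
```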
"YES" : "NO"); } GGMLHEXAGON_LOG_INFO("running timestamp:%s", timestamp); @@ -2756,6 +2935,14 @@ class qnn_instance { std::unordered_map _qnn_rpc_buffer_to_handles; std::atomic_bool _rpcmem_initialized{false}; + + // this is moved to static declarations in this file + // pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + // pfn_rpc_mem_free _pfn_rpc_mem_free; + // pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + // pfn_rpc_mem_init _pfn_rpc_mem_init; + // pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; std::unordered_map _rpcmem_usage_map; size_t _rpcmem_usage = 0; // mempool usage in bytes @@ -2763,6 +2950,10 @@ class qnn_instance { std::string _graph_name; HEXAGONBackend _device_id; + + // this is moved to static declarations in this file + //void * _rpc_lib_handle = nullptr; + bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature qnn_instance(const qnn_instance &) = delete; @@ -3002,7 +3193,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * return 1; } - auto get_providers = ggmlqnn_load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( + auto get_providers = ggmlqnn_load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { @@ -3042,7 +3233,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * GGMLHEXAGON_LOG_WARN("unable to find a valid qnn interface\n"); return 6; } else { - GGMLHEXAGON_LOG_INFO("find a valid qnn interface\n"); + GGMLHEXAGON_LOG_VERBOSE("find a valid qnn interface\n"); } set_qnn_raw_interface(qnn_interface); @@ -3052,7 +3243,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * _backend_id = backend_id; auto saver_initialize = - ggmlqnn_load_qnn_functionpointers<_pfn_QnnSaver_initialize *>(_loaded_lib_handle, "QnnSaver_initialize"); + ggmlqnn_load_qnn_functionpointers(_loaded_lib_handle, "QnnSaver_initialize"); if (nullptr != saver_initialize) { error = saver_initialize(saver_config); if (error != QNN_SUCCESS) { @@ -3103,7 +3294,7 @@ int qnn_instance::load_system() { } } - auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + auto * get_providers = reinterpret_cast(dlsym( _system_lib_handle, "QnnSystemInterface_getProviders")); if (nullptr == get_providers) { GGMLHEXAGON_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); @@ -3144,7 +3335,7 @@ int qnn_instance::load_system() { GGMLHEXAGON_LOG_WARN("unable to find a valid qnn system interface\n"); return 6; } else { - GGMLHEXAGON_LOG_INFO("find a valid qnn system interface\n"); + GGMLHEXAGON_LOG_VERBOSE("find a valid qnn system interface\n"); } set_qnn_raw_system_interface(qnn_system_interface); @@ -3154,7 +3345,7 @@ int qnn_instance::load_system() { if (nullptr == _qnn_system_handle) { GGMLHEXAGON_LOG_WARN("can not create QNN system contenxt\n"); } else { - GGMLHEXAGON_LOG_INFO("initialize qnn system successfully\n"); + GGMLHEXAGON_LOG_VERBOSE("initialize qnn system successfully\n"); } return 0; @@ -3306,16 +3497,16 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { qcom_socinfo soc_info = {}; qnnstatus = _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); if (QNN_SUCCESS == qnnstatus) { - GGMLHEXAGON_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); + GGMLHEXAGON_LOG_VERBOSE("device counts %d\n", p_info->v1.numHwDevices); QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; 
for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { - GGMLHEXAGON_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, + GGMLHEXAGON_LOG_VERBOSE("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; chipinfo = devinfo->onChipDevice; size_t htp_arch = (size_t) chipinfo.arch; - GGMLHEXAGON_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, + GGMLHEXAGON_LOG_VERBOSE("htp_type:%d(%s)\n", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {} }; } @@ -3349,7 +3540,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (QNN_SUCCESS != qnnstatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnstatus) { GGMLHEXAGON_LOG_WARN("failed to create QNN device\n"); } else { - GGMLHEXAGON_LOG_INFO("create device successfully\n"); + GGMLHEXAGON_LOG_VERBOSE("create device successfully\n"); } if (PROFILE_OFF != _profile_level) { @@ -3432,9 +3623,9 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { htp_set_memory_grow_size(); if (enable_qnn_rpc()) { - GGMLHEXAGON_LOG_INFO("NPU RPC feature enabled with QNN-NPU backend"); + GGMLHEXAGON_LOG_VERBOSE("NPU RPC feature enabled with QNN-NPU backend"); } else { - GGMLHEXAGON_LOG_INFO("NPU RPC feature disabled with QNN-NPU backend"); + GGMLHEXAGON_LOG_VERBOSE("NPU RPC feature disabled with QNN-NPU backend"); } } @@ -3449,7 +3640,7 @@ int qnn_instance::qnn_finalize() { int ret_status = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; - GGMLHEXAGON_LOG_INFO("enter %s\n", __func__); + GGMLHEXAGON_LOG_VERBOSE("enter %s\n", __func__); ggmlqnn_reset_idx(); free_rpcmem(); @@ -3516,7 +3707,7 @@ int qnn_instance::qnn_finalize() { unload_backend(); unload_system(); - GGMLHEXAGON_LOG_INFO("leave %s\n", __func__); + GGMLHEXAGON_LOG_VERBOSE("leave %s\n", __func__); return ret_status; } @@ -3690,7 +3881,7 @@ void qnn_instance::htp_probe_rpc_meminfo() { free_rpcmem(); _rpcmem_usage = 0; - GGMLHEXAGON_LOG_INFO("capacity of rpc ion memory %d MiB\n", _rpcmem_capacity / SIZE_IN_MB); + GGMLHEXAGON_LOG_VERBOSE("capacity of rpc ion memory %d MiB\n", _rpcmem_capacity / SIZE_IN_MB); } void qnn_instance::htp_print_info() { @@ -3735,10 +3926,10 @@ void qnn_instance::print_backend_info() { status = "No"; } - GGMLHEXAGON_LOG_INFO("%s: %s", name, status); + GGMLHEXAGON_LOG_VERBOSE("%s: %s", name, status); }; - GGMLHEXAGON_LOG_INFO("QNN backend properties:"); + GGMLHEXAGON_LOG_VERBOSE("QNN backend properties:"); print_property("Create context from binary list", QNN_PROPERTY_CONTEXT_SUPPORT_CREATE_FROM_BINARY_LIST_ASYNC); print_property("Dynamic batch", QNN_PROPERTY_GRAPH_SUPPORT_BATCH_MULTIPLE); print_property("Early termination", QNN_PROPERTY_GRAPH_SUPPORT_EARLY_TERMINATION); @@ -3767,7 +3958,7 @@ void qnn_instance::htp_set_memory_grow_size(size_t size) { if (QNN_SUCCESS != result) { GGMLHEXAGON_LOG_WARN("failed to set HTP memory config"); } else { - GGMLHEXAGON_LOG_INFO("succeed to set HTP memory config"); + GGMLHEXAGON_LOG_VERBOSE("succeed to set HTP memory config"); } } @@ -3854,7 +4045,7 @@ void qnn_instance::htp_enter_performance_mode() { if (ret != QNN_SUCCESS) { GGMLHEXAGON_LOG_WARN("failed to set HTP power config"); } else { - GGMLHEXAGON_LOG_INFO("succeed to set HTP power config"); + GGMLHEXAGON_LOG_VERBOSE("succeed to set HTP power config"); } } @@ -4066,7 +4257,7 @@ static void 
ggmlqnn_compute_elementwise(ggml_backend_hexagon_context * ctx, ggml } } else { GGML_ASSERT(instance->get_device_id() == ctx->device); - GGMLHEXAGON_LOG_INFO("graph name %s", graph_name.c_str()); + GGMLHEXAGON_LOG_VERBOSE("graph name %s", graph_name.c_str()); //create QNN graph error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), g_hexagon_appcfg.vtcm_size_in_mb, @@ -4391,7 +4582,7 @@ static void ggmlqnn_compute_mul_mat_4d(ggml_backend_hexagon_context * ctx, ggml_ in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose operation when offloading mulmat to QNN backend. this implementation will handle transpose - in func ggmlqnn_compute_create_general_tensor() + in func ggmlqnn_create_general_tensor() * @param ctx the context of backend * @param op the destination tensor where the result of the matrix multiplication will be stored. @@ -4459,7 +4650,7 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_hexagon_context * ctx, ggml_ten p_tensor2_transpose = tensors[4]; } else { //create QNN graph - GGMLHEXAGON_LOG_INFO("graph name %s", graph_name.c_str()); + GGMLHEXAGON_LOG_VERBOSE("graph name %s", graph_name.c_str()); error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), g_hexagon_appcfg.vtcm_size_in_mb, g_hexagon_appcfg.hvx_threads); @@ -4766,7 +4957,20 @@ static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_doma int hexagon_err = AEE_SUCCESS; int ss_info = 0; void * buffer = nullptr; - ss_info = strcmp(domain_type, "NSP")? HPASS: NSP; +#if 0 +typedef enum { + /** Flag to be used to query list of all available domains */ + ALL_DOMAINS, + NSP, + LPASS, + SDSP, + MODEM, + HPASS, +} fastrpc_domain_type; +#endif + //ss_info = strcmp(domain_type, "NSP") ? HPASS: NSP; + //forward compatible with new SDK + ss_info = (0 == memcmp(domain_type, "NSP", 3)) ? 
1 : 5; system_req_payload req; memset(&req, 0, sizeof(system_req_payload)); req.id = FASTRPC_GET_DOMAINS; @@ -4993,7 +5197,11 @@ static void ggmlhexagon_set_rpc_latency(remote_handle64 handle, int qos, int lat GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); goto bail; } else { - GGMLHEXAGON_LOG_INFO("set rpc qos %d, latency %d\n", qos, latency); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("set rpc qos %d, latency %d\n", qos, latency); + } else { + GGMLHEXAGON_LOG_INFO("set rpc qos %d, latency %d\n", qos, latency); + } } } else { hexagon_error = AEE_EUNSUPPORTEDAPI; @@ -5004,6 +5212,41 @@ static void ggmlhexagon_set_rpc_latency(remote_handle64 handle, int qos, int lat return; } +/** + * set FastRPC thread priority (default unchanged at 192) + * priority values range from 1 to 255, with smaller values representing higher priorities + * Unprivileged clients: 64 through 254 (cDSP only) + * Privileged clients: 1 through 254 + * + * ref:file:///opt/qcom/Hexagon_SDK/6.2.0.1/docs/software/system_integration.html#priority-levels + */ +static int ggmlhexagon_set_priority(int domain, int priority) { + int err = 0; + + if (priority < 1) { + priority = 1; + } + if (priority > 255) { + priority = 255; + } + + if (remote_session_control) { + struct remote_rpc_thread_params data; + data.domain = domain; + data.prio = priority; + data.stack_size = -1; + err = remote_session_control(FASTRPC_THREAD_PARAMS, (void *)&data, sizeof(data)); + if (err != AEE_SUCCESS) { + GGMLHEXAGON_LOG_WARN("remote_session_control failed with 0x%x when setting thread priority\n", err); + } else { + GGMLHEXAGON_LOG_VERBOSE("thread priority set to %d\n", priority); + } + } else { + GGMLHEXAGON_LOG_WARN("cannot set thread priority\n"); + } + return err; +} + static bool ggmlhexagon_is_status_notification_supported(int domain) { int hexagon_error = AEE_SUCCESS; @@ -5219,7 +5462,11 @@ static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { ctx->rpc_mempool_capacity = candidate_size * SIZE_IN_MB; GGMLHEXAGON_LOG_DEBUG("rpc memory capacity %ld(%d MiB) for device %d", ctx->rpc_mempool_capacity, ctx->rpc_mempool_capacity / SIZE_IN_MB, ctx->device); - GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB); + } else { + GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB); + } if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { GGML_ASSERT(ctx->rpc_mempool_capacity > (8 * SIZE_IN_MB)); @@ -5265,14 +5512,22 @@ static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx) { ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version); if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) { - GGMLHEXAGON_LOG_INFO("dsp arch version 0x%x", dsp_version); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("dsp arch version 0x%x", dsp_version); + } else { + GGMLHEXAGON_LOG_INFO("dsp arch version 0x%x", dsp_version); + } //0x68 -> 68, 0x69 -> 69, 0x73 -> 73, 0x75 -> 75, 0x79 -> 79 size_t htp_arch = ggmlhexagon_htparch_hex_to_decimal(dsp_version); GGMLHEXAGON_LOG_DEBUG("dsp arch version %d", htp_arch); struct qcom_socinfo * socinfo = ggmlhexagon_get_socinfo_from_socmodel(htp_arch); if (nullptr != socinfo) { //got fully 
description of SoC when hwaccel approach is HWACCEL_CDSP - GGMLHEXAGON_LOG_INFO("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch)); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch)); + } else { + GGMLHEXAGON_LOG_INFO("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch)); + } } } else { GGMLHEXAGON_LOG_WARN("error: dsp arch version 0x%x is not supported", dsp_version); @@ -5282,27 +5537,42 @@ static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx) { uint32_t vtcm_page = 0; ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_COUNT, &vtcm_count); ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_PAGE, &vtcm_page); - GGMLHEXAGON_LOG_INFO("vtcm_count %d", vtcm_count); - GGMLHEXAGON_LOG_INFO("vtcm_page %d", vtcm_page); uint32_t hmx_depth = 0; uint32_t hmx_spatial = 0; ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_DEPTH, &hmx_depth); ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_SPATIAL, &hmx_spatial); - GGMLHEXAGON_LOG_INFO("hmx_depth %d", hmx_depth); - GGMLHEXAGON_LOG_INFO("hmx_spatial %d", hmx_spatial); uint32_t hvx_support_128b = 0; ggmlhexagon_get_hvx_support_info(ctx->domain_id, HVX_SUPPORT_128B, &hvx_support_128b); - GGMLHEXAGON_LOG_INFO("hvx_support_128b %d", hvx_support_128b); - GGMLHEXAGON_LOG_INFO("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); - GGMLHEXAGON_LOG_INFO("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); + if (ggmlhexagon_is_llamabench_running()) { + //make llama-bench happy + GGMLHEXAGON_LOG_VERBOSE("vtcm_count %d", vtcm_count); + GGMLHEXAGON_LOG_VERBOSE("vtcm_page %d", vtcm_page); + GGMLHEXAGON_LOG_VERBOSE("hmx_depth %d", hmx_depth); + GGMLHEXAGON_LOG_VERBOSE("hmx_spatial %d", hmx_spatial); + GGMLHEXAGON_LOG_VERBOSE("hvx_support_128b %d", hvx_support_128b); + GGMLHEXAGON_LOG_VERBOSE("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); + GGMLHEXAGON_LOG_VERBOSE("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); + } else { + GGMLHEXAGON_LOG_INFO("vtcm_count %d", vtcm_count); + GGMLHEXAGON_LOG_INFO("vtcm_page %d", vtcm_page); + GGMLHEXAGON_LOG_INFO("hmx_depth %d", hmx_depth); + GGMLHEXAGON_LOG_INFO("hmx_spatial %d", hmx_spatial); + GGMLHEXAGON_LOG_INFO("hvx_support_128b %d", hvx_support_128b); + GGMLHEXAGON_LOG_INFO("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); + GGMLHEXAGON_LOG_INFO("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); + } } static void ggmlhexagon_deinit_cdsp(ggml_backend_hexagon_context * ctx) { int hexagon_error = AEE_SUCCESS; - GGMLHEXAGON_LOG_INFO("enter %s", __func__); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("enter %s", __func__); + } else { + GGMLHEXAGON_LOG_INFO("enter %s", __func__); + } if (0 != ctx->ggmlop_handle) { hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); if (AEE_SUCCESS != hexagon_error) { @@ -5314,13 +5584,18 @@ static void ggmlhexagon_deinit_cdsp(ggml_backend_hexagon_context * ctx) { ggmlhexagon_deinit_rpcmempool(ctx); ctx->domain_id = -1; - GGMLHEXAGON_LOG_INFO("leave %s", __func__); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("leave %s", __func__); + } else { + GGMLHEXAGON_LOG_INFO("leave %s", __func__); + } } static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { static std::mutex mutex; 
std::lock_guard<std::mutex> lock(mutex); + // load dynamic functions from Qualcomm's rpcmem library (moved into this DSP init function) #if defined(__ANDROID__) || defined(__linux__) std::filesystem::path full_path(std::string(g_hexagon_appcfg.runtime_libpath) + "libcdsprpc.so"); //full_path /= std::filesystem::path("libcdsprpc.so").filename(); @@ -5455,8 +5730,13 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { } ctx->domain_id = domain_id; - GGMLHEXAGON_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); - GGMLHEXAGON_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_VERBOSE("unsignedpd_enabled %d", is_unsignedpd_enabled); + } else { + GGMLHEXAGON_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); + } if (is_unsignedpd_enabled) { if (remote_session_control) { struct remote_rpc_control_unsigned_module data; @@ -5465,7 +5745,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { hexagon_error = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *)&data, sizeof(data)); GGMLHEXAGON_LOG_DEBUG("remote_session_control returned %d for configuring unsigned PD success", hexagon_error); if (AEE_SUCCESS != hexagon_error) { - GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control failed", hexagon_error); + GGMLHEXAGON_LOG_WARN("error 0x%x: remote_session_control failed", hexagon_error); } } else { GGMLHEXAGON_LOG_DEBUG("unsigned PD not supported on this device"); @@ -5482,6 +5762,13 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_WARN("error 0x%x: failed to compute on domain %d", hexagon_error, domain_id); goto bail; } + ggmlhexagon_set_priority(domain_id, 160); + + ggmlop_domain_uri_len = strlen(ggmlop_URI) + MAX_DOMAIN_NAMELEN; + ggmlop_domain_uri = (char *)malloc(ggmlop_domain_uri_len); + if (NULL == ggmlop_domain_uri) { + goto bail; + } // we copy the appropriate ggmlop-skel into our runtime libpath { @@ -5491,8 +5778,8 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) { - // delete the file $(g_hexagon_appcfg.runtime_libpath)/libggmlop-skel.so if it exists - std::string filepath = std::string(g_hexagon_appcfg.runtime_libpath) + "/libggmlop-skel.so"; + // delete the file $(g_hexagon_appcfg.runtime_libpath)/libggmldsp-skel.so if it exists + std::string filepath = std::string(g_hexagon_appcfg.runtime_libpath) + "/libggmldsp-skel.so"; if (std::filesystem::exists(filepath)) { std::filesystem::remove(filepath); } @@ -5500,13 +5787,13 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { // detect the htp arch number size_t htp_arch = ggmlhexagon_htparch_hex_to_decimal(dsp_version); - // find the file $(g_hexagon_appcfg.runtime_libpath)/libggmldsp-skelV$(htp_arch).so if it exists - // copy and rename it to libggmldsp-skel.so in the same folder // Construct file paths - std::string source_filename = std::string("libggmlop-skelV") + std::to_string(htp_arch) + ".so"; -
std::string source_path = std::string(g_hexagon_appcfg.runtime_libpath) + "/" + source_filename; - std::string dest_path = std::string(g_hexagon_appcfg.runtime_libpath) + "/libggmlop-skel.so"; + std::string source_filename = std::string("libggmldsp-skelV") + std::to_string(htp_arch) + ".so"; + std::string source_path = std::string(g_hexagon_appcfg.runtime_libpath) + source_filename; + std::string dest_path = std::string(g_hexagon_appcfg.runtime_libpath) + "libggmldsp-skel.so"; // Check if source file exists if (std::filesystem::exists(source_path)) { @@ -5538,12 +5825,18 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_DEBUG("ggmlop domain uri:%s", ggmlop_domain_uri); hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); if (AEE_SUCCESS == hexagon_error) { - GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); - //FIXME: only support offload fp32 GGML_OP_MUL_MAT to cDSP - GGMLHEXAGON_LOG_INFO("only support offload fp32 GGML_OP_ADD and fp32 GGML_OP_MUL_MAT to cDSP currently"); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_VERBOSE("currently only GGML_OP_ADD and GGML_OP_MUL_MAT can be offloaded to cDSP"); + } else { + GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_INFO("currently only GGML_OP_ADD and GGML_OP_MUL_MAT can be offloaded to cDSP"); + } ggmlhexagon_probe_dspinfo(ctx); //FIXME: re-use this function to pass thread_counts info to code on cDSP side before fully understand qidl mechanism - ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1, g_hexagon_appcfg.thread_counts); + //ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1, g_hexagon_appcfg.thread_counts); + //backward compatible with previous code on the cDSP side + ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, g_hexagon_appcfg.mulmat_algotype, g_hexagon_appcfg.thread_counts); ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 100); int result = ggmlhexagon_init_rpcmempool(ctx); if (0 != result) { @@ -5559,6 +5852,10 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { //make sure test-backend-ops get the correct backend name when hwaccel approach is 2(HWACCEL_CDSP) memcpy(g_hexagon_mgr[ctx->device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP")); + if (NULL != ggmlop_domain_uri) { + free(ggmlop_domain_uri); + ggmlop_domain_uri = NULL; + } return 0; bail: @@ -5705,8 +6002,6 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const case GGML_OP_MUL_MAT: { ggmlhexagon_dump_op_info(op_tensor); - //FIXME:keep same filter logic with QNN solution to compare NPU performance between cDSP approach - // and QNN-NPU approach, remove these filters in the future if (src0_rank != src1_rank) return false; if (src0_rank != 2) @@ -5714,7 +6009,7 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const return false; if (1 == g_hexagon_appcfg.enable_q_mulmat) { if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { - return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32); + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32); } return (src0->type == GGML_TYPE_F32
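[Reviewer note] The hunk below reworks the mulmat dtype gating for the QNN paths; the QNN-NPU branch reduces to the following predicate (sketch only; it assumes the usual ggml type enums and mirrors the added lines in the hunk):

```cpp
#include "ggml.h"

static bool qnn_npu_mulmat_supported(const ggml_tensor * src0, const ggml_tensor * src1,
                                     const ggml_tensor * dst, bool enable_q_mulmat) {
    if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false;
    if (!enable_q_mulmat) return src0->type == GGML_TYPE_F32;
    return src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q4_0 ||
           src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q6_K ||
           src0->type == GGML_TYPE_Q8_K;
}
```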
static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const
             if (src0_rank != 2) {
                 // FIXME: there are some limitations for mulmat in QNN SDK: rank >= 2.
-                //        keep same filter logic with QNN solution to compare NPU performance between
-                //        cDSP approach and QNN-NPU approach, remove these filters in the future
                 return false;
             }
             if (ctx->device == HEXAGON_BACKEND_QNNNPU) {
-                if (1 == g_hexagon_appcfg.enable_q_mulmat)
+                if (1 == g_hexagon_appcfg.enable_q_mulmat) {
                     return (src0->type == GGML_TYPE_F32
                             || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0
                             || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K
                            ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
-                else
+                } else {
                     return (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && op_tensor->type == GGML_TYPE_F32);
+                }
             } else {
-                return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type))
+                if (1 == g_hexagon_appcfg.enable_q_mulmat) {
+                    return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type))
                         && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
+                } else {
+                    return (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && op_tensor->type == GGML_TYPE_F32);
+                }
             }
         }
         case GGML_OP_LOG:
@@ -6076,10 +6374,13 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
     if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
         GGMLHEXAGON_LOG_DEBUG("device %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device));
         GGML_ASSERT(nullptr != ctx->rpc_mempool);
-        GGMLHEXAGON_LOG_DEBUG("size %ld(%d MiB), rpc_mempool_usage %ld(%d MiB), rpc_mempool_len %ld(%d MiB)",
+        GGMLHEXAGON_LOG_VERBOSE("size %ld(%d MiB), rpc_mempool_usage %ld(%d MiB), rpc_mempool_len %ld(%d MiB)",
                        size, size / SIZE_IN_MB,
                        ctx->rpc_mempool_usage, ctx->rpc_mempool_usage / SIZE_IN_MB,
                        ctx->rpc_mempool_len, ctx->rpc_mempool_len / SIZE_IN_MB);
-        GGML_ASSERT(size + ctx->rpc_mempool_usage <= ctx->rpc_mempool_len);
+        if (size + ctx->rpc_mempool_usage >= ctx->rpc_mempool_len) {
+            GGMLHEXAGON_LOG_WARN("device memory allocation of size %ld failed", size);
+            return nullptr;
+        }
         buffer_ctx->buffer = (static_cast<char *>(ctx->rpc_mempool)) + ctx->rpc_mempool_usage;
         GGMLHEXAGON_LOG_DEBUG("buffer_ctx->buffer %p", buffer_ctx->buffer);
         GGML_ASSERT(nullptr != buffer_ctx->buffer);
@@ -6303,7 +6604,7 @@ static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev,
 static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_t dev, const char * params) {
     GGML_UNUSED(dev);
     GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__);
-    size_t dev_index = 0;
+    int dev_index = 0;

     //case-1: test-backend-ops or other similar scenario: calling ggml_backend_dev_init(dev, reinterpret_cast<const char *>(i)) directly in user's code
     ggmlhexagon_load_cfg();
@@ -6319,11 +6620,19 @@ static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_
             return nullptr;
         }
     } else {
-        GGMLHEXAGON_LOG_INFO("program specified param is not nullptr");
+        GGMLHEXAGON_LOG_VERBOSE("program specified param is not nullptr");
         //user's program calling ggml_backend_hexagon_device_init_backend directly
         dev_index = (int)(intptr_t)params;
+        if (dev_index < 0) {
+            GGMLHEXAGON_LOG_VERBOSE("it shouldn't happen\n");
+            //test-thread-safety might be running at the moment or an invalid value passed from user's program
+            dev_index = HEXAGON_BACKEND_QNNCPU; //0
+        }
+        if (dev_index > GGML_HEXAGON_MAX_DEVICES) {
+            dev_index = HEXAGON_BACKEND_GGML; //4
+
} g_hexagon_appcfg.hexagon_backend = dev_index; - GGMLHEXAGON_LOG_INFO("program specified dev_index %d\n", dev_index); + GGMLHEXAGON_LOG_VERBOSE("program specified dev_index %d\n", dev_index); } GGMLHEXAGON_LOG_DEBUG("hexagon_backend=%d", dev_index); ggml_backend_t hexagon_backend = ggml_backend_hexagon_init(dev_index, g_hexagon_appcfg.runtime_libpath); @@ -6720,7 +7029,7 @@ const char * ggml_backend_hexagon_get_devname(size_t dev_num) { static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_lib_path) { int result = 0; - GGMLHEXAGON_LOG_INFO("device=%d, hwaccel approach=%d(%s)", device, g_hexagon_appcfg.hwaccel_approach, + GGMLHEXAGON_LOG_VERBOSE("device=%d, hwaccel approach=%d(%s)", device, g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); qnn_instance * instance = nullptr; @@ -6740,7 +7049,7 @@ static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_ } std::string device_name = ggml_backend_hexagon_get_devname(device); - GGMLHEXAGON_LOG_INFO("qnn device name %s", device_name.c_str()); + GGMLHEXAGON_LOG_VERBOSE("qnn device name %s", device_name.c_str()); g_hexagon_mgr[device].instance = instance; g_hexagon_mgr[device].raw_interface = instance->get_qnn_raw_interface(); g_hexagon_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); @@ -6777,9 +7086,6 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * runtime_lib ggmlhexagon_set_runtime_path(device, runtime_libpath); } - // the condition above never be true because our hardcoded runtime_libpath is always the same as the config, so we manually set the library paths here - ggmlhexagon_set_runtime_path(g_hexagon_appcfg.hexagon_backend, g_hexagon_appcfg.runtime_libpath); - if (nullptr != g_hexagon_mgr[device].backend) { GGMLHEXAGON_LOG_DEBUG("backend %d(%s) already loaded", device, ggml_backend_hexagon_get_devname(device)); @@ -6811,7 +7117,7 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * runtime_lib } } else { //get fully description of SoC when hwaccel approach is HWACCEL_QNN and backend is HEXAGON_BACKEND_QNNNPU - GGMLHEXAGON_LOG_INFO("device name %s", ggml_backend_hexagon_device_get_description(hexagon_backend->device)); + GGMLHEXAGON_LOG_VERBOSE("device name %s", ggml_backend_hexagon_device_get_description(hexagon_backend->device)); } GGMLHEXAGON_LOG_DEBUG("leave %s", __func__); diff --git a/ggml/src/ggml-hexagon/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile index 0e6b3fa2e4df6..b3e7f038cb866 100755 --- a/ggml/src/ggml-hexagon/kernels/Makefile +++ b/ggml/src/ggml-hexagon/kernels/Makefile @@ -7,7 +7,7 @@ HEXAGON_COMPUTE=compute${HTP_ARCH_VERSION} HEXAGON_CC=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang HEXAGON_CXX=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang -TARGET=libggmlop-skel.so +TARGET=libggmldsp-skel.so $(info HEXAGON_SDK_PATH:${HEXAGON_SDK_PATH}) $(info HTP_ARCH_VERSION:${HTP_ARCH_VERSION}) @@ -23,12 +23,14 @@ LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB #SRCS = $(wildcard *.c) SRCS = ggml-dsp.c skel.c entry.c add.c mulmat.c OBJS = $(patsubst %.c, %.o, $(SRCS)) +OBJS += dot.o +OBJS += worker_pool.o ALL:$(OBJS) ${HEXAGON_CC} ${LDFLAGS} -o ${TARGET} -Wl,--start-group ${OBJS} -Wl,--end-group @ls -l ${TARGET} - /bin/cp -fv ${TARGET} ../../../../out/android/bin/ - /bin/cp -fv ${TARGET} ../../../../out/android/bin/libggmlop-skel${HTP_ARCH_VERSION}.so + /bin/cp -fv 
${TARGET} ../../../../out/ggmlhexagon-android/bin/ + /bin/cp -fv ${TARGET} ../../../../out/ggmlhexagon-android/bin/libggmldsp-skel${HTP_ARCH_VERSION}.so /bin/rm -f *.so %.o:%.c @@ -36,5 +38,16 @@ ALL:$(OBJS) ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< @echo "\n" +%.o:%.S + @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" + ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + +%.o:%.cpp + @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" + ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + clean: rm -f *.o + /bin/rm -f *.so diff --git a/ggml/src/ggml-hexagon/kernels/dot.S b/ggml/src/ggml-hexagon/kernels/dot.S new file mode 100755 index 0000000000000..2031a6001519b --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/dot.S @@ -0,0 +1,136 @@ +/**============================================================================= +@file + qhblas_f_vector_dot_af.S + +@brief + Calculates dot product of two input float vectors. + + Function prototype + + int32_t qhblas_f_vector_dot_af(float_a8_t *input_1, float_a8_t *input_2, float *output, uint32_t size); + + Reference C code + + int32_t qhblas_f_vector_dot_af(float_a8_t *input_1, float_a8_t *input_2, float *output, uint32_t size) + { + if ((input_1 == NULL) || (input_2 == NULL) || (output == NULL) || (size == 0)) + { + return -1; + } + + float dot = 0; + for (uint32_t i = 0; i < size; ++i) + { + dot += input_1[i] * input_2[i]; + } + + *output = dot; + return 0; + } + +Copyright (c) 2019 Qualcomm Technologies Incorporated. +All Rights Reserved. Qualcomm Proprietary and Confidential. +=============================================================================**/ + +/*============================================================================*/ + + .p2align 2 + .p2align 4,,15 + .global qhblas_f_vector_dot_af + .type qhblas_f_vector_dot_af, @function + +/*============================================================================*/ + +#define DC_PREFETCH_AHEAD 64 // number of bytes for DCFETCH +#define L2_PREFETCH_AHEAD 256 // number of bytes for L2FETCH +#define L2FETCH_CONFIG 0x0100FF00+(L2_PREFETCH_AHEAD/256) // [stride = 256 : width = 255 : height = bytes/256] +#define L2_PREFETCH_ELEMS L2_PREFETCH_AHEAD/8 // number of elements to prefetch with L2FETCH + +/*============================================================================*/ + +qhblas_f_vector_dot_af: +{ + p0 = !cmp.eq(r0,#0) // input_1 != NULL + p0 = !cmp.eq(r1,#0) // input_2 != NULL + p0 = !cmp.eq(r2,#0) // output != NULL + p0 = cmp.gtu(r3,#0) // size > 0 + if (!p0.new) jump:nt .L_ret +} +{ + r10 = #0 + r3 = lsr(r3,#1) // size / 2 + p1 = tstbit(r3,#0) // check for odd size + if(cmp.eq(r3.new,#0)) jump:nt .L_do_one +} +{ + r7:6 = #0 + r9:8 = #0 + r5 = add(r3,#7) // (size / 2) + 7 + p2 = cmp.gtu(r3,#L2_PREFETCH_ELEMS) // check whether we can do l2fetch +} +{ + r5 = lsr(r5,#3) // ceil(size / 2) + r14 = mux(p2,r3,#0) // set l2fetch counter +} +{ + r13:12 = combine(##L2FETCH_CONFIG,#8) // set l2fetch config and max number of iterations for .L_loop_do_two + loop1(.L_prefetch_loop_do_two,r5) +} + .falign +.L_prefetch_loop_do_two: +{ + dcfetch(r0+#DC_PREFETCH_AHEAD) // prefetch ahead for input_1 + r5 = min(r12,r3) // min(8, size / 2) +} +{ + dcfetch(r1+#DC_PREFETCH_AHEAD) // prefetch ahead for input_2 + loop0(.L_loop_do_two,r5) + p2 = cmp.eq(r3,r14) // check whether to do l2fetch + if (!p2.new) jump:t .L_loop_do_two +} +{ + r5 = 
add(r3,#-L2_PREFETCH_ELEMS) // number of elements left to prefetch ahead + r15 = add(r0,#L2_PREFETCH_AHEAD) // input_1 addr for l2fetch +} +{ + p2 = cmp.gtu(r5,#L2_PREFETCH_ELEMS) // check whether we can continue to do l2fetch + r15 = add(r1,#L2_PREFETCH_AHEAD) // input_2 addr for l2fetch + l2fetch(r15,r13) +} +{ + if (p2) r14 = add(r14,#-L2_PREFETCH_ELEMS) // adjust l2fetch counter + if (!p2) r14 = #0 // there are no more bytes left to prefetch ahead + l2fetch(r15,r13) +} + .falign +.L_loop_do_two: +{ + r7:6 = memd(r0++#8) + r9:8 = memd(r1++#8) + r10 += sfmpy(r7,r9) +} +{ + r10 += sfmpy(r6,r8) + r3 = add(r3,#-1) // adjust (size / 2) +}:endloop0:endloop1 +{ + r10 += sfmpy(r7,r9) + if (!p1) jump:nt .L_ret +} + .falign +.L_do_one: +{ + r4 = memw(r0) + r5 = memw(r1) +} +{ + r10 += sfmpy(r4,r5) +} + .falign +.L_ret: +{ + if (p0) memw(r2) = r10 + r0 = mux(p0,#0,#-1) + jumpr r31 +} + .size qhblas_f_vector_dot_af, .-qhblas_f_vector_dot_af diff --git a/ggml/src/ggml-hexagon/kernels/entry.c b/ggml/src/ggml-hexagon/kernels/entry.c index ea38beea673c0..8af93ea1d3082 100644 --- a/ggml/src/ggml-hexagon/kernels/entry.c +++ b/ggml/src/ggml-hexagon/kernels/entry.c @@ -34,7 +34,7 @@ int ggmlop_dsp_close(remote_handle64 handle) { return 0; } -AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) { +AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 mulmat_algo, int32 thread_counts) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); HAP_power_request_t request; memset(&request, 0, sizeof(HAP_power_request_t)); @@ -60,7 +60,7 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 request.type = HAP_power_set_DCVS_v2; request.dcvs_v2.dcvs_enable = TRUE; request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level; - if (dcvs_enabled) { + if (mulmat_algo) { request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE; request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE; } else { diff --git a/ggml/src/ggml-hexagon/kernels/mulmat.c b/ggml/src/ggml-hexagon/kernels/mulmat.c index f7494c8eaacf4..f34b6f8b09b4e 100644 --- a/ggml/src/ggml-hexagon/kernels/mulmat.c +++ b/ggml/src/ggml-hexagon/kernels/mulmat.c @@ -145,7 +145,6 @@ static void ggml_compute_forward_mul_mat_one_chunk(const ggml_tensor *src0, cons } } -//TODO: only support fp32 mulmat on cDSP static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); ggmlhexagon_dump_tensor(src0, 0); @@ -274,7 +273,6 @@ static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * return 0; } -//TODO:multithreading mulmat static int ggmlop_dsp_mulmat_multithread(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); diff --git a/ggml/src/ggml-hexagon/kernels/skel.c b/ggml/src/ggml-hexagon/kernels/skel.c index 26da58273f013..b216d66a654ab 100644 --- a/ggml/src/ggml-hexagon/kernels/skel.c +++ b/ggml/src/ggml-hexagon/kernels/skel.c @@ -289,8 +289,8 @@ extern int adsp_mmap_fd_getinfo(int, uint32_t *); #ifdef __cplusplus extern "C" { #endif -_ATTRIBUTE_VISIBILITY uint32_t ggmlop_skel_handle_invoke_qaic_version = 10048; -_ATTRIBUTE_VISIBILITY char 
ggmlop_skel_handle_invoke_uri[77+1]="file:///libggmlop-skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; +_ATTRIBUTE_VISIBILITY uint32_t ggmldsp_skel_handle_invoke_qaic_version = 10048; +_ATTRIBUTE_VISIBILITY char ggmldsp_skel_handle_invoke_uri[79+1]="file:///libggmldsp-skel.so?ggmldsp_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { int _nErr = 0; remote_arg* _praROutPostStart = _praROutPost; @@ -598,7 +598,7 @@ static __inline int _skel_method_3(int (*_pfn)(const char*, remote_handle64*), u _QAIC_CATCH(_nErr) {} return _nErr; } -__QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmlop_skel_handle_invoke)(remote_handle64 _h, uint32_t _sc, remote_arg* _pra) __QAIC_SKEL_ATTRIBUTE { +__QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmldsp_skel_handle_invoke)(remote_handle64 _h, uint32_t _sc, remote_arg* _pra) __QAIC_SKEL_ATTRIBUTE { switch(REMOTE_SCALARS_METHOD(_sc)){ case 0: return _skel_method_3(__QAIC_IMPL(ggmlop_dsp_open), _sc, _pra); diff --git a/ggml/src/ggml-hexagon/kernels/skel.h b/ggml/src/ggml-hexagon/kernels/skel.h index 194c71e6ecb2a..4850265ee504f 100644 --- a/ggml/src/ggml-hexagon/kernels/skel.h +++ b/ggml/src/ggml-hexagon/kernels/skel.h @@ -272,15 +272,12 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_open)(const char* uri, remote_ * @retval, 0 on success, should always succeed */ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 dcvs_enable, int32 threads) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 mulmat_algotype, int32 thread_counts) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_softmax)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_rmsnorm)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_pool2d)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; -#ifndef ggmlop_URI -#define ggmlop_URI "file:///libggmlop-skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" -#endif /*ggmlop_URI*/ #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-hexagon/kernels/stub.c b/ggml/src/ggml-hexagon/kernels/stub.c index 6074d243610df..7936c43cd6d77 100644 --- a/ggml/src/ggml-hexagon/kernels/stub.c +++ b/ggml/src/ggml-hexagon/kernels/stub.c @@ -312,9 +312,9 @@ static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_ 
}
 return _nErr;
}
-__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 dcvs_enable, int32 threads) __QAIC_STUB_ATTRIBUTE {
+__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 mulmat_algotype, int32 threads) __QAIC_STUB_ATTRIBUTE {
   uint32_t _mid = 2;
-  return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&dcvs_enable, (uint32_t*)&threads);
+  return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&mulmat_algotype, (uint32_t*)&threads);
 }
 static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) {
   int _nErr = 0;
diff --git a/ggml/src/ggml-hexagon/kernels/worker_pool.cpp b/ggml/src/ggml-hexagon/kernels/worker_pool.cpp
new file mode 100755
index 0000000000000..8186edcf18a95
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/worker_pool.cpp
@@ -0,0 +1,475 @@
+/**=============================================================================
+
+@file
+   worker_pool.cpp
+
+@brief
+   Utility providing a multi-priority thread worker pool for
+   multi-threaded computer vision (or other compute) applications.
+
+Copyright (c) 2019-2020 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+
+Export of this technology or software is regulated by the U.S.
+Government. Diversion contrary to U.S. law prohibited.
+
+All ideas, data and information contained in or disclosed by
+this document are confidential and proprietary information of
+Qualcomm Technologies Incorporated and all rights therein are expressly reserved.
+By accepting this material the recipient agrees that this material
+and the information contained therein are held in confidence and in
+trust and will not be used, copied, reproduced in whole or in part,
+nor its contents revealed in any manner to others without the express
+written permission of Qualcomm Technologies Incorporated.
+
+=============================================================================**/
+
+/*===========================================================================
+    INCLUDE FILE
+===========================================================================*/
+#include <stdio.h>   // snprintf
+#include <stdlib.h>  // malloc/free
+#include <string.h>  // strcat
+#include "worker_pool.h"
+
+#ifndef _DEBUG
+#define _DEBUG
+#endif
+#include "HAP_farf.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "qurt.h"
+#include "hexagon_protos.h"
+
+void worker_pool_constructor(void) __attribute__((constructor));
+void worker_pool_destructor(void) __attribute__((destructor));
+
+#ifdef __cplusplus
+}
+#endif
+
+/*===========================================================================
+    DEFINE
+===========================================================================*/
+#define WORKER_THREAD_STACK_SZ 2 *16384
+#define WORKER_KILL_SIGNAL 31 // signal to kill the worker threads
+#define NUM_JOB_SLOTS (MAX_NUM_WORKERS + 1) // max queued jobs, slightly more than number of workers.
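+// Design note: the job queue is built from two QuRT "anysignal" words. Each of
+// the NUM_JOB_SLOTS slots owns one bit: a set bit in empty_jobs means the slot
+// is free, a set bit in queued_jobs means the slot holds a pending job. Bit 31
+// is reserved for WORKER_KILL_SIGNAL below, which is why NUM_JOB_SLOTS must
+// stay below 31.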
+#define LOWEST_USABLE_QURT_PRIO 254 + +/*=========================================================================== + TYPEDEF +===========================================================================*/ +// internal structure kept in thread-local storage per instance of worker pool +typedef struct +{ + qurt_anysignal_t empty_jobs; // available job nodes + qurt_anysignal_t queued_jobs; // jobs that are waiting for a worker + qurt_mutex_t empty_jobs_mutex; // mutex for multiple threads trying to send a job + qurt_mutex_t queued_jobs_mutex; // mutex for multiple threads trying to acquire a job + unsigned int job_queue_mask; // mask for job queue nodes + unsigned int num_workers; // number of workers in this pool + worker_pool_job_t job[NUM_JOB_SLOTS]; // list of job descriptors + qurt_thread_t thread[MAX_NUM_WORKERS]; // thread ID's of the workers + void * stack[MAX_NUM_WORKERS]; // thread stack pointers +} worker_pool_t; + +// internal structure containing OS primitives to sync caller with all its spawned jobs. +typedef union +{ + worker_synctoken_t raw; + struct + { + unsigned int atomic_countdown; + unsigned int reserved; // reserved to align next element to 8 bytes + qurt_sem_t sem; + } sync; +} internal_synctoken_t; + +/*=========================================================================== + GLOBAL VARIABLES (per PD) +===========================================================================*/ +// initialized in constructor +unsigned int num_workers = 1; +unsigned int num_hvx128_contexts = 0; + +/*=========================================================================== + STATIC VARIABLES +===========================================================================*/ + +static worker_pool_context_t static_context = NULL; + +/*=========================================================================== + LOCAL FUNCTION +===========================================================================*/ +// the main workloop for each of the worker threads. +static void worker_pool_main(void* context) +{ + // local pointer to owning pool's context + worker_pool_t *me = (worker_pool_t *) context; + + // some local vars to reduce dereferencing inside loop + qurt_anysignal_t *signal = &me->queued_jobs; + unsigned int mask = me->job_queue_mask; + qurt_mutex_t *mutex = &me->queued_jobs_mutex; + + while(1) + { + qurt_mutex_lock(mutex); // mutex only allows 1 thread to wait on signal at a time. QuRT restriction. 
+        (void) qurt_anysignal_wait(signal, mask);                             // wait for a job
+        unsigned int sig_rx = Q6_R_ct0_R(mask & qurt_anysignal_get(signal));  // count trailing 0's to choose flagged job
+        if (sig_rx < NUM_JOB_SLOTS)                                           // if real job
+        {
+            worker_pool_job_t job = me->job[sig_rx];                          // local copy of job descriptor
+            (void) qurt_anysignal_clear(signal, (1 << sig_rx));               // clear the queued job signal
+            (void) qurt_anysignal_set(&me->empty_jobs, (1 << sig_rx));        // send node back to empty list
+            qurt_mutex_unlock(mutex);                                         // unlock the mutex
+            job.fptr(job.dptr);                                               // issue the callback
+        }
+        else if (WORKER_KILL_SIGNAL == sig_rx)
+        {
+            // don't clear the kill signal, leave it for all the workers to see, and exit
+            qurt_mutex_unlock(mutex);
+            qurt_thread_exit(0);
+        }
+        else{
+            FARF(HIGH,"Worker pool received invalid job %d", sig_rx );
+            qurt_mutex_unlock(mutex);
+        }
+        // else ignore
+    }
+}
+
+void worker_pool_constructor()
+{
+    FARF(HIGH, "In worker_pool constructor");
+    qurt_sysenv_max_hthreads_t num_threads;
+    if (QURT_EOK != qurt_sysenv_get_max_hw_threads(&num_threads))
+    {
+        num_workers = MAX_NUM_WORKERS; // Couldn't get number of threads from QuRT, default to MAX_NUM_WORKERS.
+        FARF(HIGH, "Failed to get number of threads. Defaulting to %u", num_workers);
+    }
+    else
+    {
+        num_workers = num_threads.max_hthreads;
+    }
+
+    /* Verify that number of hw threads isn't greater than max supported number of hw threads.
+       Max threads is used as a constant value for array size. */
+    if (num_workers > MAX_NUM_WORKERS)
+    {
+        num_workers = MAX_NUM_WORKERS;
+        FARF(HIGH, "Limiting number of threads to maximum supported value %u", num_workers);
+    }
+
+    num_hvx128_contexts = (qurt_hvx_get_units() >> 8) & 0xFF;
+
+    /* initialize static worker_pool for clients who pass NULL as context.*/
+    if (worker_pool_init(&static_context) != AEE_SUCCESS)
+    {
+        FARF(ERROR, "Could not initialize default worker pool");
+    }
+}
+
+AEEResult worker_pool_init_with_stack_size(worker_pool_context_t *context, int stack_size)
+{
+    int nErr = 0;
+
+    if(stack_size <= 0)
+    {
+        FARF(ERROR, "Stack size must be positive");
+        return AEE_EBADPARM;
+    }
+
+    if (NULL == context)
+    {
+        FARF(ERROR, "NULL context passed to worker_pool_init().");
+        return AEE_EBADPARM;
+    }
+
+    // Allocations
+    int size = (stack_size * num_workers) + (sizeof(worker_pool_t));
+    unsigned char *mem_blob = (unsigned char*)malloc(size);
+    if (!mem_blob)
+    {
+        FARF(ERROR,"Could not allocate memory for worker pool!!");
+        return AEE_ENOMEMORY;
+    }
+
+    worker_pool_t *me = (worker_pool_t *)(mem_blob + stack_size * num_workers);
+
+    // name for the first worker, useful in debugging threads
+    char name[19];
+    snprintf(name, 12, "0x%8x:", (int)me);
+    strcat(name, "worker0");
+    me->num_workers = num_workers;
+    // initializations
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        me->stack[i] = NULL;
+        me->thread[i] = 0;
+    }
+
+    // initialize job queue
+    qurt_anysignal_init(&(me->queued_jobs));
+    qurt_anysignal_init(&(me->empty_jobs));
+    qurt_mutex_init(&(me->empty_jobs_mutex));
+    qurt_mutex_init(&(me->queued_jobs_mutex));
+    me->job_queue_mask = (1 << NUM_JOB_SLOTS) - 1;                     // set a bit for each job node, number of job nodes = num_workers + 1
+    (void) qurt_anysignal_set(&(me->empty_jobs), me->job_queue_mask);  // fill the empty pool.
+    me->job_queue_mask |= (1 << WORKER_KILL_SIGNAL);                   // add the kill signal to the mask.
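+    // Note: the worker stacks and the pool descriptor share the one malloc'd
+    // blob above (stacks first, descriptor last), so worker_pool_deinit()
+    // releases the whole allocation by freeing stack[0].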
+
+    // launch the workers
+    qurt_thread_attr_t attr;
+    qurt_thread_attr_init (&attr);
+
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        // set up stack
+        me->stack[i] = mem_blob;
+        mem_blob += stack_size;
+        qurt_thread_attr_set_stack_addr(&attr, me->stack[i]);
+        qurt_thread_attr_set_stack_size(&attr, stack_size);
+
+        // set up name
+        qurt_thread_attr_set_name(&attr, name);
+        name[17] = (name[17] + 1);
+        // name threads context:worker0, context:worker1, .. (recycle at 9, but num threads should be less than that anyway)
+        if (name[17] > '9') name[17] = '0';
+        // set up priority - by default, match the creating thread's prio
+        int prio = qurt_thread_get_priority(qurt_thread_get_id());
+
+        // If loading thread has priority less than 64, load static worker pool with 64 priority.
+        if(context == &static_context && prio < 64) prio = 64;
+
+        if (prio < 1) prio = 1;
+        if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO;
+
+        qurt_thread_attr_set_priority(&attr, prio);
+
+        // launch
+        nErr = qurt_thread_create(&(me->thread[i]), &attr, worker_pool_main, (void *)me);
+        if (nErr)
+        {
+            FARF(ERROR, "Could not launch worker threads!");
+            worker_pool_deinit((worker_pool_context_t*)&me);
+            return AEE_EQURTTHREADCREATE;
+        }
+    }
+    *context = (worker_pool_context_t*)me;
+    return AEE_SUCCESS;
+}
+
+AEEResult worker_pool_init(worker_pool_context_t *context)
+{
+    return worker_pool_init_with_stack_size(context, WORKER_THREAD_STACK_SZ);
+}
+
+
+// clean up worker pool
+void worker_pool_deinit(worker_pool_context_t *context)
+{
+    worker_pool_t *me = (worker_pool_t*)*context;
+
+    // if no worker pool exists, there is nothing to clean up.
+    if (NULL == me)
+    {
+        return;
+    }
+
+    // de-initializations
+    (void) qurt_anysignal_set(&(me->empty_jobs), (1 << WORKER_KILL_SIGNAL));   // notify to stop new jobs.
+    (void) qurt_anysignal_set(&(me->queued_jobs), (1 << WORKER_KILL_SIGNAL));  // kill worker pool.
+    for (unsigned int i = 0; i < me->num_workers; i++)                         // wait for workers to die
+    {
+        if (me->thread[i])
+        {
+            int status;
+            (void) qurt_thread_join(me->thread[i], &status);
+        }
+    }
+
+    // release resources
+    qurt_mutex_destroy(&(me->empty_jobs_mutex));
+    qurt_mutex_destroy(&(me->queued_jobs_mutex));
+    qurt_anysignal_destroy(&(me->queued_jobs));
+    qurt_anysignal_destroy(&(me->empty_jobs));
+    // free allocated memory (they were allocated as a single buffer starting at stack[0])
+    if (me->stack[0]) free (me->stack[0]);
+    // Assign NULL to freed context so that further reference to it fails.
+    *context = NULL;
+}
+
+// submit a job to the pool.
+AEEResult worker_pool_submit(worker_pool_context_t context, worker_pool_job_t job)
+{
+    worker_pool_t *me = (worker_pool_t*)context;
+
+    // if NULL is passed as worker_pool_context, try to use default static worker_pool
+    if (NULL == me)
+    {
+        if (static_context == NULL)
+        {
+            FARF(HIGH, "No default static worker pool found");
+            return AEE_ERESOURCENOTFOUND;
+        }
+        FARF(MEDIUM, "Using default static worker pool");
+        me = (worker_pool_t*)static_context;
+    }
+
+    // if a worker thread tries to submit a job, call it in-context to avoid recursion deadlock.
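+    // (a worker blocking here on a full queue could deadlock the pool, since
+    // job slots are only recycled by these same worker threads; running the
+    // job directly in the caller's context avoids that)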
+ unsigned int i; + qurt_thread_t id = qurt_thread_get_id(); + for (i = 0; i < me->num_workers; i++) + { + if (id == me->thread[i]) + { + job.fptr(job.dptr); // issue the callback in caller's context + return AEE_SUCCESS; + } + } + + // local vars to reduce dereferencing + qurt_mutex_t *mutex = &me->empty_jobs_mutex; + qurt_anysignal_t *signal = &me->empty_jobs; + unsigned int mask = me->job_queue_mask; + + qurt_mutex_lock(mutex); // lock empty queue + (void) qurt_anysignal_wait(signal, mask); // wait for an empty job node + unsigned int bitfield = qurt_anysignal_get(signal); + + // check if pool is being killed and return early + if (bitfield & (1 << WORKER_KILL_SIGNAL)) + { + qurt_mutex_unlock(mutex); + return AEE_ENOMORE; + } + + // send the job to the queue. + unsigned int sig_rx = Q6_R_ct0_R(mask & bitfield); // count trailing 0's to find first avail node + me->job[sig_rx] = job; // copy job descriptor + (void) qurt_anysignal_clear(signal, (1 << sig_rx)); // clear the empty job node flag + (void) qurt_anysignal_set(&me->queued_jobs, (1 << sig_rx)); // notify of pending job + qurt_mutex_unlock(mutex); // unlock the mutex + + return 0; +} + +void worker_pool_destructor() +{ + FARF(HIGH, "In worker_pool destructor"); + + worker_pool_deinit(&static_context); +} + +/*=========================================================================== + GLOBAL FUNCTION +===========================================================================*/ +// initialize a synctoken - caller will wait on the synctoken and each job will release it. +// caller wakes when all jobs have released. +void worker_pool_synctoken_init(worker_synctoken_t *token, unsigned int njobs) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // initialize atomic counter and semaphore + internal_token->sync.atomic_countdown = njobs; + qurt_sem_init_val(&internal_token->sync.sem, 0); +} + +// worker job responsible for calling this function to count down completed jobs. +void worker_pool_synctoken_jobdone(worker_synctoken_t *token) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // count down atomically, and raise semaphore if last job. + if (0 == worker_pool_atomic_dec_return(&internal_token->sync.atomic_countdown)) + { + (void) qurt_sem_up(&internal_token->sync.sem); + } +} + +// job submitter waits on this function for all jobs to complete. +void worker_pool_synctoken_wait(worker_synctoken_t *token) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // Wait for all jobs to finish and raise the semaphore + (void) qurt_sem_down(&internal_token->sync.sem); + + // clean up the semaphore + (void) qurt_sem_destroy(&internal_token->sync.sem); +} + +AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio) +{ + worker_pool_t *me = (worker_pool_t*)context; + + // if no worker pool exists, return error. 
+    if (NULL == me)
+    {
+        return AEE_ENOMORE;
+    }
+
+    int result = AEE_SUCCESS;
+    if (prio < 1) prio = 1;
+    if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO;
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        int res = qurt_thread_set_priority(me->thread[i], (unsigned short)prio);
+        if (0 != res)
+        {
+            result = AEE_EBADPARM;
+            FARF(ERROR, "QURT failed to set priority of thread %d, ERROR = %d", me->thread[i], res);
+        }
+    }
+    return result;
+}
+
+AEEResult worker_pool_retrieve_threadID(worker_pool_context_t context, unsigned int* threadIDs) {
+
+    worker_pool_t *me = (worker_pool_t*)context;
+    if(me == NULL)
+    {
+        FARF(ERROR, "Context NULL in RetrieveThreadID");
+        return AEE_EBADPARM;
+    }
+
+    for(int i = 0; i < me->num_workers; i++)
+    {
+        threadIDs[i] = me->thread[i];
+        FARF(MEDIUM, "Inside RetrieveThreadID threadIDs[%d] is %d",i,threadIDs[i]);
+    }
+    return AEE_SUCCESS;
+}
+
+
+AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int *prio)
+{
+    worker_pool_t *me = (worker_pool_t*)context;
+
+    // if NULL is passed as context, share static_context's priority.
+    if (NULL == me)
+    {
+        if (static_context == NULL)
+            return AEE_ENOMORE;
+        FARF(HIGH, "Using default static worker pool");
+        me = (worker_pool_t*)static_context;
+    }
+
+    int priority = qurt_thread_get_priority(me->thread[0]);
+    if (priority > 0)
+    {
+        *prio = priority;
+        return 0;
+    }
+    else
+    {
+        *prio = 0;
+        return AEE_EBADSTATE;
+    }
+}
diff --git a/ggml/src/ggml-hexagon/kernels/worker_pool.h b/ggml/src/ggml-hexagon/kernels/worker_pool.h
new file mode 100755
index 0000000000000..701cbf6215f43
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/worker_pool.h
@@ -0,0 +1,329 @@
+#ifndef WORKER_H
+#define WORKER_H
+
+/**=============================================================================
+
+@file
+   worker_pool.h
+
+@brief
+   Utility providing a thread worker pool for multi-threaded computer vision
+   (or other compute) applications.
+
+Copyright (c) 2019-2020 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+
+Export of this technology or software is regulated by the U.S.
+Government. Diversion contrary to U.S. law prohibited.
+
+All ideas, data and information contained in or disclosed by
+this document are confidential and proprietary information of
+Qualcomm Technologies Incorporated and all rights therein are expressly reserved.
+By accepting this material the recipient agrees that this material
+and the information contained therein are held in confidence and in
+trust and will not be used, copied, reproduced in whole or in part,
+nor its contents revealed in any manner to others without the express
+written permission of Qualcomm Technologies Incorporated.
+
+=============================================================================**/
+//==============================================================================
+// Defines
+//==============================================================================
+/// MACRO enables function to be visible in shared-library case.
+#define WORKERPOOL_API __attribute__ ((visibility ("default")))
+
+//==============================================================================
+// Include Files
+//==============================================================================
+
+#include <AEEStdDef.h>  // AEEResult
+#include <AEEStdErr.h>  // AEE_* error codes
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*===========================================================================
+    TYPEDEF
+===========================================================================*/
+/// signature of callbacks to be invoked by worker threads
+typedef void ( *worker_callback_t )( void* );
+
+/// Typedef of worker_pool context
+typedef void* worker_pool_context_t;
+
+/// descriptor for requested callback
+typedef struct
+{
+    /// function pointer
+    worker_callback_t fptr;
+    /// data pointer
+    void* dptr;
+} worker_pool_job_t;
+
+/// opaque client view of synchronization token for job submitter and workers. Internals hidden in implementation.
+typedef struct
+{
+    /// opaque array to store synchronization token for job
+    unsigned int dummy[8]; // large enough to hold a counter and a semaphore
+} worker_synctoken_t __attribute__((aligned(8)));
+
+/*===========================================================================
+    CONSTANTS
+===========================================================================*/
+/// Maximum supported number of worker threads.
+
+#define MAX_NUM_WORKERS 8
+/// Number of workers
+WORKERPOOL_API extern unsigned int num_workers;
+/// Maximum number of hvx 128 bytes units available
+WORKERPOOL_API extern unsigned int num_hvx128_contexts;
+
+//==============================================================================
+// Declarations
+//==============================================================================
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Initialize a worker pool. Should be called by each control thread that
+///   requires its own worker pool.
+///
+///
+/// @param *context
+///   pointer to worker_pool_context_t variable.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_init(worker_pool_context_t *context);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Initialize a worker pool with custom stack size of worker threads.
+///   Should be called by each control thread that requires its own worker pool.
+///
+///
+/// @param *context
+///   pointer to worker_pool_context_t variable.
+/// @param stack_size
+///   stack size of each worker thread.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_init_with_stack_size(worker_pool_context_t *context, int stack_size);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Kill worker threads and release worker pool resources. Must be called
+///   when pool owner no longer requires the pool.
+///
+///
+/// @param *context
+///   worker_pool_context_t.
+///
+//---------------------------------------------------------------------------
+WORKERPOOL_API void
+worker_pool_deinit(worker_pool_context_t *context);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Function to determine if there is an established worker pool available to
+///   the calling thread.
This is an optional call - if no pool is available +/// but attempted to be used, everything works seamlessly, in the client's +/// context (instead of worker context). +/// +/// +/// @param context +/// worker_pool_context_t. +/// +/// @return +/// 0 - no worker pool available. +/// any other value - worker pool available. +//--------------------------------------------------------------------------- +WORKERPOOL_API AEEResult +worker_pool_available(worker_pool_context_t context); + +//--------------------------------------------------------------------------- +/// @brief +/// Submit a job to the worker pool. +/// +/// +/// @param context +/// worker pool context where job is to be submitted. +/// +/// @param job +/// callback function pointer and data. +/// +/// @return +/// 0 - success. +/// any other value - failure. +//--------------------------------------------------------------------------- +WORKERPOOL_API AEEResult +worker_pool_submit(worker_pool_context_t context, worker_pool_job_t job); + +//--------------------------------------------------------------------------- +/// @brief +/// Initialize a synchronization token for job submitter and workers to use. +/// Each worker callback must be given access to the token to release it, and +/// job submitter will wait for all jobs to release the token. Internals are +/// hidden from client. +/// +/// +/// @param token +/// pointer to the synctoken structure. +/// +/// @param njobs +/// number of jobs that will be releasing the token +//--------------------------------------------------------------------------- +WORKERPOOL_API void +worker_pool_synctoken_init(worker_synctoken_t *token, unsigned int njobs); + +//--------------------------------------------------------------------------- +/// @brief +/// Needs to be called by the worker in the callback before exiting. The +/// token must be available to the callback via the data pointer given +/// to the callback during job submission. +/// +/// +/// @param token +/// pointer to the synctoken structure held by the job submitter +//--------------------------------------------------------------------------- +WORKERPOOL_API void +worker_pool_synctoken_jobdone(worker_synctoken_t *token); + +//--------------------------------------------------------------------------- +/// @brief +/// Job submitter calls this function after submitting all jobs to await +/// their completion. +/// +/// +/// @param token +/// pointer to the synctoken structure +//--------------------------------------------------------------------------- +WORKERPOOL_API void +worker_pool_synctoken_wait(worker_synctoken_t *token); + +//--------------------------------------------------------------------------- +/// @brief +/// Set the thread priority of the worker threads. Specified priority will +/// be applied to all threads in the default worker pool. The threads +/// that service boosted and background job requests will also be adjusted to be relative +/// to the new default thread priority. +/// +/// +/// @param context +/// worker pool context whose workers' priorities are to be changed. +/// +/// @param prio +/// desired priority. 1 is the highest priority allowed. 255 is the lowest priority allowed. +/// +/// @return +/// 0 - success. +/// any other value - failure. 
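+///
+/// Typical call pattern for this pool, as an illustrative sketch (my_cb and
+/// my_data are placeholder names; my_cb is expected to call
+/// worker_pool_synctoken_jobdone() on the token reachable through its data
+/// pointer, per the contract documented above):
+///
+///   worker_pool_context_t pool = NULL;
+///   worker_synctoken_t token;
+///   if (AEE_SUCCESS == worker_pool_init(&pool)) {
+///       worker_pool_synctoken_init(&token, num_workers);
+///       for (unsigned int i = 0; i < num_workers; i++) {
+///           worker_pool_job_t job = { my_cb, &my_data[i] };
+///           worker_pool_submit(pool, job);
+///       }
+///       worker_pool_synctoken_wait(&token);
+///       worker_pool_deinit(&pool);
+///   }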
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Query the thread priority of the default worker threads. This will return
+///   the current priority for one of the workers, which are all created
+///   with the same priority. If a user callback has changed one or more worker threads independently,
+///   there is no guarantee on which worker's priority is returned by this function.
+///
+///
+/// @param context
+///   worker pool context whose workers' priorities are asked.
+///
+/// @param prio
+///   the current priority is returned here. 1 is the highest priority allowed. 255 is the lowest priority allowed.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int *prio);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Utility inline to atomically increment a variable. Useful in
+///   synchronizing jobs among worker threads, in cases where all
+///   job-related info can be determined by the job number.
+///
+///
+/// @param target
+///   pointer to the variable being incremented
+///
+/// @return
+///   the value after incrementing
+//---------------------------------------------------------------------------
+static inline unsigned int
+worker_pool_atomic_inc_return(unsigned int *target)
+{
+    unsigned int result;
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, #1)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target)
+        : "p0");
+    return result;
+}
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Utility inline to atomically decrement a variable.
+///
+///
+/// @param target
+///   pointer to the variable being decremented
+///
+/// @return
+///   the value after decrementing
+//---------------------------------------------------------------------------
+static inline unsigned int
+worker_pool_atomic_dec_return(unsigned int *target)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, #-1)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target)
+        : "p0");
+    return result;
+}
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Queries and returns the thread IDs of all the active threads in the worker pool.
+///
+///
+/// @param context
+///   worker pool context whose workers' IDs are asked.
+///
+/// @param threadIDs
+///   pointer to the array created by the user where thread IDs will be written to.
+///
+/// @return
+///   0 - success.
+/// 0E - Invalid parameter +//--------------------------------------------------------------------------- +WORKERPOOL_API AEEResult +worker_pool_retrieve_threadID(worker_pool_context_t context, unsigned int* threadIDs); +#ifdef __cplusplus +} +#endif + +#endif // #ifndef WORKER_H diff --git a/ggml/src/ggml-kompute/kompute b/ggml/src/ggml-kompute/kompute new file mode 160000 index 0000000000000..4565194ed7c32 --- /dev/null +++ b/ggml/src/ggml-kompute/kompute @@ -0,0 +1 @@ +Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306 diff --git a/models/t5-very-small-random-F32.gguf b/models/t5-very-small-random-F32.gguf new file mode 100644 index 0000000000000..fd386d88562d2 Binary files /dev/null and b/models/t5-very-small-random-F32.gguf differ diff --git a/prebuilts/Hexagon_SDK/.lock b/prebuilts/Hexagon_SDK/.lock new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h new file mode 100755 index 0000000000000..fdbfc1136d556 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h @@ -0,0 +1,50 @@ +//============================================================================= +// +// Copyright (c) 2020-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN CPU Common components + * + * This file defines versioning and other identification details + * and supplements QnnCommon.h for CPU backend + */ + +#ifndef QNN_CPU_COMMON_H +#define QNN_CPU_COMMON_H + +#include "QnnCommon.h" + +/// CPU Backend identifier +#define QNN_BACKEND_ID_CPU 3 + +/// CPU interface provider +#define QNN_CPU_INTERFACE_PROVIDER_NAME "CPU_QTI_AISW" + +// CPU API Version values +#define QNN_CPU_API_VERSION_MAJOR 1 +#define QNN_CPU_API_VERSION_MINOR 1 +#define QNN_CPU_API_VERSION_PATCH 0 + +// clang-format off +/// Macro to set Qnn_ApiVersion_t for CPU backend +#define QNN_CPU_API_VERSION_INIT \ + { \ + { \ + QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_MINOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_PATCH /*coreApiVersion.major*/ \ + }, \ + { \ + QNN_CPU_API_VERSION_MAJOR, /*backendApiVersion.major*/ \ + QNN_CPU_API_VERSION_MINOR, /*backendApiVersion.minor*/ \ + QNN_CPU_API_VERSION_PATCH /*backendApiVersion.patch*/ \ + } \ + } + +// clang-format on + +#endif // QNN_CPU_COMMON_H \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h new file mode 100755 index 0000000000000..750cfd0b501f1 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h @@ -0,0 +1,117 @@ +//============================================================================= +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN CPU component Graph API. 
+ * + * The interfaces in this file work with the top level QNN + * API and supplements QnnGraph.h for CPU backend + */ + +#ifndef QNN_CPU_GRAPH_H +#define QNN_CPU_GRAPH_H + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief This enum provides different CPU graph configuration + * options associated with QnnGraph + */ +typedef enum { + QNN_CPU_GRAPH_CONFIG_OPTION_OP_DEBUG_CALLBACK = 1, + QNN_CPU_GRAPH_CONFIG_OPTION_UNDEFINED = 0x7fffffff +} QnnCpuGraph_ConfigOption_t; + +/* @brief CallBack function pointer to be filled by user. + * This callback will be called after each op execution. + * Only outputTensor id and data buffer is valid, consumable. + * Memory is owned by BE which is valid throughout the callback. + * Client should not update any parameter and argument of opConfig. + * NULL tensor/buffer indicate invalid data buffer. + */ +typedef Qnn_ErrorHandle_t (*QnnCpuGraph_OpDebugCallback_t)(Qnn_OpConfig_t* opConfig, + void* callBackParam); + +/* @brief Structure to be filled by user. + * This structure will have callback function and callback reference data. + * Memory is owned by BE which is valid throughout the callback. + * Client should not update any parameter and argument of opConfig. + * NULL callback function indicate no debug option. + */ +typedef struct { + void* callBackParam; + QnnCpuGraph_OpDebugCallback_t cpuGraphOpDebugCallback; +} QnnCpuGraph_OpDebug_t; + +// clang-format off +/// QnnCpuGraph_OpDebug_t initializer macro +#define QNN_CPU_GRAPH_OP_DEBUG_INIT \ + { \ + NULL, /*callBackParam*/ \ + NULL /*cpuGraphOpDebugCallback*/ \ + } +// clang-format on + +//============================================================================= +// Public Functions +//============================================================================= + +//------------------------------------------------------------------------------ +// Implementation Definition +//------------------------------------------------------------------------------ + +/** + * @brief Structure describing the set of configurations supported by graph. + * Objects of this type are to be referenced through QnnGraph_CustomConfig_t. + * + * The struct has two fields - option and a union of corresponding config values + * Based on the option corresponding item in the union can be used to specify + * config. 
+ * Below is the map between QnnCpuGraph_ConfigOption_t and config value + * + * \verbatim embed:rst:leading-asterisk + * +----+------------------------------------------+------------------------------------+ + * | # | Config Option | Configuration Struct/value | + * +====+==========================================+====================================+ + * | 1 | QNN_CPU_GRAPH_CONFIG_DEBUG_CALLBACK | QnnCpuGraph_OpDebug_t | + * +----+------------------------------------------+------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnCpuGraph_ConfigOption_t option; + union UNNAMED { + QnnCpuGraph_OpDebug_t cpuGraphOpDebug; + }; +} QnnCpuGraph_CustomConfig_t; + +/// QnnCpuGraph_CustomConfig_t initializer macro +#define QNN_CPU_GRAPH_CUSTOM_CONFIG_INIT \ + { \ + QNN_CPU_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/ \ + { \ + QNN_CPU_GRAPH_OP_DEBUG_INIT /*cpuGraphOpDebugCallback*/ \ + } \ + } + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h new file mode 100755 index 0000000000000..97bdab8dfd3f9 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h @@ -0,0 +1,224 @@ +//============================================================================== +// +// Copyright (c) 2020-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** @file + * @brief CPU Operation Package component API + * + * Provides interface to interact with OpPackage libraries registered + * with the CPU backend. + */ + +#ifndef QNN_CPU_OP_PACKAGE_H +#define QNN_CPU_OP_PACKAGE_H + +#include "CPU/QnnCpuCommon.h" +#include "QnnGraph.h" +#include "QnnOpPackage.h" +#include "QnnTypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define QNN_CPUOPPACKAGE_TENSOR_DATA_FORMAT_FLAT_BUFFER 0 + +/** + * @brief A value representing a tensor data format. + */ +typedef uint32_t QnnCpuOpPackage_TensorDataFormat_t; + +/** + * @brief A value representing a profile data in ms. + */ +typedef double QnnCpuOpPackage_ProfileData_t; + +/** + * @brief An enum to specify a param type. + */ +typedef enum { + QNN_CPU_PARAMTYPE_SCALAR = 0, + QNN_CPU_PARAMTYPE_TENSOR = 1, + QNN_CPU_PARAMTYPE_STRING = 2, + // Unused, present to ensure 32 bits. + QNN_CPU_PARAMTYPE_UNDEFINED = 0xFFFFFFFF +} QnnCpuOpPackage_ParamType_t; + +/** + * @brief An enum to specify tensor data type. + */ +typedef enum { + QNN_CPU_DATATYPE_BOOL_8 = 0x0508, + QNN_CPU_DATATYPE_INT_8 = 0x0008, + QNN_CPU_DATATYPE_INT_32 = 0x0032, + QNN_CPU_DATATYPE_UINT_8 = 0x0108, + QNN_CPU_DATATYPE_UINT_32 = 0x0132, + QNN_CPU_DATATYPE_FLOAT_32 = 0x0232, + // Unused, present to ensure 32 bits. + QNN_CPU_DATATYPE_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_DataType_t; + +/** + * @brief An enum to specify logging level. + */ +typedef enum { + QNN_CPU_MSG_ERROR = 1, + QNN_CPU_MSG_DEBUG = 2, + QNN_CPU_MSG_LOW = 3, + QNN_CPU_MSG_MED = 4, + QNN_CPU_MSG_HIGH = 5, + // Unused, present to ensure 32 bits + QNN_CPU_MSG_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_MsgType_t; + +/** + * @brief An enum to specify the profiling type. 
+ */ +typedef enum { + QNN_CPU_PROFILE_BASIC = 1, + QNN_CPU_PROFILE_DETAILED = 2, + // Unused, present to ensure 32 bits + QNN_CPU_PROFILE_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_ProfileType_t; + +/** + * @brief A struct which defines the Global infrastructure. + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + // Message + void (*reportMessage)(QnnCpuOpPackage_MsgType_t msgType, const char* msg, ...); + + // Profile + void (*profile)(QnnCpuOpPackage_ProfileType_t profileType, + QnnCpuOpPackage_ProfileData_t timeInMsec); +} QnnCpuOpPackage_GlobalInfra_t; + +// clang-format off +/// QnnCpuOpPackage_GlobalInfra_t initializer macro +#define QNN_CPU_OP_PACKAGE_GLOBAL_INFRA_INIT \ + { \ + NULL, /*reportMessage*/ \ + NULL /*profile*/ \ + } +// clang-format on + +typedef Qnn_ErrorHandle_t (*QnnCpuOpPackage_OpImplFn_t)(void* opPkgNodeData); + +/** + * @brief A struct which defines the OpImpl definition. + */ +typedef struct _QnnOpPackage_OpImpl_t { + QnnCpuOpPackage_OpImplFn_t opImplFn; + void* userData; +} QnnCpuOpPackage_OpImpl_t; + +// clang-format off +/// QnnCpuOpPackage_OpImpl_t initializer macro +#define QNN_CPU_OP_PACKAGE_OPIMPL_INIT \ + { \ + NULL, /*kernelFn*/ \ + NULL /*userData*/ \ + } +// clang-format on + +/** + * @brief A struct which describes the properties of a tensor. + * + */ +typedef struct { + QnnCpuOpPackage_TensorDataFormat_t dataFormat; + QnnCpuOpPackage_DataType_t dataType; + uint32_t rank; + uint32_t* maxDimensions; + uint32_t* currentDimensions; + void* data; + Qnn_QuantizeParams_t quantizeParams; +} QnnCpuOpPackage_Tensor_t; + +// clang-format off +/// QnnCpuOpPackage_Tensor_t initializer macro +#define QNN_CPU_OP_PACKAGE_TENSOR_INIT \ + { \ + QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, /*dataFormat*/ \ + QNN_CPU_DATATYPE_UNDEFINED, /*dataType*/ \ + 0, /*rank*/ \ + NULL, /*maxDimensions*/ \ + NULL, /*currentDimensions*/ \ + NULL, /*data*/ \ + QNN_QUANTIZE_PARAMS_INIT /*quantizeParams*/ \ + } +// clang-format on + +/** + * @brief A struct which describes the parameters of a node. + * + */ +typedef struct { + QnnCpuOpPackage_ParamType_t type; + const char* name; + union { + double scalarParam; + const char* string; + QnnCpuOpPackage_Tensor_t* tensorParam; + }; +} QnnCpuOpPackage_Param_t; + +// clang-format off +/// QnnCpuOpPackage_Param_t initializer macro +#define QNN_CPU_OP_PACKAGE_PARAM_INIT \ + { \ + QNN_CPU_PARAMTYPE_UNDEFINED, /*type*/ \ + NULL, /*name*/ \ + { \ + 0 /*scalarParam*/ \ + } \ + } +// clang-format on + +/** + * @brief A struct which describes the node. + * + */ +typedef struct _QnnOpPackage_Node_t { + const char* name; + const char* packageName; + const char* typeName; + uint32_t numOfParams; + QnnCpuOpPackage_Param_t** params; + uint32_t numOfInputs; + QnnCpuOpPackage_Tensor_t** inputs; + uint32_t numOfOutputs; + QnnCpuOpPackage_Tensor_t** outputs; +} QnnCpuOpPackage_Node_t; + +// clang-format off +/// QnnCpuOpPackage_Node_t initializer macro +#define QNN_CPU_OP_PACKAGE_NODE_INIT \ + { \ + NULL, /*name*/ \ + NULL, /*packageName*/ \ + NULL, /*typeName*/ \ + 0, /*numOfParams*/ \ + NULL, /*params*/ \ + 0, /*numOfInputs*/ \ + NULL, /*inputs*/ \ + 0, /*numOfOutputs*/ \ + NULL /*outputs*/ \ + } +// clang-format on + +/** + * @brief Graph infrastructure. 
+ * + */ +typedef _QnnOpPackage_GraphInfrastructure_t QnnCpuOpPackage_GraphInfrastructure_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // QNN_CPU_OP_PACKAGE_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h new file mode 100755 index 0000000000000..e2b6c69dffbdf --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h @@ -0,0 +1,108 @@ +//============================================================================= +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN DSP component Backend API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnBackend.h for DSP backend + */ + +#ifndef QNN_DSP_BACKEND_H +#define QNN_DSP_BACKEND_H + +#include "QnnBackend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +//============================================================================= +// Public Functions +//============================================================================= + +//------------------------------------------------------------------------------ +// Implementation Definition +//------------------------------------------------------------------------------ + +// clang-format off + +/* @brief Enum describing the set of custom configs supported by DSP backend. +*/ +typedef enum { + /// The accelerator will always attempt to fold relu activation + /// into the immediate preceding convolution operation. This optimization + /// is correct when quantization ranges for convolution are equal or + /// subset of the Relu operation. For graphs, where this cannot be + /// guaranteed, the client should set this option to true + QNN_DSP_BACKEND_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 0, + /// The accelerator will always attempt to all Convolution + /// operation using HMX instructions. Convolution that have + /// short depth and/or weights that are not symmetric could + /// exhibit inaccurate results. In such cases, clients must + /// set this option to true to guarantee correctness of the operation + QNN_DSP_BACKEND_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF = 1, + /// Every APP side user process that uses a DSP via FastRPC + /// has a corresponding dynamic user process domain on the DSP side. + /// QNN by default opens RPC session as unsigned PD, + /// in case this option is set to true, + /// RPC session will be opened as signed PD (requires signed .so). 
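+  /// A non-normative sketch of requesting a signed PD through this option
+  /// (the cfg variable and its wiring into the QnnBackend custom config
+  /// list are assumed):
+  /// @code
+  /// QnnDspBackend_CustomConfig_t cfg = QNN_DSP_BACKEND_CUSTOM_CONFIG_INIT;
+  /// cfg.option = QNN_DSP_BACKEND_CONFIG_OPTION_USE_SIGNED_PROCESS_DOMAIN;
+  /// cfg.useSignedProcessDomain = true;
+  /// @endcode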
+ QNN_DSP_BACKEND_CONFIG_OPTION_USE_SIGNED_PROCESS_DOMAIN = 2, + /// set QnnDspBackend_DspArch_t for offline prepare mode + QNN_DSP_BACKEND_CONFIG_OPTION_ARCH = 3, + /// UNKNOWN enum option that must not be used + QNN_DSP_BACKEND_CONFIG_OPTION_UNKNOWN = 0x7fffffff +} QnnDspBackend_ConfigOption_t; + +typedef enum { + QNN_DSP_BACKEND_DSP_ARCH_NONE = 0, + QNN_DSP_BACKEND_DSP_ARCH_V65 = 65, + QNN_DSP_BACKEND_DSP_ARCH_V66 = 66, + QNN_DSP_BACKEND_DSP_ARCH_V68 = 68, + QNN_DSP_BACKEND_DSP_ARCH_V69 = 69, + QNN_DSP_BACKEND_DSP_ARCH_V73 = 73, + QNN_DSP_BACKEND_DSP_ARCH_UNKNOWN = 0x7fffffff +} QnnDspBackend_DspArch_t; + +/** + * @brief Structure describing the set of configurations supported by the backend. + * Objects of this type are to be referenced through QnnBackend_CustomConfig_t. + */ +typedef struct QnnDspBackend_CustomConfig { + QnnDspBackend_ConfigOption_t option; + union UNNAMED { + bool foldReluActivationIntoConvOff; + bool shortDepthConvOnHmxOff; + bool useSignedProcessDomain; + QnnDspBackend_DspArch_t arch; + }; +} QnnDspBackend_CustomConfig_t ; + +/// QnnDspBackend_CustomConfig_t initializer macro +#define QNN_DSP_BACKEND_CUSTOM_CONFIG_INIT \ + { \ + QNN_DSP_BACKEND_CONFIG_OPTION_UNKNOWN, /*option*/ \ + { \ + false /*foldReluActivationIntoConvOff*/ \ + } \ + } + +// clang-format on +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h new file mode 100755 index 0000000000000..8b5ad49d04d6e --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h @@ -0,0 +1,61 @@ +//============================================================================= +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+//
+//=============================================================================

+/** @file
+ *  @brief QNN DSP Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for DSP backend
+ */

+#ifndef QNN_DSP_COMMON_H
+#define QNN_DSP_COMMON_H

+#include "QnnCommon.h"

+/// DSP Backend identifier
+#define QNN_BACKEND_ID_DSP 5

+/// DSP interface provider
+#define QNN_DSP_INTERFACE_PROVIDER_NAME "DSP_QTI_AISW"

+// DSP API Version values
+#define QNN_DSP_API_VERSION_MAJOR 5
+#define QNN_DSP_API_VERSION_MINOR 0
+#define QNN_DSP_API_VERSION_PATCH 1

+// clang-format off

+/// Macro to set Qnn_ApiVersion_t for DSP backend
+#define QNN_DSP_API_VERSION_INIT                             \
+  {                                                          \
+    {                                                        \
+      QNN_API_VERSION_MAJOR,     /*coreApiVersion.major*/    \
+      QNN_API_VERSION_MINOR,     /*coreApiVersion.minor*/    \
+      QNN_API_VERSION_PATCH      /*coreApiVersion.patch*/    \
+    },                                                       \
+    {                                                        \
+      QNN_DSP_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_DSP_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_DSP_API_VERSION_PATCH  /*backendApiVersion.patch*/ \
+    }                                                        \
+  }

+// clang-format on

+// DSP Binary Version values
+#define QNN_DSP_BINARY_VERSION_MAJOR 1
+#define QNN_DSP_BINARY_VERSION_MINOR 0
+#define QNN_DSP_BINARY_VERSION_PATCH 0

+// DSP Context blob Version values
+#define QNN_DSP_CONTEXT_BLOB_VERSION_MAJOR 1
+#define QNN_DSP_CONTEXT_BLOB_VERSION_MINOR 0
+#define QNN_DSP_CONTEXT_BLOB_VERSION_PATCH 0

+#endif // QNN_DSP_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h
new file mode 100755
index 0000000000000..eecf62f5cbc02
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h
@@ -0,0 +1,46 @@
+//=============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================

+/** @file
+ *  @brief QNN DSP component Device API.
+ * + * The interfaces in this file work with the top level QNN + * API and supplements QnnDevice.h for DSP backend + */ +#ifndef QNN_DSP_DEVICE_H +#define QNN_DSP_DEVICE_H + +#include "QnnDevice.h" +#include "QnnDspPerfInfrastructure.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _QnnDevice_Infrastructure_t { + QnnDspPerfInfrastructure_CreatePowerConfigIdFn_t createPowerConfigId; + QnnDspPerfInfrastructure_DestroyPowerConfigIdFn_t destroyPowerConfigId; + QnnDspPerfInfrastructure_SetPowerConfigFn_t setPowerConfig; + QnnDspPerfInfrastructure_SetMemoryConfigFn_t setMemoryConfig; + QnnDspPerfInfrastructure_SetThreadConfigFn_t setThreadConfig; +} QnnDspDevice_Infrastructure_t; + +#define QNN_DSP_DEVICE_INFRASTRUCTURE_INIT \ + { \ + NULL, /*createPowerConfigId*/ \ + NULL, /*destroyPowerConfigId*/ \ + NULL, /*setPowerConfig*/ \ + NULL, /*setMemoryConfig*/ \ + NULL /*setThreadConfig*/ \ + } + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h new file mode 100755 index 0000000000000..dd1c5220c8721 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h @@ -0,0 +1,171 @@ +//============================================================================= +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** + * @file + * @brief QNN DSP component Graph API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnGraph.h for DSP backend + */ + +#ifndef QNN_DSP_GRAPH_H +#define QNN_DSP_GRAPH_H + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief This enum provides different DSP graph optimization + * options that can be used to finalize the graph + * for optimum performance. + */ +typedef enum { + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD = 1, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES = 2, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG = 3, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC = 4, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN = 0x7fffffff +} QnnDspGraph_OptimizationType_t; + +// clang-format off + +/** + * @brief Struct describing the set of optimization types + * and the values associated with each optimization type. 
+ * + * Below is the Map between QnnDspGraph_OptimizationType_t and allowable values: + * + * \verbatim embed:rst:leading-asterisk + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | # | OptimizationType option | Allowable values | + * +====+============================================================+===========================================================+ + * | 1 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 2 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 3 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG | Defines the optimization strategy used by the HTP backend | + * | | | | + * | | | 1 = Faster preparation time, less optimal graph | + * | | | | + * | | | 2 = More optimal graph but may take longer to prepare | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 4 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnDspGraph_OptimizationType_t type; + float floatValue; +} QnnDspGraph_OptimizationOption_t; + +/// QnnDspGraph_OptimizationOption_t initializer macro +#define QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT \ + { \ + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/ \ + 0.0f /*floatValue*/ \ + } +// clang-format on + +/** + * @brief This enum provides different DSP graph configuration + * options associated with QnnGraph + */ +typedef enum { + QNN_DSP_GRAPH_CONFIG_OPTION_OPTIMIZATION = 1, + QNN_DSP_GRAPH_CONFIG_OPTION_ENCODING = 2, + QNN_DSP_GRAPH_CONFIG_OPTION_PRIORITY = 3, + QNN_DSP_GRAPH_CONFIG_OPTION_PRECISION = 4, + QNN_DSP_GRAPH_CONFIG_OPTION_UNKNOWN = 0x7fffffff +} QnnDspGraph_ConfigOption_t; + +typedef enum { + QNN_DSP_GRAPH_ENCODING_DYNAMIC = 1, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_DYNAMIC = QNN_DSP_GRAPH_ENCODING_DYNAMIC, + QNN_DSP_GRAPH_ENCODING_STATIC = 2, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_STATIC = QNN_DSP_GRAPH_ENCODING_STATIC, + QNN_DSP_GRAPH_ENCODING_UNKNOWN = 0x7fffffff, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_UNKNOW = QNN_DSP_GRAPH_ENCODING_UNKNOWN +} QnnDspGraph_Encoding_t; + +//============================================================================= +// Public Functions +//============================================================================= + +//------------------------------------------------------------------------------ +// Implementation Definition +//------------------------------------------------------------------------------ + +// clang-format off + +/** + * @brief Structure describing the set of configurations supported by graph. + * Objects of this type are to be referenced through QnnGraph_CustomConfig_t. + * + * The struct has two fields - option and a union of corresponding config values + * Based on the option corresponding item in the union can be used to specify + * config. 
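+ *
+ * For instance (an illustrative sketch, with variable names assumed), a
+ * client asking the backend for the more aggressive finalize optimization
+ * could populate:
+ * @code
+ * QnnDspGraph_OptimizationOption_t opt = QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT;
+ * opt.type       = QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+ * opt.floatValue = 2.0f;  // more optimal graph, longer prepare time
+ * QnnDspGraph_CustomConfig_t cfg = QNN_DSP_GRAPH_CUSTOM_CONFIG_INIT;
+ * cfg.option             = QNN_DSP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
+ * cfg.optimizationOption = opt;
+ * @endcode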
+ * + * Below is the Map between QnnDspGraph_ConfigOption_t and config value + * + * \verbatim embed:rst:leading-asterisk + * +----+------------------------------------------+------------------------------------+ + * | # | Config Option | Configuration Struct/value | + * +====+==========================================+====================================+ + * | 1 | QNN_DSP_GRAPH_CONFIG_OPTION_OPTIMIZATION | QnnDspGraph_OptimizationOption_t | + * +----+------------------------------------------+------------------------------------+ + * | 2 | QNN_DSP_GRAPH_CONFIG_OPTION_ENCODING | QnnDspGraph_Encoding_t | + * +----+------------------------------------------+------------------------------------+ + * | 3 | QNN_DSP_GRAPH_CONFIG_OPTION_PRECISION | Qnn_Precision_t | + * +----+------------------------------------------+------------------------------------+ + * | 4 | QNN_DSP_GRAPH_CONFIG_OPTION_PRIORITY | Qnn_Priority_t | + * +----+------------------------------------------+------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnDspGraph_ConfigOption_t option; + union { + QnnDspGraph_OptimizationOption_t optimizationOption; + QnnDspGraph_Encoding_t encoding; + Qnn_Priority_t priority; + Qnn_Precision_t precision; + }; +} QnnDspGraph_CustomConfig_t; + +// clang-format on +/// QnnDspGraph_CustomConfig_t initializer macro +#define QNN_DSP_GRAPH_CUSTOM_CONFIG_INIT \ + { \ + QNN_DSP_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/ \ + { \ + QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \ + } \ + } + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h new file mode 100755 index 0000000000000..c8760ecb6b798 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h @@ -0,0 +1,42 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef QNN_DSP_OP_PACKAGE_HPP +#define QNN_DSP_OP_PACKAGE_HPP + +#include "QnnOpPackage.h" +#include "QnnTypes.h" +#include "Udo/UdoImplDsp.h" + +/** + * @brief A struct which defines the Global infrastructure. + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + /// include the UdoMalloc, UdoFree and so on + Udo_DspGlobalInfrastructure_t* dspGlobalInfra; +} QnnDspOpPackage_GlobalInfrastructure_t; + +/** + * @brief A struct which defines the operation info. 
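+ *
+ *        A hypothetical sketch of one populated entry (the op name and the
+ *        MyCustomRelu_execute symbol are assumptions for illustration only):
+ *        @code
+ *        QnnDspOpPackage_OperationInfo_t info;
+ *        info.opType            = (char*)"MyCustomRelu";
+ *        info.numOfStaticParams = 0;
+ *        info.numOfInputs       = 1;
+ *        info.numOfOutputs      = 1;
+ *        info.executeOp         = MyCustomRelu_execute;
+ *        @endcode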
+ */ +typedef struct _QnnOpPackage_OperationInfo_t { + char* opType; + uint32_t numOfStaticParams; + uint32_t numOfInputs; + uint32_t numOfOutputs; + + Udo_CreateOpFactoryFunction_t createOpFactory; + Udo_CreateOperationFunction_t createOperation; + Udo_ExecuteOpFunction_t executeOp; + Udo_ReleaseOpFunction_t releaseOp; + Udo_ReleaseOpFactoryFunction_t releaseOpFactory; + Udo_ValidateOperationFunction_t validateOp; + Udo_QueryOperationFunction_t queryOp; +} QnnDspOpPackage_OperationInfo_t; + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h new file mode 100755 index 0000000000000..c9b1aa3020b9e --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h @@ -0,0 +1,448 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** @file + * @brief QNN DSP component Performance Infrastructure API + * + * Provides interface to the client to control performance and system + * settings of the QNN DSP Accelerator + */ + +#ifndef QNN_DSP_PERF_INFRASTRUCTURE_H +#define QNN_DSP_PERF_INFRASTRUCTURE_H + +#include "QnnCommon.h" +#include "QnnTypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// max rpc polling time allowed - 9999 us +#define QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME 9999 + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief QNN DSP PerfInfrastructure API result / error codes. + * + */ +typedef enum { + QNN_DSP_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE, + //////////////////////////////////////////////////////////////////////// + + QNN_DSP_PERF_INFRASTRUCTURE_NO_ERROR = QNN_SUCCESS, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 4, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_FAILED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 5, + + //////////////////////////////////////////////////////////////////////// + QNN_DSP_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE, + /// UNDEFINED value that must not be used by client + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNDEFINED = 0x7fffffff +} QnnDspPerfInfrastructure_Error_t; + +/** + * @brief Used to allow client start (non-zero value) or stop participating + * (zero value) in DCVS + * + */ +typedef uint32_t QnnDspPerfInfrastructure_DcvsEnable_t; + +/** + * @brief Allows client to set up the sleep latency in microseconds + * + */ +typedef uint32_t QnnDspPerfInfrastructure_SleepLatency_t; + +/** + * @brief Allows client to disable sleep or low power modes. 
+ * Pass a non-zero value to disable sleep in DSP + * + */ +typedef uint32_t QnnDspPerfInfrastructure_SleepDisable_t; + +/** + * @brief sets the minimum size by which user heap should grow + * when heap is exhausted. This API is expected to be + * called only once per backend and has a process wide impact + * + * Grow size provided in bytes and defaults to 16MB + */ +typedef uint32_t QnnDspPerfInfrastructure_MemGrowSize_t; + +/** + * @brief sets the vtcm size to use for graphs that + * are prepared offline. This API should be set up + * before users can finalize a graph offline. It allows + * the QNN DSP backend to configure the serialized + * context for the available vtcm on target + * + * VTCM size provided in MB and does not have a default + */ +typedef uint32_t QnnDspPerfInfrastructure_VtcmSize_t; + +/** + * @brief sets the number of HVX threads for QNN DSP + */ +typedef uint32_t QnnDspPerfInfrastructure_HvxThreadNumber_t; + +/** + * @brief These are the different voltage corners that can + * be requested by the client to influence the voting scheme + * for DCVS + * + */ +typedef enum { + /// Maps to HAP_DCVS_VCORNER_DISABLE. + /// Disable setting up voltage corner + DCVS_VOLTAGE_CORNER_DISABLE = 0x10, + /// Maps to HAP_DCVS_VCORNER_SVS2. + /// Set voltage corner to minimum value supported on platform + DCVS_VOLTAGE_VCORNER_MIN_VOLTAGE_CORNER = 0x20, + /// Maps to HAP_DCVS_VCORNER_SVS2. + /// Set voltage corner to SVS2 value for the platform + DCVS_VOLTAGE_VCORNER_SVS2 = 0x30, + /// Maps to HAP_DCVS_VCORNER_SVS. + /// Set voltage corner to SVS value for the platform + DCVS_VOLTAGE_VCORNER_SVS = 0x40, + /// Maps to HAP_DCVS_VCORNER_SVS_PLUS. + /// Set voltage corner to SVS_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_SVS_PLUS = 0x50, + /// Maps to HAP_DCVS_VCORNER_NOM. + /// Set voltage corner to NOMINAL value for the platform + DCVS_VOLTAGE_VCORNER_NOM = 0x60, + /// Maps to HAP_DCVS_VCORNER_NOM_PLUS. + /// Set voltage corner to NOMINAL_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_NOM_PLUS = 0x70, + /// Maps to HAP_DCVS_VCORNER_TURBO. + /// Set voltage corner to TURBO value for the platform + DCVS_VOLTAGE_VCORNER_TURBO = 0x80, + /// Maps to HAP_DCVS_VCORNER_TURBO_PLUS. + /// Set voltage corner to TURBO_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_TURBO_PLUS = 0x90, + /// Maps to HAP_DCVS_VCORNER_MAX. + /// Set voltage corner to maximum value supported on the platform + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER = 0xA0, + /// UNKNOWN value that must not be used by client + DCVS_VOLTAGE_VCORNER_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_VoltageCorner_t; + +/** + * @brief This enum defines all the possible power mode + * that a client can set to influence DCVS mode + */ +typedef enum { + /// Maps to HAP_DCVS_V2_ADJUST_UP_DOWN. + /// Allows for DCVS to adjust up and down + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN = 0x1, + /// Maps to HAP_DCVS_V2_ADJUST_ONLY_UP. + /// Allows for DCVS to adjust up only + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_ONLY_UP = 0x2, + /// Maps to HAP_DCVS_V2_POWER_SAVER_MODE. + /// Higher thresholds for power efficiency + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_MODE = 0x4, + /// Maps to HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE. + /// Higher thresholds for power efficiency with faster ramp down + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_AGGRESSIVE_MODE = 0x8, + /// Maps to HAP_DCVS_V2_PERFORMANCE_MODE. 
+  /// Lower thresholds for maximum performance
+  QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE = 0x10,
+  /// Maps to HAP_DCVS_V2_DUTY_CYCLE_MODE.
+  /// The value below applies only to HVX clients:
+  ///   - For streaming class clients:
+  ///     - detects periodicity based on HVX usage
+  ///     - lowers clocks in the no HVX activity region of each period.
+  ///   - For compute class clients:
+  ///     - lowers clocks when no HVX activity is detected and brings clocks
+  ///       back up when HVX activity is detected again.
+  ///     - Latency involved in bringing up the clock will be at max 1 to 2 ms.
+  QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_DUTY_CYCLE_MODE = 0x20,
+  /// UNKNOWN value that must not be used by client
+  QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnDspPerfInfrastructure_PowerMode_t;

+/**
+ * @brief This enum defines all the possible performance
+ *        options in Dsp Performance Infrastructure that
+ *        relate to setting up of power levels
+ */
+typedef enum {
+  /// config enum implies the usage of dcvsEnableConfig struct. For dcvs v2, if not provided, will
+  /// be set to false
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_ENABLE = 1,
+  /// config enum implies the usage of sleepLatencyConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_SLEEP_LATENCY = 2,
+  /// config enum implies the usage of sleepDisableConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_SLEEP_DISABLE = 3,
+  /// config enum implies the usage of dcvsPowerModeConfig struct. If not provided, power save mode
+  /// will be used
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_POWER_MODE = 4,
+  /// config enum implies the usage of dcvsVoltageCornerConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_VOLTAGE_CORNER = 5,
+  /// config enum implies the usage of busVoltageCornerConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_BUS_VOLTAGE_CORNER = 6,
+  /// config enum implies the usage of coreVoltageCornerConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_CORE_VOLTAGE_CORNER = 7,
+  /// config enum implies the usage of rpcControlLatencyConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY = 9,
+  /// config enum implies the usage of rpcPollingTimeConfig struct
+  /// this config is only supported on V69 and later
+  /// if enabled, this config is applied to the entire process
+  /// max allowed is QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME us
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME = 10,
+  /// config HMX timeout interval in us. The HMX is turned off after the set interval
+  /// time if there is no interaction with it after an inference is finished.
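+  /// A non-normative sketch (variable name assumed): keeping HMX powered for
+  /// one millisecond after each inference would be expressed as
+  /// @code
+  /// QnnDspPerfInfrastructure_PowerConfig_t hmx = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
+  /// hmx.config = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_TIMEOUT_INTERVAL_US;
+  /// hmx.hmxTimeoutIntervalUsConfig = 1000;  // microseconds
+  /// @endcode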
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_TIMEOUT_INTERVAL_US = 11, + /// UNKNOWN config option which must not be used + QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_PowerConfigOption_t; + +/** + * @brief Allows client to set up the RPC control latency in microseconds + * + */ +typedef uint32_t QnnDspPerfInfrastructure_RpcControlLatency_t; + +/** + * @brief Allows client to set up the RPC polling time in microseconds + */ +typedef uint32_t QnnDspPerfInfrastructure_RpcPollingTime_t; + +/** + * @brief Allows client to set up the HMX timeout interval in microseconds + */ +typedef uint32_t QnnDspPerfInfrastructure_HmxTimeoutIntervalUs_t; + +/** + * @brief This struct provides performance infrastructure configuration + * associated with setting up of power levels + */ +typedef struct { + QnnDspPerfInfrastructure_PowerConfigOption_t config; + union { + QnnDspPerfInfrastructure_DcvsEnable_t dcvsEnableConfig; + QnnDspPerfInfrastructure_SleepLatency_t sleepLatencyConfig; + QnnDspPerfInfrastructure_SleepDisable_t sleepDisableConfig; + QnnDspPerfInfrastructure_PowerMode_t dcvsPowerModeConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_RpcControlLatency_t rpcControlLatencyConfig; + QnnDspPerfInfrastructure_RpcPollingTime_t rpcPollingTimeConfig; + QnnDspPerfInfrastructure_HmxTimeoutIntervalUs_t hmxTimeoutIntervalUsConfig; + }; +} QnnDspPerfInfrastructure_PowerConfig_t; + +/// QnnDspPerfInfrastructure_PowerConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*dcvsEnableConfig*/ \ + } \ + } + +/** + * @brief This enum defines all the possible performance + * options in Dsp Performance Infrastructure that + * relate to system memory settings + */ +typedef enum { + /// sets memory grow size + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE = 1, + /// set the size of VTCM configuration (in MB) to use + /// This setting is applicable only for off target usage. 
+ /// For on-target usage, refer QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_USAGE_FACTOR + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_SIZE = 2, + /// set the vtcm usage factor on-target + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_USAGE_FACTOR = 3, + /// UNKNOWN config option that must not be used + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_MemoryConfigOption_t; + +/** + * @brief This enum defines all the possible performance + * options in Dsp Performance Infrastructure that + * relate to thread settings + */ +typedef enum { + /// sets number of HVX threads + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_NUMBER_OF_HVX_THREADS = 1, + /// UNKNOWN config option that must not be used + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_ThreadConfigOption_t; + +/** + * @brief This enum defines all the possible vtcm + * usage configuration. These settings apply only + * for on-target libraries + * + */ +typedef enum { + /// use all the vtcm available on target + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_FULL = 1, + /// use bare minimal vtcm available on target. This is + /// not supported in the current release. + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_MIN = 2, + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_VtcmUsageFactor_t; + +/** + * @brief Provides performance infrastructure configuration + * options that are memory specific + */ +typedef struct { + QnnDspPerfInfrastructure_MemoryConfigOption_t config; + union { + QnnDspPerfInfrastructure_MemGrowSize_t memGrowSizeConfig; + QnnDspPerfInfrastructure_VtcmSize_t vtcmSizeInMB; + QnnDspPerfInfrastructure_VtcmUsageFactor_t vtcmUsageConfig; + }; +} QnnDspPerfInfrastructure_MemoryConfig_t; + +/// QnnDspPerfInfrastructure_MemoryConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*memGrowSizeConfig*/ \ + } \ + } + +/** + * @brief Provides performance infrastructure configuration + * options that are thread specific + */ +typedef struct { + QnnDspPerfInfrastructure_ThreadConfigOption_t config; + union { + QnnDspPerfInfrastructure_HvxThreadNumber_t numHvxThreads; + }; +} QnnDspPerfInfrastructure_ThreadConfig_t; + +/// QnnDspPerfInfrastructure_ThreadConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*numHvxThreads*/ \ + } \ + } + +//============================================================================= +// API Methods +//============================================================================= + +/** + * @brief This API allows client to create power configuration id that + * has to be used to set different performance modes. + * Power configuration id has to be destroyed by client when not needed. + * + * @param[out] powerConfigId Pointer to power configuration id to be created. + * + * + * @return Error code + * \n QNN_SUCCESS: No error encountered + * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration + * id is NULL + */ +typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_CreatePowerConfigIdFn_t)( + uint32_t* powerConfigId); + +/** + * @brief This API allows client to destroy power configuration id. + * + * @param[in] powerConfigId A power configuration id to be destroyed. 
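+ *
+ * A non-normative end-to-end sketch (dspInfra is assumed to be a
+ * QnnDspDevice_Infrastructure_t pointer obtained through the device
+ * infrastructure query):
+ * @code
+ * uint32_t cfgId = 0;
+ * dspInfra->createPowerConfigId(&cfgId);
+ * QnnDspPerfInfrastructure_PowerConfig_t cfg = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
+ * cfg.config           = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_ENABLE;
+ * cfg.dcvsEnableConfig = 0;  // opt out of DCVS for stable clocks
+ * const QnnDspPerfInfrastructure_PowerConfig_t* cfgs[] = {&cfg, NULL};
+ * dspInfra->setPowerConfig(cfgId, cfgs);
+ * // ... run inferences ...
+ * dspInfra->destroyPowerConfigId(cfgId);
+ * @endcode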
+ *
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ *            id does not exist
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_DestroyPowerConfigIdFn_t)(
+    uint32_t powerConfigId);

+/**
+ * @brief This API allows client to set up system power configuration that
+ *        will enable different performance modes. This API uses the
+ *        HAP_power_dcvs_v3_payload struct to configure HAP power parameters.
+ *        For a detailed description of the HAP power parameters, refer to the
+ *        Hexagon SDK HAP_power_dcvs_v3_payload documentation.
+ *
+ * @param[in] powerConfigId A power client id to associate calls to system
+ *            power settings. A value of 0 implies a NULL power client id
+ *            and can override every other setting in the user process. To
+ *            enable power settings for multiple clients in the same
+ *            process, use a non-zero power client id.
+ *
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config option for performance configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ *            does not exist
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t powerConfigId, const QnnDspPerfInfrastructure_PowerConfig_t** config);

+/**
+ * @brief This API allows clients to set up configuration associated with
+ *        system memory
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config option for system memory configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetMemoryConfigFn_t)(
+    const QnnDspPerfInfrastructure_MemoryConfig_t** config);

+/**
+ * @brief This API allows clients to set up configuration for threads
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config option for thread configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @note This function should be called after QnnBackend_initialize and
+ *       before Context and Graph calls
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG if invalid
+ *            config or value passed
+ *         \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if config is NULL
+ *         \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT if unable to set the
+ *            settings in DSP
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetThreadConfigFn_t)(
+    const QnnDspPerfInfrastructure_ThreadConfig_t** config);

+#ifdef __cplusplus
+}  // extern "C"
+#endif

+#endif  // QNN_DSP_PERF_INFRASTRUCTURE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h
new file mode 100755
index 0000000000000..04c1897aa7e18
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h
@@ -0,0 +1,244 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================

+/**
+ *  @file
+ *  @brief QNN DSP Profile component API.
+ *
+ *         Requires DSP backend to be initialized.
+ *         Should be used with the QnnProfile API but has DSP backend
+ *         specific definition for different QnnProfile data structures
+ *
+ */

+#ifndef QNN_DSP_PROFILE_H
+#define QNN_DSP_PROFILE_H

+#include "QnnProfile.h"

+#ifdef __cplusplus
+extern "C" {
+#endif

+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary host rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_RPC_TIME_MICROSEC 1002

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the DSP processor
+ *        when client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary dsp rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_DSP_RPC_TIME_MICROSEC 1003

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to create the context on the
+ *        accelerator when client invokes QnnContext_createFromBinary.
+ *        The value returned is time in microseconds.
+ *
+ * @note context load binary accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize host rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_RPC_TIME_MICROSEC 2001

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the DSP processor
+ *        when client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize dsp rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_DSP_RPC_TIME_MICROSEC 2002

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to finalizing the graph on the accelerator
+ *        when client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
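+ *
+ *        These DSP-specific event types surface through the generic
+ *        QnnProfile getters; a hedged sketch (handle and variable names
+ *        assumed, API per QnnProfile.h):
+ *        @code
+ *        const QnnProfile_EventId_t* events = NULL;
+ *        uint32_t numEvents = 0;
+ *        QnnProfile_getEvents(profileHandle, &events, &numEvents);
+ *        QnnProfile_EventData_t data = QNN_PROFILE_EVENT_DATA_INIT;
+ *        QnnProfile_getEventData(events[0], &data);  // data.value: microseconds
+ *        @endcode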
+ *
+ * @note graph execute host rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_RPC_TIME_MICROSEC 3001

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the DSP processor
+ *        when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph execute dsp rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_DSP_RPC_TIME_MICROSEC 3002

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the number of processor cycles taken.
+ *
+ * @note graph execute accelerator time may be available only on
+ *       QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE.
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time taken in microseconds
+ *
+ * @note graph execute accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to time taken for miscellaneous work, i.e. time
+ *        that cannot be attributed to a node but is still needed to
+ *        execute the graph on the accelerator. This occurs when client invokes
+ *        QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time taken in microseconds
+ *
+ * @note graph execute misc accelerator time is available only on
+ *       QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to time taken for a graph yield instance to
+ *        release all its resources to the other graph.
+ *        The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RELEASE_TIME 3006

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to time a graph spends waiting for a higher
+ *        priority graph to finish execution.
+ *        The value returned is time taken in microseconds
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_WAIT_TIME 3007

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to time a graph spends re-acquiring resources
+ *        and restoring vtcm.
+ *        The value returned is time taken in microseconds
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RESTORE_TIME 3008

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the number of times that a yield occurred
+ *        during execution
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_COUNT 3009

+/**
+ * @brief QnnProfile_EventType_t definition for time a graph waits to get
+ *        VTCM. This should be constant UNLESS we need another graph to yield.
+ *        The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_VTCM_ACQUIRE_TIME 3010

+/**
+ * @brief QnnProfile_EventType_t definition for time a graph waits to get
+ *        HMX + HVX, and turn them all on.
+ *        The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_RESOURCE_POWER_UP_TIME 3011

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnContext_free, which in consequence
+ *        deinitializes the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit host rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_RPC_TIME_MICROSEC 4001

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the DSP processor
+ *        when client invokes QnnContext_free, which in consequence
+ *        deinitializes the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit dsp rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_DSP_RPC_TIME_MICROSEC 4002

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to deinitialize the graph on the
+ *        accelerator when client invokes QnnContext_free.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph deinit accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003

+#ifdef __cplusplus
+}
+#endif

+#endif // QNN_DSP_PROFILE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h
new file mode 100755
index 0000000000000..39669338e35f8
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h
@@ -0,0 +1,30 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================

+#ifndef QNN_DSP_PROPERTY_H
+#define QNN_DSP_PROPERTY_H

+#include "QnnProperty.h"

+#ifdef __cplusplus
+extern "C" {
+#endif

+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief Property key for determining whether a backend supports unsigned PD.
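+ *
+ *        A hedged usage sketch (per the capability API in QnnProperty.h):
+ *        @code
+ *        if (QNN_PROPERTY_SUPPORTED ==
+ *            QnnProperty_hasCapability(QNN_PROPERTY_CUSTOM_DSP_UNSIGNED_PD_SUPPORT)) {
+ *          // backend supports running in an unsigned PD
+ *        }
+ *        @endcode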
+ */ +#define QNN_PROPERTY_CUSTOM_DSP_UNSIGNED_PD_SUPPORT QNN_PROPERTY_GROUP_CUSTOM + 1 + +#ifdef __cplusplus +} +#endif + +#endif // QNN_DSP_PROPERTY_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h new file mode 100755 index 0000000000000..942e5997ab5ff --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h @@ -0,0 +1,509 @@ +//============================================================================== +// +// Copyright (c) 2019-2021 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef SNPE_UDO_BASE_H +#define SNPE_UDO_BASE_H + +#include + +// Provide values to use for API version. +#define API_VERSION_MAJOR 1 +#define API_VERSION_MINOR 6 +#define API_VERSION_TEENY 0 + +/** @addtogroup c_plus_plus_apis C++ +@{ */ + +// Defines a bitmask of enum values. +typedef uint32_t SnpeUdo_Bitmask_t; +typedef SnpeUdo_Bitmask_t Udo_Bitmask_t; + +// A string of characters, rather than an array of bytes. +// Assumed to be UTF-8. +typedef char* SnpeUdo_String_t; +typedef SnpeUdo_String_t Udo_String_t; + +// The maximum allowable length of a SnpeUdo_String_t in bytes, +// including null terminator. SNPE will truncate strings longer +// than this. +#define SNPE_UDO_MAX_STRING_SIZE 1024 + +/** + * An enum which holds the various error types. + * The error types are divided to classes : + * 0 - 99 : generic errors + * 100 - 200 : errors related to configuration + * + */ +typedef enum +{ + /// No Error + SNPE_UDO_NO_ERROR = 0, UDO_NO_ERROR = 0, + /// Unsupported value for core type + SNPE_UDO_WRONG_CORE = 1, UDO_WRONG_CORE = 1, + /// Invalid attribute/argument passed into UDO API + SNPE_UDO_INVALID_ARGUMENT = 2, UDO_INVALID_ARGUMENT = 2, + /// Unsupported feature error + SNPE_UDO_UNSUPPORTED_FEATURE = 3, UDO_UNSUPPORTED_FEATURE = 3, + /// Error relating to memory allocation + SNPE_UDO_MEM_ALLOC_ERROR = 4, UDO_MEM_ALLOC_ERROR = 4, + /* Configuration Specific errors */ + /// No op with given attributes available in library + SNPE_UDO_WRONG_OPERATION = 100, UDO_WRONG_OPERATION = 100, + /// Unsupported value for core type in UDO configuration + SNPE_UDO_WRONG_CORE_TYPE = 101, UDO_WRONG_CORE_TYPE = 101, + /// Wrong number of params in UDO definition + SNPE_UDO_WRONG_NUM_OF_PARAMS = 102, UDO_WRONG_NUM_OF_PARAMS = 102, + /// Wrong number of dimensions for tensor(s) in UDO definition + SNPE_UDO_WRONG_NUM_OF_DIMENSIONS = 103, UDO_WRONG_NUM_OF_DIMENSIONS = 103, + /// Wrong number of input tensors in UDO definition + SNPE_UDO_WRONG_NUM_OF_INPUTS = 104, UDO_WRONG_NUM_OF_INPUTS = 104, + /// Wrong number of output tensors in UDO definition + SNPE_UDO_WRONG_NUM_OF_OUTPUTS = 105, UDO_WRONG_NUM_OF_OUTPUTS = 105, + SNPE_UDO_PROGRAM_CACHE_NOT_FOUND = 106, UDO_PROGRAM_CACHE_NOT_FOUND = 106, + SNPE_UDO_UNKNOWN_ERROR = 0xFFFFFFFF, UDO_UNKNOWN_ERROR = 0xFFFFFFFF +} SnpeUdo_ErrorType_t; + +typedef SnpeUdo_ErrorType_t Udo_ErrorType_t; + +/** + * An enum which holds the various data types. + * Designed to be used as single values or combined into a bitfield parameter + * (0x1, 0x2, 0x4, etc) + * \n FIXED_XX types are targeted for data in tensors. 
+ * \n UINT / INT types are targeted for scalar params + */ +typedef enum +{ + /// data type: 16-bit floating point + SNPE_UDO_DATATYPE_FLOAT_16 = 0x01, UDO_DATATYPE_FLOAT_16 = 0x01, + /// data type: 32-bit floating point + SNPE_UDO_DATATYPE_FLOAT_32 = 0x02, UDO_DATATYPE_FLOAT_32 = 0x02, + /// data type: 4-bit fixed point + SNPE_UDO_DATATYPE_FIXED_4 = 0x04, UDO_DATATYPE_FIXED_4 = 0x04, + /// data type: 8-bit fixed point + SNPE_UDO_DATATYPE_FIXED_8 = 0x08, UDO_DATATYPE_FIXED_8 = 0x08, + /// data type: 16-bit fixed point + SNPE_UDO_DATATYPE_FIXED_16 = 0x10, UDO_DATATYPE_FIXED_16 = 0x10, + /// data type: 32-bit fixed point + SNPE_UDO_DATATYPE_FIXED_32 = 0x20, UDO_DATATYPE_FIXED_32 = 0x20, + /// data type: 8-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_8 = 0x100, UDO_DATATYPE_UINT_8 = 0x100, + /// data type: 16-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_16 = 0x200, UDO_DATATYPE_UINT_16 = 0x200, + /// data type: 32-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_32 = 0x400, UDO_DATATYPE_UINT_32 = 0x400, + /// data type: 8-bit signed integer + SNPE_UDO_DATATYPE_INT_8 = 0x1000, UDO_DATATYPE_INT_8 = 0x1000, + /// data type: 16-bit signed integer + SNPE_UDO_DATATYPE_INT_16 = 0x2000, UDO_DATATYPE_INT_16 = 0x2000, + /// data type: 32-bit signed integer + SNPE_UDO_DATATYPE_INT_32 = 0x4000, UDO_DATATYPE_INT_32 = 0x4000, + SNPE_UDO_DATATYPE_LAST = 0xFFFFFFFF, UDO_DATATYPE_LAST = 0xFFFFFFFF +} SnpeUdo_DataType_t; + +typedef SnpeUdo_DataType_t Udo_DataType_t; + +/** + * An enum which holds the various layouts. + * Designed to be used as single values or combined into a bitfield parameter + * (0x1, 0x2, 0x4, etc) + */ +typedef enum +{ + /// data layout (4D): NHWC (batch-height-width-channel) + SNPE_UDO_LAYOUT_NHWC = 0x01, UDO_LAYOUT_NHWC = 0x01, + /// data layout (4D): NCHW (batch-channel-height-width) + SNPE_UDO_LAYOUT_NCHW = 0x02, UDO_LAYOUT_NCHW = 0x02, + /// data layout (5D): NDHWC (batch-dimension-height-width-channel) + SNPE_UDO_LAYOUT_NDHWC = 0x04, UDO_LAYOUT_NDHWC = 0x04, + SNPE_UDO_LAYOUT_GPU_OPTIMAL1 = 0x08, UDO_LAYOUT_GPU_OPTIMAL1 = 0x08, + SNPE_UDO_LAYOUT_GPU_OPTIMAL2 = 0x10, UDO_LAYOUT_GPU_OPTIMAL2 = 0x10, + SNPE_UDO_LAYOUT_DSP_OPTIMAL1 = 0x11, UDO_LAYOUT_DSP_OPTIMAL1 = 0x11, + SNPE_UDO_LAYOUT_DSP_OPTIMAL2 = 0x12, UDO_LAYOUT_DSP_OPTIMAL2 = 0x12, + // Indicates no data will be allocated for this tensor. + // Used to specify optional inputs/outputs positionally. + SNPE_UDO_LAYOUT_NULL = 0x13, UDO_LAYOUT_NULL = 0x13, + SNPE_UDO_LAYOUT_LAST = 0xFFFFFFFF, UDO_LAYOUT_LAST = 0xFFFFFFFF +} SnpeUdo_TensorLayout_t; + +typedef SnpeUdo_TensorLayout_t Udo_TensorLayout_t; + +/** + * An enum which holds the UDO library Core type . 
+ * Designed to be used as single values or combined into a bitfield parameter + * (0x1, 0x2, 0x4, etc) + */ +typedef enum +{ + /// Library target IP Core is undefined + SNPE_UDO_CORETYPE_UNDEFINED = 0x00, UDO_CORETYPE_UNDEFINED = 0x00, + /// Library target IP Core is CPU + SNPE_UDO_CORETYPE_CPU = 0x01, UDO_CORETYPE_CPU = 0x01, + /// Library target IP Core is GPU + SNPE_UDO_CORETYPE_GPU = 0x02, UDO_CORETYPE_GPU = 0x02, + /// Library target IP Core is DSP + SNPE_UDO_CORETYPE_DSP = 0x04, UDO_CORETYPE_DSP = 0x04, + SNPE_UDO_CORETYPE_LAST = 0xFFFFFFFF, UDO_CORETYPE_LAST = 0xFFFFFFFF +} SnpeUdo_CoreType_t; + +typedef SnpeUdo_CoreType_t Udo_CoreType_t; + +/** + * An enum to specify the parameter type : Scalar or Tensor + */ +typedef enum +{ + /// UDO static param type: scalar + SNPE_UDO_PARAMTYPE_SCALAR = 0x00, UDO_PARAMTYPE_SCALAR = 0x00, + /// UDO static param type: string + SNPE_UDO_PARAMTYPE_STRING = 0x01, UDO_PARAMTYPE_STRING = 0x01, + /// UDO static param type: tensor + SNPE_UDO_PARAMTYPE_TENSOR = 0x02, UDO_PARAMTYPE_TENSOR = 0x02, + SNPE_UDO_PARAMTYPE_LAST = 0xFFFFFFFF, UDO_PARAMTYPE_LAST = 0xFFFFFFFF +} SnpeUdo_ParamType_t; + +typedef SnpeUdo_ParamType_t Udo_ParamType_t; + +/** + * An enum to specify quantization type + */ +typedef enum +{ + /// Tensor Quantization type: NONE. Signifies unquantized tensor data + SNPE_UDO_QUANTIZATION_NONE = 0x00, UDO_QUANTIZATION_NONE = 0x00, + /// Tensor Quantization type: Tensorflow-style + SNPE_UDO_QUANTIZATION_TF = 0x01, UDO_QUANTIZATION_TF = 0x01, + SNPE_UDO_QUANTIZATION_QMN = 0x02, UDO_QUANTIZATION_QMN = 0x02, + SNPE_UDO_QUANTIZATION_LAST = 0xFFFFFFFF, UDO_QUANTIZATION_LAST = 0xFFFFFFFF +} SnpeUdo_QuantizationType_t; + +typedef SnpeUdo_QuantizationType_t Udo_QuantizationType_t; + +/** + * @brief A struct which is used to provide a version number using 3 values : major, minor, teeny + * + */ +typedef struct +{ + /// version field: major - for backward-incompatible changes + uint32_t major; + /// version field: minor - for backward-compatible feature updates + uint32_t minor; + /// version field: teeny - for minor bug-fixes and clean-up + uint32_t teeny; +} SnpeUdo_Version_t; + +typedef SnpeUdo_Version_t Udo_Version_t; + +/** + * @brief A struct returned from version query, contains the Library version and API version + * + */ +typedef struct +{ + /// Version of UDO library. Controlled by users + SnpeUdo_Version_t libVersion; + /// Version of SNPE UDO API used in compiling library. Determined by SNPE + SnpeUdo_Version_t apiVersion; +} SnpeUdo_LibVersion_t; + +/** + * @brief A struct returned from version query, contains the package version + * + */ +typedef struct +{ + /// Version of UDO API used in package. + Udo_Version_t apiVersion; +} Udo_PkgVersion_t; + +/** + * @brief A union to hold the value of a generic type. Allows defining a parameter struct + * in a generic way, with a "value" location that holds the data regardless of the type. 
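+ *
+ * For example (a non-normative sketch), a float scalar parameter would be
+ * expressed through the structures below as:
+ * @code
+ * SnpeUdo_ScalarParam_t s;
+ * s.dataType             = SNPE_UDO_DATATYPE_FLOAT_32;
+ * s.dataValue.floatValue = 0.5f;
+ * @endcode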
+ * + */ +typedef union +{ + /// value type: float + float floatValue; + /// value type: unsigned 32-bit integer + uint32_t uint32Value; + /// value type: signed 32-bit integer + int32_t int32Value; + /// value type: unsigned 16-bit integer + uint16_t uint16Value; + /// value type: signed 16-bit integer + int16_t int16Value; + /// value type: unsigned 8-bit integer + uint8_t uint8Value; + /// value type: signed 8-bit integer + int8_t int8Value; +} SnpeUdo_Value_t; + +typedef SnpeUdo_Value_t Udo_Value_t; + +/** + * @brief A struct which defines a scalar parameter : name, data type, and union of values + * + */ +typedef struct +{ + /// The parameter data type : float, int, etc. + SnpeUdo_DataType_t dataType; + /// a union of specified type which holds the data + SnpeUdo_Value_t dataValue; +} SnpeUdo_ScalarParam_t; + +typedef SnpeUdo_ScalarParam_t Udo_ScalarParam_t; + +/** + * @brief A struct which defines the quantization parameters in case of Tensorflow style quantization + * + */ +typedef struct +{ + /// minimum value of the quantization range of data + float minValue; + /// maximum value of the quantization range of data + float maxValue; +} SnpeUdo_TFQuantize_t; + +typedef SnpeUdo_TFQuantize_t Udo_TFQuantize_t; + +/** + * @brief A struct which defines the quantization type, and union of supported quantization structs + * + */ +typedef struct +{ + /// quantization type (only TF-style currently supported) + SnpeUdo_QuantizationType_t quantizeType; + union + { + /// TF-style min-max quantization ranges + SnpeUdo_TFQuantize_t TFParams; + }; +} SnpeUdo_QuantizeParams_t; + +typedef SnpeUdo_QuantizeParams_t Udo_QuantizeParams_t; + +/** + * @brief A struct which defines the datatype associated with a specified core-type + * This should be used to denote the datatypes for a single tensor info, depending + * on the intended execution core. + * + */ +typedef struct +{ + /// The IP Core + SnpeUdo_CoreType_t coreType; + /// The associated datatype for this coreType + SnpeUdo_DataType_t dataType; +} SnpeUdo_PerCoreDatatype_t; + +typedef SnpeUdo_PerCoreDatatype_t Udo_PerCoreDatatype_t; + +/** + * @brief A struct which defines a tensor parameter : name, data type, layout, quantization, more. + * Also holds a pointer to the tensor data. + * + */ +typedef struct +{ + /// The maximum allowable dimensions of the tensor. The memory held in + /// _tensorData_ is guaranteed to be large enough for this. + uint32_t* maxDimensions; + /// The current dimensions of the tensor. An operation may modify the current + /// dimensions of its output, to indicate cases where the output has been + /// "resized". + /// Note that for static parameters, the current and max dimensions must + /// match. + uint32_t* currDimensions; + /// Quantization params applicable to the tensor. Currently only supports + /// Tensorflow quantization style. + SnpeUdo_QuantizeParams_t quantizeParams; + /// Number of dimensions to the tensor: 3D, 4D, etc. + uint32_t tensorRank; + /// The parameter data type: float, int, etc. + SnpeUdo_DataType_t dataType; + /// The tensor layout type: NCHW, NHWC, etc. + SnpeUdo_TensorLayout_t layout; + /// Opaque pointer to tensor data. User may be required to re-interpret the pointer + /// based on core-specific definitions. 
+ void* tensorData; +} SnpeUdo_TensorParam_t; + +typedef SnpeUdo_TensorParam_t Udo_TensorParam_t; + +/** + * @brief struct which defines a UDO parameter - a union of scalar, tensor and string parameters + * + */ +typedef struct +{ + /// Type is scalar or tensor + SnpeUdo_ParamType_t paramType; + /// The param name, for example : "offset", "activation_type" + SnpeUdo_String_t paramName; + union + { + /// scalar param value + SnpeUdo_ScalarParam_t scalarParam; + /// tensor param value + SnpeUdo_TensorParam_t tensorParam; + /// string param value + SnpeUdo_String_t stringParam; + }; +} SnpeUdo_Param_t; + +typedef SnpeUdo_Param_t Udo_Param_t; + +/** + * @brief A struct which defines Operation information which is specific for IP core (CPU, GPU, DSP ...) + * + */ +typedef struct +{ + /// The IP Core + SnpeUdo_CoreType_t udoCoreType; + /// Bitmask, defines supported internal calculation types (like FLOAT_32, etc) + /// Based on SnpeUdo_DataType + SnpeUdo_Bitmask_t operationCalculationTypes; +} SnpeUdo_OpCoreInfo_t; + +typedef SnpeUdo_OpCoreInfo_t Udo_OpCoreInfo_t; + +/** + * @brief A struct which defines the common and core-specific Operation information + * + */ +typedef struct +{ + /// Operation type + SnpeUdo_String_t operationType; + /// A bitmask describing which IP Cores (CPU, GPU, DSP ...) support this operation + /// Translated based on SnpeUdo_CoreType + SnpeUdo_Bitmask_t supportedByCores; + /// Number of static parameters defined by the op + uint32_t numOfStaticParams; + /// Array of static parameters. Can be scalar or tensor params + SnpeUdo_Param_t* staticParams; + /// Number of input tensors this op receives + uint32_t numOfInputs; + /// Array of input tensor names to this operation + SnpeUdo_String_t* inputNames; + /// Number of output tensors this op receives + uint32_t numOfOutputs; + /// Array of output tensor names to this operation + SnpeUdo_String_t* outputNames; + /// Number of cores that the op can execute on + uint32_t numOfCoreInfo; + /// Array of per-core information entries + SnpeUdo_OpCoreInfo_t* opPerCoreInfo; +} SnpeUdo_OperationInfo_t; + +typedef SnpeUdo_OperationInfo_t Udo_OperationInfo_t; + +/** + * @brief A struct which provides the implementation library info : type, name + * + */ +typedef struct +{ + /// Defines the IP Core that this implementation library is targeting + SnpeUdo_CoreType_t udoCoreType; + /// library name. will be looked at in the standard library path + SnpeUdo_String_t libraryName; +} SnpeUdo_LibraryInfo_t; + +typedef SnpeUdo_LibraryInfo_t Udo_LibraryInfo_t; + +/** + * @brief A struct returned by the registration library and contains information on the UDO package : + * name, operations, libraries, etc. + * + */ +typedef struct +{ + /// A string containing the package name + SnpeUdo_String_t packageName; + /// A bitmask describing supported IP cores (CPU, GPU, DSP ...) + /// Translated based on SnpeUdo_CoreType + SnpeUdo_Bitmask_t supportedCoreTypes; + /// The number of implementation libraries in the package + uint32_t numOfImplementationLib; + /// Array of implementation libraries names/types + SnpeUdo_LibraryInfo_t* implementationLib; + /// A string containing all operation types separated by space + SnpeUdo_String_t operationsString; + /// Number of supported operations + uint32_t numOfOperations; + /// Array of Operation info structs. 
Each entry describes one
+    /// Operation (name, params, inputs, outputs)
+    SnpeUdo_OperationInfo_t* operationsInfo;
+} SnpeUdo_RegInfo_t;
+
+typedef SnpeUdo_RegInfo_t Udo_RegInfo_t;
+
+/**
+* @brief A struct returned by the implementation library and contains information on the
+* specific library: name, IP Core, operations, etc.
+*
+*/
+typedef struct
+{
+    /// Defines the IP Core that this implementation library is targeting
+    SnpeUdo_CoreType_t udoCoreType;
+    /// A string containing the package name
+    SnpeUdo_String_t packageName;
+    /// A string containing all operation types separated by space
+    SnpeUdo_String_t operationsString;
+    /// Number of supported operations
+    uint32_t numOfOperations;
+} SnpeUdo_ImpInfo_t;
+
+typedef SnpeUdo_ImpInfo_t Udo_ImpInfo_t;
+
+/**
+ * @brief This struct defines an operation. It is used for validation
+ *        or creation of an operation.
+ *        In case of using it for creation, the static params which are tensors
+ *        contain pointers to the real data (weights, for example), and input/output
+ *        tensors also include pointers to the buffers used.
+ */
+typedef struct
+{
+    /// The IP Core that the operation is defined for - CPU, GPU, DSP...
+    SnpeUdo_CoreType_t udoCoreType;
+    /// Operation type
+    SnpeUdo_String_t operationType;
+    /// The number of static parameters provided in the staticParams array.
+    /// This number has to match the number provided by the UDO Registration library information
+    uint32_t numOfStaticParams;
+    /// Array of static parameters
+    SnpeUdo_Param_t* staticParams;
+    /// The number of input parameters provided in the inputs array.
+    /// This number has to match the number provided by the UDO Registration library information
+    uint32_t numOfInputs;
+    /// Array of input tensors, providing layout, data type, sizes, etc
+    /// When used to create an operation, also contains the initial location of the data
+    SnpeUdo_TensorParam_t* inputs;
+    /// The number of output parameters provided in the outputs array.
+    /// This number has to match the number provided by the UDO Registration library information
+    uint32_t numOfOutputs;
+    /// Array of output tensors, providing layout, data type, sizes, etc
+    /// When used to create an operation, also contains the initial location of the data
+    SnpeUdo_TensorParam_t* outputs;
+} SnpeUdo_OpDefinition_t;
+
+typedef SnpeUdo_OpDefinition_t Udo_OpDefinition_t;
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#endif //SNPE_UDO_BASE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h
new file mode 100755
index 0000000000000..84a8fe310908e
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h
@@ -0,0 +1,78 @@
+//==============================================================================
+//
+// Copyright (c) 2019 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#include "DSP/Udo/UdoBase.h"
+
+#define HVX_ALIGNMENT 128
+#define DSP_STRUCT_ALIGNMENT 8
+#define DSP_ALIGN(X, ALIGNMENT) (((X) + ALIGNMENT - 1) & (~((ALIGNMENT)-1)))
+
+typedef struct dspStaticParamsMeta {
+    uint32_t size;
+    uint32_t numParams;
+} dspStaticParamsMeta_t;
+
+typedef struct tensorParamInfo {
+    SnpeUdo_TensorLayout_t layout;
+    SnpeUdo_QuantizeParams_t quantizeInfo;
+    SnpeUdo_DataType_t dataType;
+    uint32_t paddingFor8byteAlignment;
+} tensorParamInfo_t;
+
+typedef struct udoString {
+    uint32_t sizeStruct; // aligned
+    uint32_t lengthString; // does not include null character
+    // followed by a string
+} udoString_t; // allocate mem for string for 8 byte alignment
+
+typedef struct dims {
+    uint32_t size;
+    uint32_t rank;
+    uint32_t ds; // rank # of max dimensions followed by rank # of current dimensions for tensors
+} dims_t;
+
+typedef struct tensorData {
+    uint32_t structSize;
+    uint32_t dataSize;
+    // followed by actual tensor data
+} tensorData_t;
+
+typedef struct dspStaticParamDescriptor {
+    uint32_t size; // including size of descriptor (including dims + data for tensors) (or including string for strings)
+    SnpeUdo_ParamType_t paramType;
+    union { // not used for string data
+        SnpeUdo_ScalarParam_t scalarInfo;
+        tensorParamInfo_t tensorInfo;
+    };
+    udoString_t name;
+    // followed by char*
+    // in case of tensor, followed by dim_stride and tensor_data
+    // in case of string, followed by udo_string and char*
+} dspStaticParamDescriptor_t;
+
+typedef struct paramSizes {
+    uint32_t descriptorSize;
+    uint32_t nameStructSize;
+    uint32_t dimsSize;
+    uint32_t dataStructSize;
+    uint32_t dataSize;
+    uint32_t stringDataStructSize;
+} paramSizes_t;
+
+typedef struct dspStaticParams {
+    dspStaticParamsMeta_t meta;
+    dspStaticParamDescriptor_t paramDesc;
+} dspStaticParams_t;
+
+
+int
+SnpeUdo_flattenStaticParams (SnpeUdo_Param_t** paramList, uint32_t numParams, uint32_t* flattenedSize, void** flattened);
+
+void
+SnpeUdo_freeFlattenedStaticParams (void** flattened);
+
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h
new file mode 100755
index 0000000000000..bcc767a3c4a0f
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h
@@ -0,0 +1,343 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef SNPE_UDO_IMPL_H
+#define SNPE_UDO_IMPL_H
+
+#include <stdbool.h>
+
+#include "DSP/Udo/UdoShared.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+typedef struct _SnpeUdo_OpFactory_t* SnpeUdo_OpFactory_t;
+typedef struct _SnpeUdo_Operation_t* SnpeUdo_Operation_t;
+
+typedef SnpeUdo_OpFactory_t Udo_OpFactory_t;
+typedef SnpeUdo_Operation_t Udo_Operation_t;
+
+/**
+ * @brief Initialize the shared library's data structures. Calling any other
+ *        library function before this one will result in error.
+ *
+ * @param[in] globalInfrastructure Global core-specific infrastructure to be
+ *        used by operations created in this library. The definition and
+ *        semantics of this object will be defined in the corresponding
+ *        implementation header for the core type.
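+ *
+ * Illustrative call sequence (editor's sketch, not part of the original
+ * header); `infra` stands for the core-specific pointer handed over by the
+ * framework:
+ * @code
+ *   SnpeUdo_ErrorType_t err = SnpeUdo_initImplLibrary(infra);
+ * @endcode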
+ * @return Error code + */ +SnpeUdo_ErrorType_t +SnpeUdo_initImplLibrary(void* globalInfrastructure); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_InitImplLibraryFunction_t)(void*); + +/** + * @brief A function to query the API version of the UDO implementation library. + * The function populates a SnpeUdo_LibVersion_t struct, which contains a SnpeUdo_Version_t + * struct for API version and library version. + * + * @param[in, out] version A pointer to struct which contains major, minor, teeny information for + * library and api versions. + * + * @return Error code + */ +SnpeUdo_ErrorType_t +SnpeUdo_getImplVersion(SnpeUdo_LibVersion_t** version); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_getImplVersion_t)(SnpeUdo_LibVersion_t** version); + +/** + * @brief Release the shared library's data structures, and invalidate any + * handles returned by the library. The behavior of any outstanding + * asynchronous calls made to this library when this function is called + * are undefined. All library functions (except SnpeUdo_initImplLibrary) will + * return an error after this function has been successfully called. + * + * It should be possible to call SnpeUdo_initImplLibrary after calling this + * function, and re-initialize the library. + * + * @return Error code + */ +SnpeUdo_ErrorType_t +SnpeUdo_terminateImplLibrary(void); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_TerminateImplLibraryFunction_t)(void); + + +/** + * @brief A function to query info on the UDO implementation library. + * The function populates a structure which contains information about + * operations that are part of this library + * + * @param[in, out] implementationInfo A pointer to struct which contains information + * on the operations + * + * @return error code + * + */ +SnpeUdo_ErrorType_t +SnpeUdo_getImpInfo(SnpeUdo_ImpInfo_t** implementationInfo); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_GetImpInfoFunction_t)(SnpeUdo_ImpInfo_t** implementationInfo); + +typedef SnpeUdo_GetImpInfoFunction_t Udo_GetImpInfoFunction_t; + +/** + * @brief A function to create an operation factory. + * The function receives the operation type, and an array of static parameters, + * and returns operation factory handler + * + * @param[in] udoCoreType The Core type to create the operation on. An error will + * be returned if this does not match the core type of the library. + * + * @param[in] perFactoryInfrastructure CreateOpFactory infrastructure appropriate to this + * core type. The definition and semantics of this object will be defined + * in the corresponding implementation header for the core type. + * + * @param[in] operationType A string containing Operation type. for example "MY_CONV" + * + * @param[in] numOfStaticParams The number of static parameters. + * + * @param[in] staticParams Array of static parameters + * + * @param[in,out] opFactory Handler to Operation Factory, to be used when creating operations + * + * @return Error Code + */ +SnpeUdo_ErrorType_t +SnpeUdo_createOpFactory(SnpeUdo_CoreType_t udoCoreType, + void* perFactoryInfrastructure, + SnpeUdo_String_t operationType, + uint32_t numOfStaticParams, + SnpeUdo_Param_t* staticParams, + SnpeUdo_OpFactory_t* opFactory); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_CreateOpFactoryFunction_t)(SnpeUdo_CoreType_t, + void*, + SnpeUdo_String_t, + uint32_t, + SnpeUdo_Param_t*, + SnpeUdo_OpFactory_t*); + +typedef SnpeUdo_CreateOpFactoryFunction_t Udo_CreateOpFactoryFunction_t; + +/** + * @brief A function to release the resources allocated for an operation factory + * created by this library. 
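+ *
+ * Typical create/release pairing (editor's illustrative sketch, not part of
+ * the original header; error handling elided, and `infra`, `nParams` and
+ * `params` are hypothetical):
+ * @code
+ *   SnpeUdo_OpFactory_t factory = NULL;
+ *   SnpeUdo_createOpFactory(SNPE_UDO_CORETYPE_DSP, infra, "MY_CONV",
+ *                           nParams, params, &factory);
+ *   // ... create and execute operations ...
+ *   SnpeUdo_releaseOpFactory(factory);
+ * @endcode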
+ * + * @param[in] opFactory The operation factory to release. Upon success this handle will be invalidated. + * + * @return Error Code + */ +SnpeUdo_ErrorType_t +SnpeUdo_releaseOpFactory(SnpeUdo_OpFactory_t opFactory); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_ReleaseOpFactoryFunction_t)(SnpeUdo_OpFactory_t); + +typedef SnpeUdo_ReleaseOpFactoryFunction_t Udo_ReleaseOpFactoryFunction_t; + +/** + * @brief A function to create an operation from the factory. + * The function receives array of inputs and array of outputs, and creates an operation + * instance, returning the operation instance handler. + * + * @param[in] opFactory OpFactory instance containing the parameters for this operation. + * + * @param[in] perOpInfrastructure Per-Op infrastructure for this operation. The definition + * and semantics of this object will be defined in the implementation header + * appropriate to this core type. + * + * @param[in] numOfInputs The number of input tensors this operation will receive. + * + * @param[in] inputs Array of input tensors, providing both the sizes and initial + * location of the data. + * + * @param[in] numOfOutputs Number of output tensors this operation will produce. + * + * @param[in] outputs Array of output tensors, providing both the sizes and + * initial location of the data. + * + * @param[in,out] operation Handle for newly created operation instance. + * + * @return Error Code + */ +SnpeUdo_ErrorType_t +SnpeUdo_createOperation(SnpeUdo_OpFactory_t opFactory, + void* perOpInfrastructure, + uint32_t numOfInputs, + SnpeUdo_TensorParam_t* inputs, + uint32_t numOfOutputs, + SnpeUdo_TensorParam_t* outputs, + SnpeUdo_Operation_t* operation); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_CreateOperationFunction_t)(SnpeUdo_OpFactory_t, + void*, + uint32_t, + SnpeUdo_TensorParam_t*, + uint32_t, + SnpeUdo_TensorParam_t*, + SnpeUdo_Operation_t*); + +typedef SnpeUdo_CreateOperationFunction_t Udo_CreateOperationFunction_t; + +/** + * @brief A pointer to notification function. + * + * The notification function supports the non-blocking (e.g. asynchronous) execution use-case. + * In case an "executeUdoOp" function is called with "blocking" set to zero, and a + * notify function, this function will be called by the implementation library at the + * end of execution. The implementation library will pass the notify function the ID + * that was provided to it when "executeUdoOp" was called. + * + * @param[in] ID 32-bit value, that was provided to executeUdoOp by the calling entity. + * Can be used to track the notifications, in case of multiple execute calls issued. + * + * @return Error code + * + */ +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_ExternalNotify_t)(const uint32_t ID); + +typedef SnpeUdo_ExternalNotify_t Udo_ExternalNotify_t; + +/** + * @brief Operation execution function. + * + * Calling this function will run the operation on set of inputs, generating a set of outputs. + * The call can be blocking (synchronous) or non-blocking (asynchronous). To support the + * non-blocking mode, the calling entity can pass an ID and a notification function. + * At the end of the execution this notification function would be called, passing it the ID. + * NOTE: Asynchronous execution mode not supported in this release. + * + * @param[in] operation handle to the operation on which execute is invoked + * @param[in] blocking flag to indicate execution mode. + * If set, execution is blocking, + * e.g SnpeUdo_executeOp call does not return until execution is done. 
+ * If not set, SnpeUdo_executeOp returns immediately, and the
+ * library will call the notification function (if set) when execution is done.
+ *
+ * @param[in] ID 32-bit number that can be used by the calling entity to track execution
+ *        in case of non-blocking execution.
+ *        For example, it can be a sequence number, increased by one on each call.
+ *
+ * @param[in] notifyFunc Pointer to notification function. If the pointer is set, and execution is
+ *        non-blocking, the library will call this function at end of execution,
+ *        passing the number provided as ID
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_executeOp(SnpeUdo_Operation_t operation,
+                  bool blocking,
+                  const uint32_t ID,
+                  SnpeUdo_ExternalNotify_t notifyFunc);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ExecuteOpFunction_t)(SnpeUdo_Operation_t,
+                               bool,
+                               const uint32_t,
+                               SnpeUdo_ExternalNotify_t);
+
+typedef SnpeUdo_ExecuteOpFunction_t Udo_ExecuteOpFunction_t;
+
+/**
+ * @brief A function to set the inputs & outputs. Part of the SnpeUdo_Operation struct,
+ *        returned from creation of a new operation instance.
+ *        Not supported in this release.
+ *
+ * This function allows the calling entity to change some of the inputs and outputs
+ * between calls to execute.
+ * Note that the change is limited to changing the pointer to the tensor data only.
+ * Any other change may be rejected by the implementation library, causing
+ * immediate invalidation of the operation instance.
+ *
+ * @param[in] operation Operation on which IO tensors are set
+ *
+ * @param[in] inputs array of tensor parameters. The calling entity may provide a subset of the
+ *        operation inputs, providing only those that it wants to change.
+ *
+ * @param[in] outputs array of tensor parameters. The calling entity may provide a subset of the
+ *        operation outputs, providing only those that it wants to change.
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_setOpIO(SnpeUdo_Operation_t operation,
+                SnpeUdo_TensorParam_t* inputs,
+                SnpeUdo_TensorParam_t* outputs);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_SetOpIOFunction_t)(SnpeUdo_Operation_t,
+                             SnpeUdo_TensorParam_t*,
+                             SnpeUdo_TensorParam_t*);
+
+typedef SnpeUdo_SetOpIOFunction_t Udo_SetOpIOFunction_t;
+
+/**
+ * @brief A function to return execution times.
+ *
+ * This function can be called to query the operation execution times on the IP core
+ * on which the operation is run. The time is provided in micro-seconds
+ *
+ * @param[in] operation Handle to operation whose execution time is being profiled
+ *
+ * @param[in,out] executionTime pointer to a uint32 value. This function writes the operation
+ *        execution time in usec into this value.
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_profileOp(SnpeUdo_Operation_t operation, uint32_t *executionTime);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ProfileOpFunction_t)(SnpeUdo_Operation_t, uint32_t*);
+
+typedef SnpeUdo_ProfileOpFunction_t Udo_ProfileOpFunction_t;
+
+/**
+ * @brief A function to release the operation instance
+ * \n When it is called, the implementation library needs to release all resources
+ * allocated for this operation instance.
+ * \n Note that all function pointers which are part of SnpeUdo_Operation become
+ * invalid once releaseUdoOp call returns.
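+ *
+ * Teardown sketch (editor's illustration, not part of the original header;
+ * `op` is a hypothetical operation handle):
+ * @code
+ *   SnpeUdo_executeOp(op, true, 0, NULL); // blocking execute
+ *   SnpeUdo_releaseOp(op);                // op is invalid from here on
+ * @endcode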
+ *
+ * @param[in] operation Handle to operation to be released
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_releaseOp(SnpeUdo_Operation_t operation);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ReleaseOpFunction_t)(SnpeUdo_Operation_t);
+
+typedef SnpeUdo_ReleaseOpFunction_t Udo_ReleaseOpFunction_t;
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif //SNPE_UDO_IMPL_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h
new file mode 100755
index 0000000000000..522c6050a402d
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h
@@ -0,0 +1,199 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+// Header to be used by a DSP Hexnn UDO Implementation library
+
+#ifndef SNPE_UDO_IMPL_DSP_H
+#define SNPE_UDO_IMPL_DSP_H
+#include <stdio.h>
+#include "DSP/Udo/UdoImpl.h"
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+/**
+ * @brief A function to validate that a set of params is supported by an operation
+ *        This function is HexNN specific, use case is when registration library is not in use.
+ *        Optional function.
+ *
+ * @param[in] operationType Operation type
+ * @param[in] numOfStaticParams Number of static params defined by the op
+ * @param[in] staticParams Array of static params to the op
+ * @return Error code, indicating if the operation can be created on this set of configuration or not.
+ *
+ */
+
+SnpeUdo_ErrorType_t
+SnpeUdo_validateOperation (SnpeUdo_String_t operationType,
+                           uint32_t numOfStaticParams,
+                           const SnpeUdo_Param_t* staticParams);
+
+typedef SnpeUdo_ErrorType_t (*SnpeUdo_ValidateOperationFunction_t) (SnpeUdo_String_t,
+                                                                    uint32_t,
+                                                                    const SnpeUdo_Param_t*);
+
+typedef SnpeUdo_ValidateOperationFunction_t Udo_ValidateOperationFunction_t;
+
+// enum used for indicating input/output tensor data layouts on DSP, plain vs d32
+typedef enum {
+    SNPE_UDO_DSP_TENSOR_LAYOUT_PLAIN = 0x00, UDO_DSP_TENSOR_LAYOUT_PLAIN = 0x00,
+    SNPE_UDO_DSP_TENSOR_LAYOUT_D32 = 0x01, UDO_DSP_TENSOR_LAYOUT_D32 = 0x01
+} SnpeUdo_HexNNTensorLayout_t;
+
+typedef SnpeUdo_HexNNTensorLayout_t Udo_HexNNTensorLayout_t;
+
+/**
+ * @brief A function to query numbers of inputs and outputs,
+ *        quantization type of each input and each output as arrays,
+ *        and data layout (plain vs d32) of each input and each output as arrays
+ *        of an operation.
+ *        inputsQuantTypes and inputsLayouts should point to arrays of size numOfInputs
+ *        outputsQuantTypes and outputsLayouts should point to arrays of size numOfOutputs
+ *
+ *        Note: inputsLayouts and outputsLayouts can point to NULL, in this case, it is
+ *        assumed all inputs and/or outputs have plain data layouts, i.e.
no D32 + * + * @param[in] operationType Operation type + * @param[in] numOfStaticParams Number of static params defined by the op + * @param[in] staticParams Array of static params to the op + * @param[in,out] numOfInputs Number of input tensors to the op + * @param[in,out] inputsQuantTypes Array of Quantization info for each input tensor + * @param[in,out] inputsLayouts Array of layout type for each input tensor + * @param[in,out] numOfOutputs Number of output tensors to the op + * @param[in,out] outputsQuantTypes Array of Quantization info for each output tensor + * @param[in,out] outputsLayouts Array of layout type for each output tensor + * @return error code, indicating status of query + */ + +SnpeUdo_ErrorType_t +SnpeUdo_queryOperation (SnpeUdo_String_t operationType, + uint32_t numOfStaticParams, + const SnpeUdo_Param_t* staticParams, + uint32_t* numOfInputs, + SnpeUdo_QuantizationType_t** inputsQuantTypes, + SnpeUdo_HexNNTensorLayout_t** inputsLayouts, + uint32_t* numOfOutputs, + SnpeUdo_QuantizationType_t** outputsQuantTypes, + SnpeUdo_HexNNTensorLayout_t** outputsLayouts); + +typedef SnpeUdo_ErrorType_t (*SnpeUdo_QueryOperationFunction_t) (SnpeUdo_String_t, + uint32_t, + const SnpeUdo_Param_t*, + uint32_t*, + SnpeUdo_QuantizationType_t**, + SnpeUdo_HexNNTensorLayout_t**, + uint32_t*, + SnpeUdo_QuantizationType_t**, + SnpeUdo_HexNNTensorLayout_t**); + +typedef SnpeUdo_QueryOperationFunction_t Udo_QueryOperationFunction_t; + +// Global infrastructure functions supported by Hexagon-NN v2 +typedef void (*workerThread_t) (void* perOpInfrastructure, void* userData); +typedef int (*udoSetOutputTensorSize_t) (void* perOpInfrastructure, uint32_t outIdx, uint32_t size); +typedef int (*udoGetInputD32Paddings_t) (void* perOpInfrastructure, uint32_t inIdx, + uint32_t* heightPadBefore, uint32_t* heightPadAfter, + uint32_t* widthPadBefore, uint32_t* widthPadAfter, + uint32_t* depthPadBefore, uint32_t* depthPadAfter); +typedef int (*udoSetOutputD32ShapeSizePaddings_t) (void* perOpInfrastructure, uint32_t outIdx, + uint32_t batch, + uint32_t height, uint32_t heightPadBefore, uint32_t heightPadAfter, + uint32_t width, uint32_t widthPadBefore, uint32_t widthPadAfter, + uint32_t depth, uint32_t depthPadBefore, uint32_t depthPadAfter, + SnpeUdo_DataType_t dataType); +typedef void* (*udoMemalign_t) (size_t n, size_t size); +typedef void* (*udoMalloc_t) (size_t size); +typedef void* (*udoCalloc_t) (size_t n, size_t size); +typedef void (*udoFree_t) (void* ptr); +typedef uint32_t (*udoGetVtcmSize_t) (void* perOpInfrastructure); +typedef void* (*udoGetVtcmPtr_t) (void* perOpInfrastructure); +typedef uint32_t (*udoVtcmIsReal_t) (void* perOpInfrastructure); +typedef void (*udoRunWorkerThreads_t) (void* perOpInfrastructure, uint32_t nThreads, workerThread_t w, void* userData); + +typedef struct hexNNv2GlobalInfra { + udoSetOutputTensorSize_t udoSetOutputTensorSize; + udoGetInputD32Paddings_t udoGetInputD32Paddings; + udoSetOutputD32ShapeSizePaddings_t udoSetOutputD32ShapeSizePaddings; + udoMemalign_t udoMemalign; + udoMalloc_t udoMalloc; + udoCalloc_t udoCalloc; + udoFree_t udoFree; + udoGetVtcmSize_t udoGetVtcmSize; + udoGetVtcmPtr_t udoGetVtcmPtr; + udoVtcmIsReal_t udoVtcmIsReal; + udoRunWorkerThreads_t udoRunWorkerThreads; +} SnpeUdo_HexNNv2GlobalInfra_t; + +typedef SnpeUdo_HexNNv2GlobalInfra_t Udo_HexNNv2GlobalInfra_t; + +// hexnn types +typedef enum hexnnInfraType { + UDO_INFRA_HEXNN_V2, + UDO_INFRA_HEXNN_V3 // reserved, do not use +} SnpeUdo_HexNNInfraType_t; + +typedef SnpeUdo_HexNNInfraType_t 
Udo_HexNNInfraType_t;
+
+typedef struct {
+    Udo_CreateOpFactoryFunction_t create_op_factory;
+    Udo_CreateOperationFunction_t create_operation;
+    Udo_ExecuteOpFunction_t execute_op;
+    Udo_ReleaseOpFunction_t release_op;
+    Udo_ReleaseOpFactoryFunction_t release_op_factory;
+    Udo_ValidateOperationFunction_t validate_op;
+    Udo_QueryOperationFunction_t query_op;
+} udo_func_package_t;
+
+/**
+ * @brief Infrastructures needed by a developer of DSP Hexnn UDO Implementation library.
+ *
+ * The framework/runtime which loads the Hexnn UDO implementation library provides
+ * this infrastructure to the loaded library by calling "SnpeUdo_initImplLibrary"
+ * function, and passing it (cast to void*). The Hexnn UDO library is expected
+ * to cast it back to this structure.
+ *
+ */
+typedef struct dspGlobalInfrastructure {
+    SnpeUdo_Version_t dspInfraVersion; // api version
+    SnpeUdo_HexNNInfraType_t infraType;
+    SnpeUdo_HexNNv2GlobalInfra_t hexNNv2Infra;
+} SnpeUdo_DspGlobalInfrastructure_t;
+
+typedef SnpeUdo_DspGlobalInfrastructure_t Udo_DspGlobalInfrastructure_t;
+
+/**
+ * hexnn v2 per op factory infrastructure
+ *
+ * The framework/runtime passes per op factory infrastructure as a void pointer
+ * to HexNN UDO implementation library by calling function "SnpeUdo_createOpFactory".
+ * UDO implementation library is expected to cast it back to this following struct.
+ *
+ */
+typedef struct hexnnv2OpFactoryInfra {
+    unsigned long graphId;
+} SnpeUdo_HexNNv2OpFactoryInfra_t;
+
+typedef SnpeUdo_HexNNv2OpFactoryInfra_t Udo_HexNNv2OpFactoryInfra_t;
+
+/**
+ * hexnn v2 per operation infrastructure
+ *
+ * The framework/runtime passes per operation infrastructure as a void pointer
+ * to HexNN UDO implementation library by calling function "SnpeUdo_createOperation".
+ * UDO implementation library is expected to cast it to the following type and save it.
+ *
+ * This is needed to be passed back into some functions from global infrastructure.
+ *
+ */
+typedef void* SnpeUdo_HexNNv2OpInfra_t;
+
+typedef SnpeUdo_HexNNv2OpInfra_t Udo_HexNNv2OpInfra_t;
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#endif // SNPE_UDO_IMPL_DSP_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h
new file mode 100755
index 0000000000000..8c17c1d5b35f1
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h
@@ -0,0 +1,48 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef SNPE_UDO_SHARED_H
+#define SNPE_UDO_SHARED_H
+
+#include "DSP/Udo/UdoBase.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+/**
+ * @brief A function to return the various versions as they relate to the UDO
+ *        The function returns a struct containing the following:
+ *        libVersion: the version of the implementation library compiled for the UDO. Set by user
+ *        apiVersion: the version of the UDO API used in compiling the implementation library.
+ *        Set by SNPE
+ *
+ * @param[in, out] version A pointer to Version struct of type SnpeUdo_LibVersion_t
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_getVersion (SnpeUdo_LibVersion_t** version);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_GetVersionFunction_t) (SnpeUdo_LibVersion_t** version);
+
+typedef SnpeUdo_GetVersionFunction_t Udo_GetVersionFunction_t;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#endif // SNPE_UDO_SHARED_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h
new file mode 100755
index 0000000000000..d7050c875f6db
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h
@@ -0,0 +1,71 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines the QNN GPU specialization of the QnnBackend.h interface.
+ */
+
+#ifndef QNN_GPU_BACKEND_H
+#define QNN_GPU_BACKEND_H
+
+#ifdef __cplusplus
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif
+
+#include "QnnBackend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+* @brief This enum defines QNN GPU custom Backend config options.
+*/
+typedef enum {
+  /// If non-zero, tuning mode will be enabled
+  QNN_GPU_BACKEND_CONFIG_OPTION_ENABLE_TUNING_MODE = 0,
+  /// The Performance cache directory. Must be non-null
+  QNN_GPU_BACKEND_CONFIG_OPTION_PERFORMANCE_CACHE_DIR = 1,
+  /// If non-zero, the performance cache will be ignored when initializing
+  QNN_GPU_BACKEND_CONFIG_OPTION_INVALIDATE_PERFORMANCE_CACHE = 2,
+  /// Unused, present to ensure 32 bits.
+  QNN_GPU_BACKEND_CONFIG_OPTION_UNDEFINED = 0x7FFFFFFF,
+} QnnGpuBackend_ConfigOption_t;
+
+/**
+ * @brief A struct which defines the QNN GPU Backend custom configuration options.
+ *        Objects of this type are to be referenced through QnnBackend_CustomConfig_t.
+ */
+typedef struct {
+  QnnGpuBackend_ConfigOption_t option;
+  union UNNAMED {
+    uint8_t enableTuningMode;
+    const char* performanceCacheDir;
+    uint8_t invalidatePerformanceCache;
+  };
+} QnnGpuBackend_CustomConfig_t;
+
+// clang-format off
+/// QnnGpuBackend_CustomConfig_t initializer macro
+#define QNN_GPU_BACKEND_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_GPU_BACKEND_CONFIG_OPTION_UNDEFINED, /*option*/ \
+    { \
+      false /*enableTuningMode*/ \
+    } \
+  }
+// clang-format on
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h
new file mode 100755
index 0000000000000..8fd9c18afb46b
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h
@@ -0,0 +1,49 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines common QNN GPU macros.
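+ *
+ * Editor's illustrative sketch (not part of the original header): the
+ * version-init macro defined below can seed a Qnn_ApiVersion_t directly.
+ * @code
+ *   Qnn_ApiVersion_t version = QNN_GPU_API_VERSION_INIT;
+ * @endcode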
+ */
+
+#ifndef QNN_GPU_COMMON_H
+#define QNN_GPU_COMMON_H
+
+#include "QnnCommon.h"
+
+/// GPU Backend identifier
+#define QNN_BACKEND_ID_GPU 4
+
+/// GPU interface provider
+#define QNN_GPU_INTERFACE_PROVIDER_NAME "GPU_QTI_AISW"
+
+// GPU API Version values
+#define QNN_GPU_API_VERSION_MAJOR 3
+#define QNN_GPU_API_VERSION_MINOR 7
+#define QNN_GPU_API_VERSION_PATCH 0
+
+// clang-format off
+
+/// Macro to set Qnn_ApiVersion_t for GPU backend
+#define QNN_GPU_API_VERSION_INIT \
+  { \
+    { \
+      QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \
+      QNN_API_VERSION_MINOR, /*coreApiVersion.minor*/ \
+      QNN_API_VERSION_PATCH /*coreApiVersion.patch*/ \
+    }, \
+    { \
+      QNN_GPU_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_GPU_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_GPU_API_VERSION_PATCH /*backendApiVersion.patch*/ \
+    } \
+  }
+
+// clang-format on
+
+#endif // QNN_GPU_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h
new file mode 100755
index 0000000000000..42599e4280971
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h
@@ -0,0 +1,78 @@
+//==============================================================================
+//
+// Copyright (c) 2021-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines the QNN GPU specialization of the QnnContext.h interface.
+ */
+
+#ifndef QNN_GPU_CONTEXT_H
+#define QNN_GPU_CONTEXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief This enum defines QNN GPU custom context config options.
+ */
+typedef enum {
+  /// Sets performance hint options via QnnGpuContext_PerfHint_t
+  QNN_GPU_CONTEXT_CONFIG_OPTION_PERF_HINT = 0,
+  /// If non-zero, OpenGL buffers will be used
+  QNN_GPU_CONTEXT_CONFIG_OPTION_USE_GL_BUFFERS = 1,
+  /// The kernel disk cache directory. Must be non-null
+  QNN_GPU_CONTEXT_CONFIG_OPTION_KERNEL_REPO_DIR = 2,
+  /// If non-zero, the kernel disk cache will be ignored when initializing
+  QNN_GPU_CONTEXT_CONFIG_OPTION_INVALIDATE_KERNEL_REPO = 3,
+  /// Unused, present to ensure 32 bits.
+  QNN_GPU_CONTEXT_CONFIG_OPTION_UNDEFINED = 0x7FFFFFFF
+} QnnGpuContext_ConfigOption_t;
+
+/**
+ * @brief An enum which defines the different GPU performance hint options.
+ */
+typedef enum {
+  /// Sets the GPU performance hint to high performance, this is the default
+  QNN_GPU_CONTEXT_PERF_HINT_HIGH = 0,
+  /// Sets the GPU performance hint to normal performance
+  QNN_GPU_CONTEXT_PERF_HINT_NORMAL = 1,
+  /// Sets the GPU performance hint to low performance
+  QNN_GPU_CONTEXT_PERF_HINT_LOW = 2
+} QnnGpuContext_PerfHint_t;
+
+/**
+ * @brief A struct which defines the QNN GPU context custom configuration options.
+ *        Objects of this type are to be referenced through QnnContext_CustomConfig_t.
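+ *
+ * Editor's illustrative sketch (not part of the original header), using the
+ * initializer macro defined further below:
+ * @code
+ *   QnnGpuContext_CustomConfig_t cfg = QNN_GPU_CONTEXT_CUSTOM_CONFIG_INIT;
+ *   cfg.option   = QNN_GPU_CONTEXT_CONFIG_OPTION_PERF_HINT;
+ *   cfg.perfHint = QNN_GPU_CONTEXT_PERF_HINT_NORMAL;
+ * @endcode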
+ */
+typedef struct {
+  QnnGpuContext_ConfigOption_t option;
+  union UNNAMED {
+    QnnGpuContext_PerfHint_t perfHint;
+    uint8_t useGLBuffers;
+    const char* kernelRepoDir;
+    uint8_t invalidateKernelRepo;
+  };
+} QnnGpuContext_CustomConfig_t;
+
+// clang-format off
+/// QnnGpuContext_CustomConfig_t initializer macro
+#define QNN_GPU_CONTEXT_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_GPU_CONTEXT_CONFIG_OPTION_UNDEFINED, /*option*/ \
+    { \
+      QNN_GPU_CONTEXT_PERF_HINT_HIGH /*perfHint*/ \
+    } \
+  }
+// clang-format on
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h
new file mode 100755
index 0000000000000..e0652d44883ef
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h
@@ -0,0 +1,72 @@
+//==============================================================================
+//
+// Copyright (c) 2020-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines the QNN GPU specialization of the QnnGraph.h interface.
+ */
+
+#ifndef QNN_GPU_GRAPH_H
+#define QNN_GPU_GRAPH_H
+
+#ifdef __cplusplus
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif
+
+#include "QnnGraph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief An enum which defines the different tensor optimization options. A
+ *        tensor may be optimized to the specified QnnGpu_Precision_t when it
+ *        is a graph tensor that is not a graph input or a graph output and
+ *        does not connect two operations from different op packages.
+ */
+typedef enum {
+  /// Sets the precision mode to floating point 32-bit (FP32)
+  QNN_GPU_PRECISION_FP32 = 0,
+  /// Sets the precision mode to floating point 16-bit (FP16)
+  QNN_GPU_PRECISION_FP16 = 1,
+  /// Sets the precision mode to FP16 for storage and FP32 for calculations
+  QNN_GPU_PRECISION_HYBRID = 2,
+  /// Uses the tensor data type provided by the user (default)
+  QNN_GPU_PRECISION_USER_PROVIDED = 3,
+} QnnGpu_Precision_t;
+
+/**
+ * @brief A struct which defines the QNN GPU graph custom configuration options.
+ *        Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ */
+typedef struct {
+  QnnGpu_Precision_t precision;
+  uint8_t disableMemoryOptimizations;
+  uint8_t disableNodeOptimizations;
+  uint8_t disableQueueRecording;
+} QnnGpuGraph_CustomConfig_t;
+
+// clang-format off
+/// QnnGpuGraph_CustomConfig_t initializer macro
+#define QNN_GPU_GRAPH_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_GPU_PRECISION_USER_PROVIDED, /*precision*/ \
+    0u, /*disableMemoryOptimizations*/ \
+    0u, /*disableNodeOptimizations*/ \
+    0u /*disableQueueRecording*/ \
+  }
+// clang-format on
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h
new file mode 100755
index 0000000000000..1c6cd5c3e032a
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h
@@ -0,0 +1,52 @@
+//==============================================================================
+//
+// Copyright (c) 2024 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines the QNN GPU specialization of the QnnMem.h interface.
+ */
+
+#ifndef QNN_GPU_MEM_H
+#define QNN_GPU_MEM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* QnnGpuMem_Buffer_t;
+
+/**
+ * @brief This enum defines QNN GPU memory type
+ */
+typedef enum { QNN_GPU_MEM_OPENCL = 0, QNN_GPU_MEM_UNDEFINED = 0x7FFFFFF } QnnGpu_MemType_t;
+
+/**
+ * @brief A struct which defines the QNN GPU memory preallocated by the client.
+ *        Objects of this type are to be referenced through Qnn_MemInfoCustom_t.
+ */
+typedef struct {
+  QnnGpu_MemType_t memType;
+  union {
+    QnnGpuMem_Buffer_t buffer;
+  };
+} QnnGpu_MemInfoCustom_t;
+
+// clang-format off
+/// QnnGpu_MemInfoCustom_t initializer macro
+#define QNN_GPU_MEMINFO_CUSTOM_INIT \
+  { \
+    QNN_GPU_MEM_UNDEFINED, /*memType*/ \
+    NULL /* buffer*/ \
+  }
+// clang-format on
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h
new file mode 100755
index 0000000000000..5413f50ba2267
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h
@@ -0,0 +1,682 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines the QNN GPU specialization of the QnnOpPackage.h interface.
+ */
+
+#ifndef QNN_GPU_OP_PACKAGE_H
+#define QNN_GPU_OP_PACKAGE_H
+
+#ifdef __cplusplus
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif
+
+#include "GPU/QnnGpuCommon.h"
+#include "GPU/QnnGpuGraph.h"
+#include "QnnOpPackage.h"
+#include "QnnTypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// QnnOpPackage_GlobalInfrastructure_t specialization.
+//============================================================================= + +/** + * @brief A struct which is used to communicate device constant properties + */ +typedef struct { + /// GPU device version string + char deviceVersion[128]; + /// GPU driver interface version {major, minor} + uint32_t interfaceVersion[2]; + /// GPU Adreno(TM) tier string + char tierName[8]; + /// GPU driver version {product, major, minor, patch} + uint32_t compilerVersion[4]; + /// GPU device max work group size + size_t maxWorkGroupSize; + /// GPU device image 2D max width + size_t image2dMaxWidth; + /// GPU device image 2D max height + size_t image2dMaxHeight; + /// GPU device max memory allocation size + size_t maxBufferAllocSize; + /// GPU device addr alignment in bits + uint32_t baseAddrAlignment; + /// GPU device image 2D Array max width + size_t image2dArrayMaxWidth; + /// GPU device image 2D Array max height + size_t image2dArrayMaxHeight; + /// GPU device image 2D Array max depth + size_t image2dArrayMaxDepth; +} QnnGpu_DeviceProperties_t; + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_GlobalInfrastructure_t + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + /// GPU backend version (as returned by QnnBackend_getApiVersion()) + const Qnn_ApiVersion_t* sdkApiVersion; + /// GPU device properties + const QnnGpu_DeviceProperties_t* deviceProperties; + /// Null terminated path to the OpenCL driver used by the backend + const char* driverPath; +} QnnGpuOpPackage_GlobalInfrastructure_t; + +//============================================================================= +// QnnOpPackage_PackageInfo_t specialization. +//============================================================================= + +/** + * @brief A struct having op package specific information + */ +typedef struct _QnnOpPackage_PackageInfo_t { + /// Null terminated hash key string of all kernel sources + const char* kernelRepoHash; +} QnnGpuOpPackage_PackageInfo_t; + +//============================================================================= +// QnnOpPackage_Optimization_t specialization. +//============================================================================= + +/** + * @brief An enum to specify the QNN GPU optimization type + * + */ +typedef enum { + /// Undefined option only used for QNN_GPU_OP_PACKAGE_OPTIMIZATION_INIT + QNN_GPU_OPTIMIZATION_TYPE_UNDEFINED = 0, + /// Super node optimization + QNN_GPU_OPTIMIZATION_TYPE_SUPER_NODE = 2, +} QnnGpuOpPackage_OptimizationType_t; + +/** + * @brief A struct representing a super node connection constraint. + */ +typedef struct { + /// Producer node corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations + uint32_t producer; + /// Output tensor index corresponding to the producer node + uint32_t producerOutputIndex; + /// Consumer node corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations + uint32_t consumer; + /// Output tensor index corresponding to the consumer node + uint32_t consumerInputIndex; +} QnnGpuOpPackage_SuperNodeConnectionConstraint_t; + +/** + * @brief An enum to specify the source of a tensor in an op def for a tensor constraint. + * + */ +typedef enum { + /// Tensor is an op def output + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_SOURCE_OUTPUT = 1, + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_SOURCE_INPUT = 2, +} QnnGpuOpPackage_TensorConstraintSource_t; + +/** + * @brief An enum to specify the tensor constraint type. + * + */ +typedef enum { + /// Add a Qnn_DataType_t to the whitelist of allowable types. 
+  /// If no data type constraint is present for a tensor, all data types are allowed.
+  QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_DATA_TYPE = 1,
+  /// Tensor must match its rank
+  QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_RANK = 2,
+  /// Tensor must match one of its dimensions
+  QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_DIMENSION = 3,
+  /// Add a Qnn_TensorType_t to the whitelist of allowable tensor types.
+  /// If no tensor type constraint is present for a tensor, all types are allowed.
+  QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_TENSOR_TYPE = 4,
+} QnnGpuOpPackage_TensorConstraintType_t;
+
+/**
+ * @brief A struct representing a tensor constraint.
+ */
+typedef struct {
+  /// Operation corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations
+  uint32_t operationIndex;
+  /// Source of the tensor in the Qnn_OpConfig_t
+  QnnGpuOpPackage_TensorConstraintSource_t source;
+  union {
+    /// Tensor index in the Qnn_OpConfig_t, used only for inputs and outputs
+    uint32_t index;
+    /// Tensor parameter name in the Qnn_OpConfig_t, used only for parameters
+    const char* name;
+  };
+  /// Type of tensor constraint
+  QnnGpuOpPackage_TensorConstraintType_t type;
+  union {
+    /// Tensor data type for Qnn_DataType_t constraints
+    Qnn_DataType_t dataType;
+    /// Tensor type for Qnn_TensorType_t constraints
+    Qnn_TensorType_t tensorType;
+    /// Tensor rank for rank constraints
+    uint32_t rank;
+    struct {
+      /// Tensor dimension index for dimension constraints
+      uint32_t index;
+      /// Tensor dimension size for dimension constraints
+      uint32_t size;
+    } dimension;
+  };
+} QnnGpuOpPackage_TensorConstraint_t;
+
+typedef struct {
+  /// Null-terminated array of comma separated lists of operations used for matching super node ops.
+  /// An asterisk (*) may be used to represent any operation type.
+  const char** operations;
+  /// Null-terminated array of pointers to super node connection constraints
+  QnnGpuOpPackage_SuperNodeConnectionConstraint_t** connectionConstraints;
+  /// Null-terminated array of pointers to super node tensor constraints
+  QnnGpuOpPackage_TensorConstraint_t** tensorConstraints;
+} QnnGpuOpPackage_SuperNodeOptimization_t;
+
+// clang-format off
+/// QnnGpuOpPackage_SuperNodeOptimization_t initializer macro
+#define QNN_GPU_OP_PACKAGE_SUPER_NODE_OPTIMIZATION_INIT \
+  { \
+    NULL, /*operations*/ \
+    NULL, /*connectionConstraints*/ \
+    NULL, /*tensorConstraints*/ \
+  }
+// clang-format on
+
+/**
+ * @brief A struct representing a QNN GPU optimization.
+ */
+typedef struct _QnnOpPackage_Optimization_t {
+  /// Type of optimization
+  QnnGpuOpPackage_OptimizationType_t type;
+  /// Op package assigned name of the optimization
+  const char* name;
+  union {
+    /// Super node optimization, used when type is QNN_GPU_OPTIMIZATION_TYPE_SUPER_NODE
+    const QnnGpuOpPackage_SuperNodeOptimization_t* superNode;
+  };
+} QnnGpuOpPackage_Optimization_t;
+
+/// QnnGpuOpPackage_Optimization_t initializer macro
+#define QNN_GPU_OP_PACKAGE_OPTIMIZATION_INIT \
+  { \
+    QNN_GPU_OPTIMIZATION_TYPE_UNDEFINED, NULL, { NULL } \
+  }
+
+//=============================================================================
+// QnnOpPackage_GraphInfrastructure_t specialization.
+//============================================================================= + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_GraphInfrastructure_t + */ +typedef struct _QnnOpPackage_GraphInfrastructure_t { + /// GPU precision mode, user-supplied hint used for optimal kernel selection + QnnGpu_Precision_t precisionMode; +} QnnGpuOpPackage_GraphInfrastructure_t; + +//============================================================================= +// QNN GPU Memory Object +//============================================================================= + +/** + * @brief An enum to specify the QNN GPU memory object type + * + */ +typedef enum { + /// Host memory, only used for Qnn_Param_t tensors + QNN_GPU_MEM_OBJ_TYPE_HOST = 0, + /// GPU driver buffer memory object + QNN_GPU_MEM_OBJ_TYPE_BUFFER = 1, + /// GPU driver image 2D memory object + QNN_GPU_MEM_OBJ_TYPE_IMAGE2D = 2, + /// GPU driver image 2D array memory object + QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY = 3, + /// Aggregation of GPU driver image 2D memory objects + QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D = 4, + /// Aggregation of GPU driver image 2D array memory objects + QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY = 5, + /// Memory type is unclaimed and can be specified by the op package via the \n + /// QnnGpu_OutputClaim_t struct + QNN_GPU_MEM_OBJ_TYPE_UNCLAIMED = 6, +} QnnGpu_MemoryObjectType_t; + +/** + * @brief An enum to specify the QNN GPU memory layout + * + */ +typedef enum { + /// HWC layout + QNN_GPU_MEM_LAYOUT_HWC = 0, + /// HCW layout + QNN_GPU_MEM_LAYOUT_HCW = 1, + /// CHW layout + QNN_GPU_MEM_LAYOUT_CHW = 2, + /// Undefined + QNN_GPU_MEM_LAYOUT_UNDEFINED = 0x7FFFFFFF, +} QnnGpu_MemoryLayout_t; + +/** + * @brief A struct to specify blockSize for weight Tensor and tensorId for weight Param tensor + */ +typedef struct { + // Block Quantization, block Sizes + uint32_t* bqBlockSize; + /// Tensor Id for Quantization encodings + uint32_t bqEncodingTensorId; +} QnnGpu_BlockEncodingInfo_t; + +// clang-format off +/// QnnGpu_MemoryObject_t initializer macro +#define QNN_GPU_BLOCK_ENCODING_INFO_INIT \ + { \ + NULL, /*bqBlockSize*/ \ + 0u /*bqEncodingTensorId*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specifying a memory object + * This struct is used with the following kernel argument types: + * - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ + * - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE + * - QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE + */ +typedef struct { + /// Type of memory object + QnnGpu_MemoryObjectType_t type; + /// Data type of the memory object + Qnn_DataType_t dataType; + /// Memory object dimensions \n + /// Size is numDimensions. Uses the following type dependent format: \n + /// QNN_GPU_MEM_OBJ_TYPE_BUFFER -> {numElements} \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D -> {height,width} \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY -> {height,width,array_size} \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D -> {num_batches,height,width} \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY -> {num_batches,height,width,array_size} + uint32_t* dimensions; + /// Memory object offsets \n + /// Size is numDimensions. \n + /// Indicates where the data store starts in the memory object. \n + uint32_t* offsets; + /// Number of dimensions in memory object \n + /// Size is numDimensions. 
Has the following type dependent size: \n + /// QNN_GPU_MEM_OBJ_TYPE_BUFFER -> 1 \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D -> 2 \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY -> 3 \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D -> 3 \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY -> 4 + uint32_t numDimensions; + /// Memory object layout \n + /// Op package specific layout identifier \n + /// Default is QNN_GPU_MEM_LAYOUT_UNDEFINED if not already specified by a prior operation + QnnGpu_MemoryLayout_t layout; + /// Block Quantization Tensor Information + QnnGpu_BlockEncodingInfo_t blockEncodingInfo; +} QnnGpu_MemoryObject_t; + +// clang-format off +/// QnnGpu_MemoryObject_t initializer macro +#define QNN_GPU_MEMORY_OBJECT_INIT \ + { \ + QNN_GPU_MEM_OBJ_TYPE_UNCLAIMED, /*type*/ \ + QNN_DATATYPE_UNDEFINED, /*dataType*/ \ + NULL, /*dimensions*/ \ + NULL, /*offsets*/ \ + 0u, /*numDimensions*/ \ + QNN_GPU_MEM_LAYOUT_UNDEFINED, /*layout*/ \ + QNN_GPU_BLOCK_ENCODING_INFO_INIT /*blockEncodingInfo*/ \ + } +// clang-format on + +//============================================================================= +// QnnOpPackage_Node_t specialization. +//============================================================================= + +/** + * @brief A QNN GPU struct specifying a storage tensor + */ +typedef struct { + /// Tensor ID + uint32_t id; + /// Tensor's associated memory object + const QnnGpu_MemoryObject_t* memoryObject; +} QnnGpu_TensorStorageType_t; + +// clang-format off +/// QnnGpu_TensorStorageType_t initializer macro +#define QNN_GPU_TENSOR_STORAGE_TYPE_INIT \ + { \ + 0u, /*id*/ \ + NULL /*memoryObject*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_Node_t + */ +typedef struct _QnnOpPackage_Node_t { + /// Optimization index, see QnnOpPackage_Info_t, ignore when only one op config provided + uint32_t optimization; + /// Null-terminated array of operation config pointers + /// Only one pointer provided when no optimizations performed + const Qnn_OpConfig_t** configs; + /// Null-terminated array of tensor storage type pointers called out in the config + const QnnGpu_TensorStorageType_t** storageTypes; + /// Kernel variant index, if set then used by OpPackage to determine kernel selection + int32_t kernelVariant; +} QnnGpuOpPackage_Node_t; + +//============================================================================= +// QnnOpPackage_OpImpl_t specialization. +//============================================================================= + +/** + * @brief A QNN GPU struct specifying an output tensor claim. Using the principle + * of least work, operations must output a memory object type that is most + * convenient for itself. Only QNN_TENSOR_TYPE_NATIVE tensor types may + * be claimed. + */ +typedef struct { + /// Index into the Qnn_OpConfig_t provided in QnnGpuOpPackage_Node_t + uint32_t opConfigIndex; + /// Index into the operation outputs to identify the tensor + uint32_t outputIndex; + /// Specification of the claimed memory object + const QnnGpu_MemoryObject_t* memoryObject; +} QnnGpu_OutputClaim_t; + +// clang-format off +/// QnnGpu_OutputClaim_t initializer macro +#define QNN_GPU_OUTPUT_CLAIM_INIT \ + { \ + 0u, /*opConfigIndex*/ \ + 0u, /*outputIndex*/ \ + NULL /*memoryObject*/ \ + } +// clang-format on + +/** + * @brief An enum to specify the kernel argument type. 
+ *
+ */
+typedef enum {
+  /// Operation input tensor used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ = 0,
+  /// Operation input tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE = 1,
+  /// Operation output tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE = 2,
+  /// Operation internal tensor used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ = 3,
+  /// Operation internal tensor used as kernel input/output
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE = 4,
+  /// Operation internal tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE = 5,
+  /// Plain old data kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_DATA = 6,
+  /// Local memory kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_LOCAL = 7,
+  /// Null pointer kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR = 8,
+  /// Operation tensor parameter used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_OP_TENSOR_PARAM = 9,
+} QnnGpu_KernelArgType_t;
+
+/**
+ * @brief A QNN GPU struct specifying a kernel argument corresponding to a tensor.
+ *        This struct is used with the following kernel argument types:
+ *          - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ
+ *          - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE
+ *          - QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE
+ *          - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ
+ *          - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE
+ *          - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE
+ */
+typedef struct {
+  /// Index into the Qnn_OpConfig_t provided in QnnGpuOpPackage_Node_t, ignored for INTERNAL types
+  uint32_t opConfigIndex;
+  /// Index into the operation input or output list or the internal tensor list
+  uint32_t tensorIndex;
+  /// Batch element index for aggregated tensor types
+  uint32_t element;
+} QnnGpu_TensorKernelArg_t;
+
+// clang-format off
+/// QnnGpu_TensorKernelArg_t initializer macro
+#define QNN_GPU_TENSOR_KERNEL_ARG_INIT \
+  { \
+    0u, /*opConfigIndex*/ \
+    0u, /*tensorIndex*/ \
+    0u /*element*/ \
+  }
+// clang-format on
+
+/**
+ * @brief An enum to specify the kernel data argument type.
+ *
+ */
+typedef enum {
+  QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR = 0,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_UCHAR = 1,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_SHORT = 2,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_USHORT = 3,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_INT = 4,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_UINT = 5,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_LONG = 6,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_ULONG = 7,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT = 8,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_DOUBLE = 9,
+} QnnGpu_DataKernelArgType_t;
+
+/**
+ * @brief A QNN GPU struct specifying a kernel argument corresponding to a plain old data.
+ *        This struct is used only with the QNN_GPU_KERNEL_ARG_TYPE_DATA arg type.
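+ *
+ * Editor's illustrative sketch (not part of the original header), using the
+ * initializer macro defined below:
+ * @code
+ *   QnnGpu_DataKernelArg_t arg = QNN_GPU_DATA_KERNEL_ARG_INIT;
+ *   arg.type     = QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT;
+ *   arg.qnnFloat = 1.0f;
+ * @endcode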
+ */ +typedef struct { + /// Data type of the data + QnnGpu_DataKernelArgType_t type; + union { + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR + int8_t qnnChar; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_UCHAR + uint8_t qnnUChar; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_SHORT + int16_t qnnShort; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_USHORT + uint16_t qnnUShort; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_INT + int32_t qnnInt; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_UINT + uint32_t qnnUInt; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_LONG + int64_t qnnLong; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_ULONG + uint64_t qnnULong; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT + float qnnFloat; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_DOUBLE + double qnnDouble; + }; +} QnnGpu_DataKernelArg_t; + +/// QnnGpu_DataKernelArg_t initializer macro +#define QNN_GPU_DATA_KERNEL_ARG_INIT \ + { \ + QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR, /*type*/ \ + { \ + 0 /*qnnChar*/ \ + } \ + } + +/** + * @brief A QNN GPU struct specifying a kernel argument corresponding to a local memory type. + * This struct is used only with the QNN_GPU_KERNEL_ARG_TYPE_LOCAL arg type. + */ +typedef struct { + /// Size of the memory requested in bytes + uint32_t size; +} QnnGpu_LocalKernelArg_t; + +/// QnnGpu_LocalKernelArg_t initializer macro +#define QNN_GPU_LOCAL_KERNEL_ARG_INIT \ + { 0u /*size*/ } + +/** + * @brief A QNN GPU struct specifying a kernel argument. + * Note that the QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR type does not have an entry in + * the union. + */ +typedef struct { + /// Type of kernel argument + QnnGpu_KernelArgType_t type; + union { + /// Tensor type argument + QnnGpu_TensorKernelArg_t tensor; + /// Plain old data argument + QnnGpu_DataKernelArg_t data; + /// Local memory argument + QnnGpu_LocalKernelArg_t local; + }; +} QnnGpu_KernelArg_t; + +/// QnnGpu_KernelArg_t initializer macro +#define QNN_GPU_KERNEL_ARG_INIT \ + { \ + QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR, /*type*/ \ + { \ + QNN_GPU_TENSOR_KERNEL_ARG_INIT /*tensor*/ \ + } \ + } + +/** + * @brief An enum to specify the kernel source type. + * + */ +typedef enum { + QNN_GPU_KERNEL_SOURCE_TYPE_TEXT = 0, + QNN_GPU_KERNEL_SOURCE_TYPE_BINARY = 1, +} QnnGpu_KernelSourceType_t; + +/** + * @brief This enum defines QNN GPU kernel tuning options. + */ +typedef enum { + /// local work size tuning + QNN_GPU_KERNEL_TUNING_LOCAL_WORK_SIZE = 0, + QNN_GPU_KERNEL_TUNING_UNDEFINED = 0x7FFFFFFF +} QnnGpu_KernelTuningOption_t; + +/** + * @brief This struct provides local-work-size tuning configuration. + */ +typedef struct { + uint32_t minValue[3]; + uint32_t maxValue[3]; + uint32_t stepSize[3]; +} QnnGpu_KernelLocalWorkSizeTuning_t; + +/** + * @brief This struct provides QNN GPU kernel tuning configuration. + */ +typedef struct { + QnnGpu_KernelTuningOption_t option; + union UNNAMED { + QnnGpu_KernelLocalWorkSizeTuning_t lws; + }; +} QnnGpu_KernelTuningConfig_t; + +/** + * @brief A QNN GPU struct specifying a kernel. 
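+ *        A hedged example of populating the struct for a trivial OpenCL-style
+ *        kernel; the source string, work sizes and empty argument list are
+ *        assumptions for illustration only:
+ * @code
+ *   static const char src[] = "__kernel void noop(__global float* out) {}";
+ *   QnnGpu_Kernel_t k    = QNN_GPU_KERNEL_INIT;
+ *   k.kernelSource       = src;
+ *   k.sourceLength       = sizeof(src);
+ *   k.sourceType         = QNN_GPU_KERNEL_SOURCE_TYPE_TEXT;
+ *   k.name               = "noop";
+ *   k.globalWorkDim      = 1u;
+ *   k.globalWorkSizes[0] = 1024u;
+ *   // k.args would point to a null-terminated QnnGpu_KernelArg_t* array
+ * @endcode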
+ */ +typedef struct { + /// Kernel source code or binary + const void* kernelSource; + /// Length of kernel source/binary in bytes + size_t sourceLength; + /// Type of kernel source + QnnGpu_KernelSourceType_t sourceType; + /// Null terminated build options string used for kernel compilation + const char* buildOptions; + /// Rank of the globalWorkSizes + size_t globalWorkDim; + /// Global work sizes used by enqueuing the kernel + size_t globalWorkSizes[3]; + /// Rank of the localWorkSizes + size_t localWorkDim; + /// Local work sizes used by enqueuing the kernel + size_t localWorkSizes[3]; + /// Null-terminated array of kernel arguments in the order they appear in the kernel function + QnnGpu_KernelArg_t** args; + /// Null terminated name of the kernel + const char* name; + /// If non-zero, kernel will be enqueued during execute even if it is static + uint32_t isDynamic; + /// Null-terminated array to provide kernel tuning configurations. + QnnGpu_KernelTuningConfig_t** tuningConfigs; + /// Reserved field, must be null + void* reserved; +} QnnGpu_Kernel_t; + +// clang-format off +/// QnnGpu_Kernel_t initializer macro +#define QNN_GPU_KERNEL_INIT \ + { \ + NULL, /*kernelSource*/ \ + 0u, /*sourceLength*/ \ + QNN_GPU_KERNEL_SOURCE_TYPE_TEXT, /*sourceType*/ \ + NULL, /*buildOptions*/ \ + 0u, /*globalWorkDim*/ \ + {0u}, /*globalWorkSizes*/ \ + 0u, /*localWorkDim*/ \ + {0u}, /*localWorkSizes*/ \ + NULL, /*args*/ \ + NULL, /*name*/ \ + 0u, /*isDynamic*/ \ + NULL, /*tuningConfigs*/ \ + NULL /*reserved*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specifying an operation. + */ +typedef struct _QnnOpPackage_OpImpl_t { + /// Null-terminated array of output claims + QnnGpu_OutputClaim_t** outputClaims; + /// Null-terminated array of tensor requests + QnnGpu_MemoryObject_t** memoryObjects; + /// Null-terminated array of kernels + QnnGpu_Kernel_t** kernels; +} QnnGpu_Operation_t; + +// clang-format off +/// QnnGpu_Operation_t initializer macro +#define QNN_GPU_OPERATION_INIT \ + { \ + NULL, /*outputClaims*/ \ + NULL, /*memoryObjects*/ \ + NULL, /*kernels*/ \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h new file mode 100755 index 0000000000000..3adb43819b8b3 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h @@ -0,0 +1,50 @@ +//============================================================================= +// +// Copyright (c) 2024 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN GenAiTransformer Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for the GenAiTransformer backend
+ */
+
+#ifndef QNN_GENAI_TRANSFORMER_COMMON_H
+#define QNN_GENAI_TRANSFORMER_COMMON_H
+
+#include "QnnCommon.h"
+
+/// GenAiTransformer Backend identifier
+#define QNN_BACKEND_ID_GENAI_TRANSFORMER 14
+
+/// GenAiTransformer interface provider
+#define QNN_GENAI_TRANSFORMER_INTERFACE_PROVIDER_NAME "GENAI_TRANSFORMER_QTI_AISW"
+
+// GenAiTransformer API Version values
+#define QNN_GENAI_TRANSFORMER_API_VERSION_MAJOR 1
+#define QNN_GENAI_TRANSFORMER_API_VERSION_MINOR 0
+#define QNN_GENAI_TRANSFORMER_API_VERSION_PATCH 0
+
+// clang-format off
+/// Macro to set Qnn_ApiVersion_t for GENAI_TRANSFORMER backend
+#define QNN_GENAI_TRANSFORMER_API_VERSION_INIT                             \
+  {                                                                        \
+    {                                                                      \
+      QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/                      \
+      QNN_API_VERSION_MINOR, /*coreApiVersion.minor*/                      \
+      QNN_API_VERSION_PATCH  /*coreApiVersion.patch*/                      \
+    },                                                                     \
+    {                                                                      \
+      QNN_GENAI_TRANSFORMER_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_GENAI_TRANSFORMER_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_GENAI_TRANSFORMER_API_VERSION_PATCH  /*backendApiVersion.patch*/ \
+    }                                                                      \
+  }
+
+// clang-format on
+
+#endif // QNN_GENAI_TRANSFORMER_COMMON_H
\ No newline at end of file
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h
new file mode 100755
index 0000000000000..e756b8042ec09
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h
@@ -0,0 +1,76 @@
+//=============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTA component Backend API.
+ *
+ *         The interfaces in this file work with the top level QNN
+ *         API and supplement QnnBackend.h for the HTA backend
+ */
+
+#ifndef QNN_HTA_BACKEND_H
+#define QNN_HTA_BACKEND_H
+
+#include "QnnBackend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/* @brief Enum describing the set of features supported by the HTA backend.
+   This is used as a bitmask, so assign a unique bit to each entry.
+*/
+typedef enum {
+  /// The accelerator will always attempt to fold a relu activation
+  /// into the immediately preceding convolution operation. This optimization
+  /// is correct when the quantization ranges for the convolution are equal
+  /// to or a subset of those of the Relu operation.
+  /// For graphs where this cannot be
+  /// guaranteed, the client should set this flag.
+  QNN_HTA_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 1 << 0,
+  /// UNKNOWN enum value that must not be used
+  QNN_HTA_BACKEND_FEATURES_UNKNOWN = 0x7fffffff
+} QnnHtaBackend_Features_t;
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/**
+ * @brief Structure describing the set of configurations supported by the backend.
+ *        Objects of this type are to be referenced through QnnBackend_CustomConfig_t.
+ */
+typedef struct {
+  /// field to save the features that are passed
+  /// via QnnHtaBackend_Features_t
+  uint32_t bitmaskFeatures;
+} QnnHtaBackend_CustomConfig_t;
+
+/// QnnHtaBackend_CustomConfig_t initializer macro
+#define QNN_HTA_BACKEND_CUSTOM_CONFIG_INIT \
+  { 0 /*bitmaskFeatures*/ }
+
+// clang-format on
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h
new file mode 100755
index 0000000000000..1eb8e1f0a99a4
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h
@@ -0,0 +1,62 @@
+//=============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTA Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for the HTA backend
+ */
+
+#ifndef QNN_HTA_COMMON_H
+#define QNN_HTA_COMMON_H
+
+#include "QnnCommon.h"
+
+/// HTA Backend identifier
+#define QNN_BACKEND_ID_HTA 7
+
+/// HTA interface provider
+#define QNN_HTA_INTERFACE_PROVIDER_NAME "HTA_QTI_AISW"
+
+// HTA API Version values
+
+#define QNN_HTA_API_VERSION_MAJOR 2
+#define QNN_HTA_API_VERSION_MINOR 0
+#define QNN_HTA_API_VERSION_PATCH 0
+
+// clang-format off
+
+/// Macro to set Qnn_ApiVersion_t for HTA backend
+#define QNN_HTA_API_VERSION_INIT                             \
+  {                                                          \
+    {                                                        \
+      QNN_API_VERSION_MAJOR,     /*coreApiVersion.major*/    \
+      QNN_API_VERSION_MINOR,     /*coreApiVersion.minor*/    \
+      QNN_API_VERSION_PATCH      /*coreApiVersion.patch*/    \
+    },                                                       \
+    {                                                        \
+      QNN_HTA_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_HTA_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_HTA_API_VERSION_PATCH  /*backendApiVersion.patch*/ \
+    }                                                        \
+  }
+
+// clang-format on
+
+// HTA Binary Version values
+#define QNN_HTA_BINARY_VERSION_MAJOR 2
+#define QNN_HTA_BINARY_VERSION_MINOR 0
+#define QNN_HTA_BINARY_VERSION_PATCH 0
+
+// HTA Context blob Version values
+#define QNN_HTA_CONTEXT_BLOB_VERSION_MAJOR 1
+#define QNN_HTA_CONTEXT_BLOB_VERSION_MINOR 1
+#define QNN_HTA_CONTEXT_BLOB_VERSION_PATCH 0
+
+#endif // QNN_HTA_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h
new file mode 100755
index 0000000000000..d31f5232e21f3
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h
@@ -0,0 +1,41 @@
+//=============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTA component Device API.
+ *
+ *         The interfaces in this file work with the top level QNN
+ *         API and supplement QnnDevice.h for the HTA backend
+ */
+#ifndef QNN_HTA_DEVICE_H
+#define QNN_HTA_DEVICE_H
+
+#include "QnnDevice.h"
+#include "QnnHtaPerfInfrastructure.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _QnnDevice_Infrastructure_t {
+  QnnHtaPerfInfrastructure_SetPowerConfigFn_t setPowerConfig;
+} QnnHtaDevice_Infrastructure_t;
+
+// clang-format off
+/// QnnHtaDevice_Infrastructure_t initializer macro
+#define QNN_HTA_DEVICE_INFRASTRUCTURE_INIT \
+  {                                        \
+    NULL, /*setPowerConfig*/               \
+  }
+// clang-format on
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
\ No newline at end of file
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h
new file mode 100755
index 0000000000000..0abbb9bc5114d
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h
@@ -0,0 +1,123 @@
+//=============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTA component Graph API.
+ *
+ *         The interfaces in this file work with the top level QNN
+ *         API and supplement QnnGraph.h for the HTA backend
+ */
+
+#ifndef QNN_HTA_GRAPH_H
+#define QNN_HTA_GRAPH_H
+
+#include "QnnGraph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief This enum provides different HTA graph optimization
+ *        options that can be used to finalize the graph
+ *        for optimum performance
+ */
+typedef enum QnnHtaGraph_OptimizationType {
+  QNN_HTA_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD = 1,
+  QNN_HTA_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES   = 2,
+  QNN_HTA_GRAPH_OPTIMIZATION_TYPE_UNKNOWN            = 0x7fffffff
+} QnnHtaGraph_OptimizationType_t;
+
+/* @brief Struct describing the set of optimization types
+ *        and the value associated with each optimization
+ */
+typedef struct QnnHtaGraph_OptimizationOption {
+  QnnHtaGraph_OptimizationType_t type;
+  float floatValue;
+} QnnHtaGraph_OptimizationOption_t;
+
+// clang-format off
+/// QnnHtaGraph_OptimizationOption_t initializer macro
+#define QNN_HTA_GRAPH_OPTIMIZATION_OPTION_INIT              \
+  {                                                         \
+    QNN_HTA_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/       \
+    0.0f                                     /*floatValue*/ \
+  }
+// clang-format on
+
+/**
+ * @brief This enum provides different HTA graph configuration
+ *        options associated with QnnGraph
+ */
+typedef enum QnnHtaGraph_ConfigOption {
+  QNN_HTA_GRAPH_CONFIG_OPTION_OPTIMIZATION = 1,
+  QNN_HTA_GRAPH_CONFIG_OPTION_PRIORITY     = 2,
+  QNN_HTA_GRAPH_CONFIG_OPTION_UNKNOWN      = 0x7fffffff
+} QnnHtaGraph_ConfigOption_t;
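+
+/**
+ * Illustrative usage sketch (not part of the original header; the threshold
+ * value is an assumption chosen for demonstration): an optimization option
+ * is populated first and then wrapped in the graph custom config defined
+ * below.
+ * @code
+ *   QnnHtaGraph_OptimizationOption_t opt = QNN_HTA_GRAPH_OPTIMIZATION_OPTION_INIT;
+ *   opt.type       = QNN_HTA_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD;
+ *   opt.floatValue = 0.5f;  // assumed threshold, not an SDK-documented value
+ * @endcode
+ */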
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/**
+ * @brief Structure describing the set of configurations supported by graph.
+ *        Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ *
+ *        The struct has two fields - option and a union of corresponding config values.
+ *        Based on the option, the corresponding item in the union can be used to
+ *        specify the config.
+ *        Below is the Map between QnnHtaGraph_ConfigOption_t and config value
+ *
+ * \verbatim embed:rst:leading-asterisk
+ * +----+------------------------------------------+------------------------------------+
+ * | #  | Config Option                            | Configuration Struct/value         |
+ * +====+==========================================+====================================+
+ * | 1  | QNN_HTA_GRAPH_CONFIG_OPTION_OPTIMIZATION | QnnHtaGraph_OptimizationOption_t   |
+ * +----+------------------------------------------+------------------------------------+
+ * | 2  | QNN_HTA_GRAPH_CONFIG_OPTION_PRIORITY     | Qnn_Priority_t                     |
+ * +----+------------------------------------------+------------------------------------+
+ * \endverbatim
+ */
+typedef struct {
+  QnnHtaGraph_ConfigOption_t option;
+  union {
+    QnnHtaGraph_OptimizationOption_t optimizationOption;
+    Qnn_Priority_t priority;
+  };
+} QnnHtaGraph_CustomConfig_t;
+
+
+/// QnnHtaGraph_CustomConfig_t initializer macro
+#define QNN_HTA_GRAPH_CUSTOM_CONFIG_INIT                            \
+  {                                                                 \
+    QNN_HTA_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/                 \
+    {                                                               \
+      QNN_HTA_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \
+    }                                                               \
+  }
+
+// clang-format on
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h
new file mode 100755
index 0000000000000..4f6e0c22c274b
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h
@@ -0,0 +1,134 @@
+//==============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/** @file
+ *  @brief QNN HTA component Performance Infrastructure API
+ *
+ *         Provides an interface for the client to control performance and system
+ *         settings of the QNN HTA Accelerator
+ */
+
+#ifndef QNN_HTA_PERF_INFRASTRUCTURE_H
+#define QNN_HTA_PERF_INFRASTRUCTURE_H
+
+#include "QnnCommon.h"
+#include "QnnTypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief QNN HTA PerfInfrastructure API result / error codes.
+ *
+ */
+typedef enum {
+  QNN_HTA_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE,
+  ////////////////////////////////////////////////////////////////////////
+
+  QNN_HTA_PERF_INFRASTRUCTURE_NO_ERROR                 = QNN_SUCCESS,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT      = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_TRANSPORT          = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3,
+
+  ////////////////////////////////////////////////////////////////////////
+  QNN_HTA_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE
+} QnnHtaPerfInfrastructure_Error_t;
+
+/**
+ * @brief This enum defines all the possible performance
+ *        options in the HTA Performance Infrastructure that
+ *        relate to setting up power levels
+ */
+typedef enum {
+  /// This config option implies the usage of the powerModeConfig struct;
+  /// if not provided, it is used as a type identifier
+  QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_POWER_MODE = 1,
+  /// UNKNOWN config option which must not be used
+  QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff
+} QnnHtaPerfInfrastructure_PowerConfigOption_t;
+
+/**
+ * @brief This enum defines all the possible power modes
+ *        that a client can set
+ */
+typedef enum {
+  /// default mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_DEFAULT = 0,
+  /// low power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_LOW_POWER_SAVER = 1,
+  /// power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER = 2,
+  /// high power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_HIGH_POWER_SAVER = 3,
+  /// balanced mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_BALANCED = 4,
+  /// high performance mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_HIGH_PERFORMANCE = 5,
+  /// burst mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_BURST = 6,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnHtaPerfInfrastructure_PowerMode_t;
+
+/**
+ * @brief This struct provides performance infrastructure configuration
+ *        associated with setting up of power levels
+ */
+typedef struct {
+  QnnHtaPerfInfrastructure_PowerConfigOption_t config;
+  // Organized as a union for future expansion flexibility, selected by PowerConfigOption_t
+  union {
+    QnnHtaPerfInfrastructure_PowerMode_t powerModeConfig;
+  };
+} QnnHtaPerfInfrastructure_PowerConfig_t;
+
+/// QnnHtaPerfInfrastructure_PowerConfig_t initializer macro
+#define QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT                   \
+  {                                                                     \
+    QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/  \
+    {                                                                   \
+      QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN /*powerModeConfig*/ \
+    }                                                                   \
+  }
+
+//=============================================================================
+// API Methods
+//=============================================================================
+
+/**
+ * @brief This API allows a client to set up the system power configuration that
+ *        will enable different performance modes.
+ *
+ * @param[in] clientId A power client id to associate calls to system
+ *            power settings. A value of 0 implies a NULL power client id
+ *            and can override every other setting in the user process. To
+ *            enable power settings for multiple clients in the same
+ *            process, use a non-zero power client id.
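+ *
+ *            A hedged calling sketch (the variable names and chosen power
+ *            mode are illustrative assumptions; the function pointer comes
+ *            from QnnHtaDevice_Infrastructure_t):
+ * @code
+ *   QnnHtaPerfInfrastructure_PowerConfig_t cfg = QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
+ *   cfg.config          = QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_POWER_MODE;
+ *   cfg.powerModeConfig = QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_BURST;
+ *   const QnnHtaPerfInfrastructure_PowerConfig_t* cfgs[] = {&cfg, NULL};
+ *   Qnn_ErrorHandle_t err = htaInfra.setPowerConfig(0 /*clientId*/, cfgs);
+ * @endcode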
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config options for performance configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtaPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t clientId, const QnnHtaPerfInfrastructure_PowerConfig_t** config);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // QNN_HTA_PERF_INFRASTRUCTURE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h
new file mode 100755
index 0000000000000..f069dbbedf6b7
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h
@@ -0,0 +1,199 @@
+//==============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN HTA Profile component API.
+ *
+ *        Requires the HTA backend to be initialized.
+ *        Should be used with the QnnProfile API but has HTA-backend-specific
+ *        definitions for different QnnProfile data structures
+ *
+ */
+
+#ifndef QNN_HTA_PROFILE_H
+#define QNN_HTA_PROFILE_H
+
+#include "QnnProfile.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_TIME_MICROSEC 1002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HTA_TIME_MICROSEC 1003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to create the context on the
+ *        accelerator when the client invokes QnnContext_createFromBinary.
+ *        The value returned is time in microseconds.
+ *
+ * @note context load binary accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_TIME_MICROSEC 2001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HTA_TIME_MICROSEC 2002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to finalizing the graph on the accelerator
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph execute host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_TIME_MICROSEC 3001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph execute HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HTA_TIME_MICROSEC 3002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the number of processor cycles taken.
+ *
+ * @note graph execute accelerator time may be available only on
+ *       QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE.
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the time taken in microseconds
+ *
+ * @note graph execute accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC.
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken for miscellaneous work, i.e. time
+ *        that cannot be attributed to a node but is still needed to
+ *        execute the graph on the accelerator. This occurs when the client invokes
+ *        QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the time taken in microseconds
+ *
+ * @note graph execute misc accelerator time is available only on
+ *       QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnContext_free, which consequently deinitializes
+ *        the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_TIME_MICROSEC 4001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnContext_free, which consequently deinitializes
+ *        the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_HTA_TIME_MICROSEC 4002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to deinitialize the graph on the
+ *        accelerator when the client invokes QnnContext_free, which consequently
+ *        deinitializes the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // QNN_HTA_PROFILE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h
new file mode 100755
index 0000000000000..8b1d458a04b8e
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h
@@ -0,0 +1,98 @@
+//=============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTP Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for the HTP backend
+ */
+
+#ifndef QNN_HTP_COMMON_H
+#define QNN_HTP_COMMON_H
+
+#include "QnnCommon.h"
+
+/// HTP Backend identifier
+#define QNN_BACKEND_ID_HTP 6
+
+/// HTP interface provider
+#define QNN_HTP_INTERFACE_PROVIDER_NAME "HTP_QTI_AISW"
+
+// HTP API Version values
+#define QNN_HTP_API_VERSION_MAJOR 5
+#define QNN_HTP_API_VERSION_MINOR 34
+#define QNN_HTP_API_VERSION_PATCH 0
+
+// clang-format off
+
+/// Macro to set Qnn_ApiVersion_t for HTP backend
+#define QNN_HTP_API_VERSION_INIT                             \
+  {                                                          \
+    {                                                        \
+      QNN_API_VERSION_MAJOR,     /*coreApiVersion.major*/    \
+      QNN_API_VERSION_MINOR,     /*coreApiVersion.minor*/    \
+      QNN_API_VERSION_PATCH      /*coreApiVersion.patch*/    \
+    },                                                       \
+    {                                                        \
+      QNN_HTP_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_HTP_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_HTP_API_VERSION_PATCH  /*backendApiVersion.patch*/ \
+    }                                                        \
+  }
+
+// clang-format on
+
+// DSP Context blob Version values
+#define QNN_HTP_CONTEXT_BLOB_VERSION_MAJOR 3
+#define QNN_HTP_CONTEXT_BLOB_VERSION_MINOR 2
+#define QNN_HTP_CONTEXT_BLOB_VERSION_PATCH 3
+
+/* ==== CDSP Security Library Versioning ==== */
+/* ==== This information is only intended for OEMs ==== */
+
+/* Security versioning for DSP libraries is supported V73 onwards */
+#define QNN_HTP_NATIVE_LIB_SECURITY_VERSIONING_MIN_ARCH 73
+
+/* Here we will define CDSP library versions for different targets
+ * Version is increased whenever there is a security fix from CDSP
+ * The versioning will start from 1.0.0 for each new target
+ * */
+
+/* V73 Security Issues:
+ * List of security issues fixed for V73 and the fixed version
+ * */
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V75 Security Issues:
+ * List of security issues fixed for V75 and the fixed version
+ * */
+// HTP Native library version values for V75
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V79 Security Issues:
+ * List of security issues fixed for V79 and the fixed version
+ * */
+// HTP Native library version values for V79
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V81 Security Issues:
+ * List of security issues fixed for V81 and the fixed version
+ * */
+// HTP Native library version values for V81
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+#endif // QNN_HTP_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h
new file mode 100755
index 0000000000000..8266817e2dc41
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h
@@ -0,0 +1,164 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All rights reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN HTP component Context API.
+ *
+ *        The interfaces in this file work with the top level QNN
+ *        API and supplement QnnContext.h for the HTP backend
+ */
+
+#ifndef QNN_HTP_CONTEXT_H
+#define QNN_HTP_CONTEXT_H
+
+#include "QnnContext.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief This enum provides different HTP context configuration
+ *        options associated with QnnContext
+ */
+typedef enum {
+  QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED            = 1,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_REGISTER_MULTI_CONTEXTS           = 2,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_FILE_READ_MEMORY_BUDGET           = 3,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_DSP_MEMORY_PROFILING_ENABLED      = 4,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_SHARE_RESOURCES                   = 5,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_IO_MEM_ESTIMATION                 = 6,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_PREPARE_ONLY                      = 7,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_INIT_ACCELERATION                 = 8,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_SKIP_VALIDATION_ON_BINARY_SECTION = 9,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_UNKNOWN                           = 0x7fffffff
+} QnnHtpContext_ConfigOption_t;
+
+typedef struct {
+  // Handle referring to the first context associated with a group. When a new
+  // group is to be registered, the following value must be 0.
+  Qnn_ContextHandle_t firstGroupHandle;
+  // Max spill-fill buffer to be allocated for the group of contexts, in bytes.
+  // The value that is passed during the registration of the first context to
+  // a group is taken. Subsequent configuration of this value is disregarded.
+  uint64_t maxSpillFillBuffer;
+} QnnHtpContext_GroupRegistration_t;
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/**
+ * @brief Structure describing the set of configurations supported by context.
+ *        Objects of this type are to be referenced through QnnContext_CustomConfig_t.
+ *
+ *        The struct has two fields - option and a union of config values.
+ *        Based on the option, the corresponding item in the union can be used to
+ *        specify the config.
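+ *
+ *        For example, enabling weight sharing (an illustrative sketch; whether
+ *        this option applies is target- and use-case-dependent):
+ * @code
+ *   QnnHtpContext_CustomConfig_t cfg = QNN_HTP_CONTEXT_CUSTOM_CONFIG_INIT;
+ *   cfg.option               = QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED;
+ *   cfg.weightSharingEnabled = true;
+ * @endcode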
+ *
+ *        Below is the Map between QnnHtpContext_CustomConfig_t and config value
+ *
+ * \verbatim embed:rst:leading-asterisk
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | #  | Config Option                                                    | Configuration Struct/value          |
+ * +====+==================================================================+=====================================+
+ * | 1  | QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED             | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 2  | QNN_HTP_CONTEXT_CONFIG_OPTION_REGISTER_MULTI_CONTEXTS            | QnnHtpContext_GroupRegistration_t   |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 3  | QNN_HTP_CONTEXT_CONFIG_OPTION_FILE_READ_MEMORY_BUDGET            | uint64_t                            |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 4  | QNN_HTP_CONTEXT_CONFIG_OPTION_DSP_MEMORY_PROFILING_ENABLED       | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 5  | QNN_HTP_CONTEXT_CONFIG_OPTION_SHARE_RESOURCES                    | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 6  | QNN_HTP_CONTEXT_CONFIG_OPTION_IO_MEM_ESTIMATION                  | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 7  | QNN_HTP_CONTEXT_CONFIG_OPTION_PREPARE_ONLY                       | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 8  | QNN_HTP_CONTEXT_CONFIG_OPTION_INIT_ACCELERATION                  | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 9  | QNN_HTP_CONTEXT_CONFIG_OPTION_SKIP_VALIDATION_ON_BINARY_SECTION  | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * \endverbatim
+ */
+typedef struct QnnHtpContext_CustomConfig {
+  QnnHtpContext_ConfigOption_t option;
+  union UNNAMED {
+    // This field sets weight sharing, which is false by default
+    bool weightSharingEnabled;
+    QnnHtpContext_GroupRegistration_t groupRegistration;
+    // - Init time may be impacted depending on the value set below
+    // - Value should be greater than 0 and less than or equal to the file size
+    // - If set to 0, the feature is not utilized
+    // - If set to greater than the file size, min(fileSize, fileReadMemoryBudgetInMb) is used
+    // - As an example, if value 2 is passed, it would translate to (2 * 1024 * 1024) bytes
+    uint64_t fileReadMemoryBudgetInMb;
+    bool dspMemoryProfilingEnabled;
+    // This field enables resource sharing across different contexts, enhancing RAM and virtual
+    // address (VA) space utilization. When this flag is activated, graphs are expected to execute
+    // sequentially. Note that this configuration option is only supported when using the
+    // QnnContext_createFromBinaryListAsync API.
+    bool shareResources;
+    // This field enables I/O memory estimation during the QnnContext_createFromBinary API when
+    // multiple PDs are available. When enabled, it estimates the total size of the I/O tensors
+    // required by the context to ensure sufficient space on the PD before deserialization.
+    // This feature helps with memory registration failures in large models.
+    // Note that enabling this feature increases peak RAM usage during the context initialization
+    // phase in QnnContext_createFromBinary, but sustained RAM remains unaffected.
+    bool ioMemEstimation;
+    // This field enables model preparation without mapping its content on the DSP side. It is
+    // useful when a model needs to be prepared on the device but executed through a serialized
+    // binary method. This prevents extra mapping onto the DSP VA space. Set this flag only when
+    // creating the context.
+    bool isPrepareOnly;
+    // This field enables initialization acceleration, which is disabled by default.
+    // If set to true, the DSP will utilize all hardware threads to accelerate deserialization.
+    // It is not recommended to execute graphs simultaneously, as this will significantly degrade
+    // performance.
+    // Note that this feature may not be effective for small graphs with a small number of ops.
+    bool initAcceleration;
+    // This field enables skipping the crc32 check in LoRA super adapter apply; it is disabled by
+    // default. If set to true, the crc32 check for non-base adapters in the super adapter apply
+    // use case will be skipped to improve time cost.
+    // Note that the base adapter in a super adapter never does a crc32 check; therefore, its
+    // apply time cost won't improve by turning this config option on.
+    bool skipValidationOnBinarySection;
+  };
+} QnnHtpContext_CustomConfig_t;
+
+/// QnnHtpContext_CustomConfig_t initializer macro
+#define QNN_HTP_CONTEXT_CUSTOM_CONFIG_INIT            \
+  {                                                   \
+    QNN_HTP_CONTEXT_CONFIG_OPTION_UNKNOWN, /*option*/ \
+    {                                                 \
+      false /*weightSharingEnabled*/                  \
+    }                                                 \
+  }
+
+// clang-format on
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h
new file mode 100755
index 0000000000000..e70c23577264b
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h
@@ -0,0 +1,178 @@
+//=============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTP Device components
+ *
+ *         This file defines structures and supplements QnnDevice.h for the QNN HTP device
+ */
+
+#pragma once
+
+#include "QnnCommon.h"
+#include "QnnDevice.h"
+#include "QnnHtpPerfInfrastructure.h"
+#include "QnnTypes.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * This is used to represent the HTP hardware architecture.
+ * Since QnnDevice only supports V68 or newer, using a legacy ARCH will result in an error.
+ */
+typedef enum {
+  QNN_HTP_DEVICE_ARCH_NONE    = 0,
+  QNN_HTP_DEVICE_ARCH_V68     = 68,
+  QNN_HTP_DEVICE_ARCH_V69     = 69,
+  QNN_HTP_DEVICE_ARCH_V73     = 73,
+  QNN_HTP_DEVICE_ARCH_V75     = 75,
+  QNN_HTP_DEVICE_ARCH_V79     = 79,
+  QNN_HTP_DEVICE_ARCH_V81     = 81,
+  QNN_HTP_DEVICE_ARCH_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_Arch_t;
+
+/**
+ * Data structure to configure a device to set the minimum HTP arch;
+ * the driver will use ops compatible with this HTP arch.
+ */
+typedef struct {
+  uint32_t deviceId;
+  QnnHtpDevice_Arch_t arch;
+} QnnHtpDevice_Minimum_Arch_t;
+
+/**
+ * Data structure to configure a device to run in the signed/unsigned domain.
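+ *
+ * Illustrative sketch (deviceId 0 is an assumption for a single-device setup):
+ * @code
+ *   QnnHtpDevice_UseSignedProcessDomain_t signedPd;
+ *   signedPd.deviceId               = 0;
+ *   signedPd.useSignedProcessDomain = true;  // request a signed PD
+ * @endcode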
+ */
+typedef struct {
+  uint32_t deviceId;
+  bool useSignedProcessDomain;
+} QnnHtpDevice_UseSignedProcessDomain_t;
+
+typedef void* QnnHtpDevice_UseCustomSetting_t;
+
+/**
+ * Enum listing the available custom configurations.
+ */
+typedef enum {
+  QNN_HTP_DEVICE_CONFIG_OPTION_SOC      = 0,
+  QNN_HTP_DEVICE_CONFIG_OPTION_ARCH     = 1,
+  QNN_HTP_DEVICE_CONFIG_OPTION_SIGNEDPD = 2,
+  QNN_HTP_DEVICE_CONFIG_OPTION_CUSTOM   = 3,
+  QNN_HTP_DEVICE_CONFIG_OPTION_RESERVED = 0x7fff0000,
+  QNN_HTP_DEVICE_CONFIG_OPTION_UNKNOWN  = 0x7fffffff
+} QnnHtpDevice_ConfigOption_t;
+
+/**
+ * Data structure for custom configuration.
+ */
+typedef struct {
+  QnnHtpDevice_ConfigOption_t option;
+  union UNNAMED {
+    // This field sets the SoC model
+    uint32_t socModel;
+    // This field updates the minimum HTP arch
+    QnnHtpDevice_Minimum_Arch_t arch;
+    // This structure is used to enable/disable the signed/unsigned PD
+    QnnHtpDevice_UseSignedProcessDomain_t useSignedProcessDomain;
+    // This structure is used to enable a custom setting
+    QnnHtpDevice_UseCustomSetting_t useCustomSetting;
+    // Reserved for internal purposes
+    void* reserved;
+  };
+} QnnHtpDevice_CustomConfig_t;
+
+// For deviceType in QnnDevice_HardwareDeviceInfoV1_t
+typedef enum {
+  QNN_HTP_DEVICE_TYPE_ON_CHIP = 0, // HTP cores are inside the SoC
+  QNN_HTP_DEVICE_TYPE_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_DeviceType_t;
+
+/**
+ * @brief QNN HTP Device core type
+ *        This enumeration provides information about the core type inside the SOC.
+ *
+ *        For online operation, the caller should retrieve this information from
+ *        `QnnDevice_getPlatformInfo`. For offline operation, the caller needs to create a
+ *        `QnnDevice_CoreInfo_t` with the correct core type, and then use it to create the
+ *        `QnnDevice_PlatformInfo_t`.
+ */
+typedef enum {
+  QNN_HTP_CORE_TYPE_NSP   = 0,
+  QNN_HTP_CORE_TYPE_HPASS = 1,
+
+  // supported coreType values are < QNN_CORE_TYPE_MAX
+  QNN_HTP_CORE_TYPE_MAX,
+  QNN_HTP_CORE_TYPE_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_CoreType_t;
+
+/**
+ * This structure provides info about the NSP device inside the SoC.
+ * For online operation, the caller should get this info from QnnDevice_getPlatformInfo.
+ * For offline operation, the caller needs to create this structure and fill in the correct
+ * information for QnnDevice_create.
+ */
+typedef struct {
+  size_t vtcmSize;           // The VTCM size for this device in megabytes;
+                             // the user cannot request a VTCM size exceeding this value
+  uint32_t socModel;         // An enum value defined in the QNN header that represents the SoC model
+  bool signedPdSupport;      // This field is true if the device supports signed PD
+  bool dlbcSupport;          // This field is true if the device supports DLBC
+  QnnHtpDevice_Arch_t arch;  // This field shows the architecture of this device
+} QnnHtpDevice_OnChipDeviceInfoExtension_t;
+
+/**
+ * This structure is used in QnnDevice_HardwareDeviceInfoV1_t;
+ * QnnDevice_getPlatformInfo uses this structure to list the supported device features/info.
+ */
+typedef struct _QnnDevice_DeviceInfoExtension_t {
+  QnnHtpDevice_DeviceType_t devType;
+  union UNNAMED {
+    QnnHtpDevice_OnChipDeviceInfoExtension_t onChipDevice;
+  };
+} QnnHtpDevice_DeviceInfoExtension_t;
+
+/**
+ * @brief QNN HTP Device PerfInfrastructure specialization structure.
+ *        Objects of this type are to be referenced through QnnDevice_getInfrastructure.
+ *
+ *        Contains function pointers for each interface method for
+ *        Htp PerfInfrastructure.
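+ *
+ *        A hedged access sketch (assumes the infrastructure pointer was
+ *        obtained via QnnDevice_getInfrastructure; error handling omitted):
+ * @code
+ *   const QnnHtpDevice_Infrastructure_t* infra = deviceInfra;  // assumed handle
+ *   if (infra->infraType == QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) {
+ *     // infra->perfInfra.createPowerConfigId / setPowerConfig / ... may be used
+ *   }
+ * @endcode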
+ */
+typedef struct {
+  QnnHtpPerfInfrastructure_CreatePowerConfigIdFn_t createPowerConfigId;
+  QnnHtpPerfInfrastructure_DestroyPowerConfigIdFn_t destroyPowerConfigId;
+  QnnHtpPerfInfrastructure_SetPowerConfigFn_t setPowerConfig;
+  QnnHtpPerfInfrastructure_SetMemoryConfigFn_t setMemoryConfig;
+} QnnHtpDevice_PerfInfrastructure_t;
+
+/// QnnHtpDevice_PerfInfrastructure_t initializer macro
+#define QNN_HTP_DEVICE_PERF_INFRASTRUCTURE_INIT \
+  {                                             \
+    NULL, /*createPowerConfigId*/               \
+    NULL, /*destroyPowerConfigId*/              \
+    NULL, /*setPowerConfig*/                    \
+    NULL  /*setMemoryConfig*/                   \
+  }
+
+typedef enum {
+  QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF    = 0,
+  QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_InfrastructureType_t;
+
+typedef struct _QnnDevice_Infrastructure_t {
+  QnnHtpDevice_InfrastructureType_t infraType;
+  union UNNAMED {
+    QnnHtpDevice_PerfInfrastructure_t perfInfra;
+  };
+} QnnHtpDevice_Infrastructure_t;
+
+// clang-format on
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h
new file mode 100755
index 0000000000000..f7e49e9fb8bc3
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h
@@ -0,0 +1,299 @@
+//=============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/**
+ * @file
+ * @brief QNN HTP component Graph API.
+ *
+ *        The interfaces in this file work with the top level QNN
+ *        API and supplement QnnGraph.h for the HTP backend
+ */
+
+#ifndef QNN_HTP_GRAPH_H
+#define QNN_HTP_GRAPH_H
+
+#include "QnnGraph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnHtpGraph config value macro. Indicates that the maximum
+ *        available amount of the resource should be used.
+ *
+ *        Currently only applicable for QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE.
+ */
+#define QNN_HTP_GRAPH_CONFIG_OPTION_MAX 0
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief This enum provides different HTP graph optimization
+ *        options that can be used to finalize the graph
+ *        for optimum performance.
+ */
+typedef enum {
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD                = 1,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES                  = 2,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG        = 3,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC                       = 4,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC_WEIGHTS               = 5,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SPARSE_WEIGHTS_COMPRESSION = 6,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR              = 7,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN                           = 0x7fffffff
+} QnnHtpGraph_OptimizationType_t;
+
+// clang-format off
+
+/**
+ * @brief Struct describing the set of optimization types
+ *        and the values associated with each optimization type.
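+ *
+ *        For example (an illustrative sketch; the flag value is an assumption
+ *        and must match the optimization strategies documented for the HTP
+ *        backend):
+ * @code
+ *   QnnHtpGraph_OptimizationOption_t opt = QNN_HTP_GRAPH_OPTIMIZATION_OPTION_INIT;
+ *   opt.type       = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+ *   opt.floatValue = 3.0f;  // assumed example value
+ * @endcode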
+ * + * Below is the Map between QnnHtpGraph_OptimizationType_t and allowable values: + * + * \verbatim embed:rst:leading-asterisk + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | # | OptimizationType option | Allowable values | + * +====+====================================================================+=====================================================================+ + * | 1 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 2 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 3 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG | Defines the optimization strategy used by the HTP backend | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 4 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 5 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC_WEIGHTS | Enables DLBC weights compression | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 6 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SPARSE_WEIGHTS_COMPRESSION | Enables Weight Sparsity Compression | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 7 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR | Enables System Level Cache Allocator usage | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnHtpGraph_OptimizationType_t type; + float floatValue; +} QnnHtpGraph_OptimizationOption_t; + +/** + * @brief This struct encapsulates all the VTCM configurations for parallel graph execution. + * + * @code + * |<-- (1) 8MB Total Hardware VTCM -->| + * |<-- (2) 7MB Addressable -->| + * +------+------+------+------+------+------+------+------+ + * | CV | | | | | | | | + * +------+------+------+------+------+------+------+------+ + * |<-- (4) Graph A -->|<-- (4) Graph B -->| + * + * A |> 0 MB (3) Graph Offset + * B |-------------------> 3 MB + * @endcode + */ +typedef struct { + /// (4) above, the amount of VTCM used by a graph + uint32_t sizeInBytes; + /// (3) above, where in the addressable region to start VTCM. + /// Note: (3) + (4) <= (2) + uint32_t offsetInBytes; + /// (2) Addressable portion of VTCM. + /// Set to less than hardware size so Graph(s) can coexist with other VTCM clients. + uint32_t sizeTotalInBytes; + + // For ABI compatibility in the future. + // Set to 0 for now. + uint32_t reserved[3]; +} QnnHtpGraph_VtcmConfig_t; + +/** + * @brief This enum defines whether graph concurrency (i.e. multiple graphs running concurrently) + * is possible, and how to behave when circumstances for concurrency aren't possible. 
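+ *
+ *        Concurrent graphs partition VTCM through QnnHtpGraph_VtcmConfig_t
+ *        (defined above). As an illustrative sketch (the sizes are assumptions,
+ *        not SDK recommendations), "Graph B" from the diagram above could use:
+ * @code
+ *   QnnHtpGraph_VtcmConfig_t vtcm;
+ *   vtcm.sizeInBytes      = 3u * 1024u * 1024u;  // (4) this graph's VTCM share
+ *   vtcm.offsetInBytes    = 3u * 1024u * 1024u;  // (3) start after Graph A
+ *   vtcm.sizeTotalInBytes = 7u * 1024u * 1024u;  // (2) addressable VTCM
+ *   vtcm.reserved[0] = vtcm.reserved[1] = vtcm.reserved[2] = 0u;
+ * @endcode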
+ */
+typedef enum {
+  /// This graph will not be able to run concurrently with other graphs.
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_NONE    = 0,
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_DEFAULT = QNN_HTP_GRAPH_CONCURRENCY_OPTION_NONE,
+  /// Graph will try to run concurrently, sharing all resources on the DSP (VTCM, HMX, HVX, etc).
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_ALL_SHARED = 1,
+  // Unused, present to ensure 32 bits.
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_UNKNOWN = 0x7fffffff
+} QnnHtpGraph_ConcurrencyOption_t;
+
+/**
+ * @brief This struct encapsulates all the configurations for parallel graph execution.
+ */
+typedef struct {
+  QnnHtpGraph_ConcurrencyOption_t concurrency;
+  QnnHtpGraph_VtcmConfig_t vtcmConfig;
+
+  // For ABI compatibility in the future.
+  // Set to 0 for now.
+  uint32_t reserved[4];
+} QnnHtpGraph_ParallelGraphExecutionConfig_t;
+/// The settings in this struct are only applicable
+/// for DSP architectures >= V81.
+/// Using them on other SoCs will return an error.
+///
+/// Values will be defaulted to their SoC's TURBO frequency
+/// (SoC as identified by Qnn_DeviceHandle_t).
+///
+/// On automotive SDKs, HMX OP Bounding will be enabled by default.
+///
+/// On non-automotive SDKs, using this setting will enable
+/// HMX OP Bounding. It is off by default.
+typedef struct QnnHtp_HmxBoundingInfo {
+  /// Target HMX freq in Hz.
+  /// Can be derived from sysMonApp (HexagonSDK) or QProfiler.
+  float targetHmxFreqHz;
+  /// Target DSP Core freq in Hz.
+  /// Can be derived from sysMonApp (HexagonSDK) or QProfiler.
+  float targetDspCoreFreq;
+} QnnHtp_HmxBoundingInfo_t;
+
+/// QnnHtpGraph_OptimizationOption_t initializer macro
+#define QNN_HTP_GRAPH_OPTIMIZATION_OPTION_INIT              \
+  {                                                         \
+    QNN_HTP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/       \
+    0.0f                                     /*floatValue*/ \
+  }
+// clang-format on
+
+/**
+ * @brief This enum provides different HTP graph configuration
+ *        options associated with QnnGraph
+ */
+typedef enum {
+  QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION                       = 1,
+  QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION                          = 2,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB                    = 3,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB,
+  QNN_HTP_GRAPH_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 4,
+  QNN_HTP_GRAPH_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF        = 5,
+  QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS                    = 6,
+  QNN_HTP_GRAPH_CONFIG_OPTION_FINALIZE_CONFIG                    = 7,
+  QNN_HTP_GRAPH_CONFIG_OPTION_NUM_CORES                          = 8,
+  QNN_HTP_GRAPH_CONFIG_OPTION_PARALLEL_GRAPH_EXECUTION_CONFIG    = 9,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_BYTES                 = 10,
+  QNN_HTP_GRAPH_CONFIG_OPTION_HMX_BOUNDING                       = 11,
+  QNN_HTP_GRAPH_CONFIG_OPTION_WEIGHTS_PACKING                    = 12,
+  QNN_HTP_GRAPH_CONFIG_OPTION_ASSUME_SAME_QUANT                  = 13,
+  QNN_HTP_GRAPH_CONFIG_OPTION_RESERVED                           = 0x7fff0000,
+  QNN_HTP_GRAPH_CONFIG_OPTION_UNKNOWN                            = 0x7fffffff
+} QnnHtpGraph_ConfigOption_t;
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+/**
+ * @brief A struct for different config parameters in a key value format.
+ */
+typedef struct {
+  const char* key;
+  Qnn_Scalar_t value;
+} QnnHtpGraph_FinalizeConfig_t;
+
+/**
+ * @brief Structure describing the set of configurations supported by graph.
+ * Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ *
+ * The struct has two fields: option and a union of the corresponding config values.
+ * Based on the option, the corresponding item in the union can be used to specify
+ * the config.
+ *
+ * Below is the Map between QnnHtpGraph_ConfigOption_t and config value
+ *
+ * \verbatim embed:rst:leading-asterisk
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | #  | Config Option                                                                       | Configuration Struct/value                     |
+ * +====+=====================================================================================+================================================+
+ * | 1  | QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION                                            | QnnHtpGraph_OptimizationOption_t               |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 2  | QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION                                               | Qnn_Precision_t                                |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 3  | QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB/QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE   | uint32_t                                       |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 4  | QNN_HTP_GRAPH_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF                      | bool                                           |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 5  | QNN_HTP_GRAPH_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF                             | bool                                           |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 6  | QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS                                         | uint32_t                                       |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 7  | QNN_HTP_GRAPH_CONFIG_OPTION_FINALIZE_CONFIG                                         | QnnHtpGraph_FinalizeConfig_t                   |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 8  | QNN_HTP_GRAPH_CONFIG_OPTION_NUM_CORES                                               | uint32_t                                       |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 9  | QNN_HTP_GRAPH_CONFIG_OPTION_PARALLEL_GRAPH_EXECUTION_CONFIG                         | QnnHtpGraph_ParallelGraphExecutionConfig_t     |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 10 | QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_BYTES                                      | uint32_t                                       |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 11 | QNN_HTP_GRAPH_CONFIG_OPTION_HMX_BOUNDING                                            | uint32_t                                       |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 12 | QNN_HTP_GRAPH_CONFIG_OPTION_WEIGHTS_PACKING                                         | bool                                           |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 13 | QNN_HTP_GRAPH_CONFIG_OPTION_ASSUME_SAME_QUANT                                       | bool                                           |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * +-------------------------+----------------------------------------------------------------+------------------------------------------------+
+ * | 0x7fff0000 - 0x7ffffffe | QNN_HTP_GRAPH_CONFIG_OPTION_RESERVED                           | These are reserved for internal purposes       |
+ * +-------------------------+----------------------------------------------------------------+------------------------------------------------+
+ * \endverbatim
+ *
+ * NOTE: Option #6 (i.e. QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS) can only be
+ *       set prior to the first execution of the graph. If the user changes it
+ *       after the first execution, subsequent executions will not pick up the
+ *       updated value.
+ */
+typedef struct {
+  QnnHtpGraph_ConfigOption_t option;
+  union {
+    QnnHtpGraph_OptimizationOption_t optimizationOption;
+    Qnn_Precision_t precision;
+    uint32_t vtcmSizeInMB;
+    bool foldReluActivationIntoConvOff;
+    bool shortDepthConvOnHmxOff;
+    uint64_t numHvxThreads;
+    void* reserved;
+    QnnHtpGraph_FinalizeConfig_t finalizeConfig;
+    uint32_t numCores;
+    QnnHtpGraph_ParallelGraphExecutionConfig_t parallelGraphExecutionConfig;
+    uint32_t vtcmSizeInBytes;
+    QnnHtp_HmxBoundingInfo_t hmxBoundingInfo;
+    bool weightsPacking;
+    bool assumeSameQuant;
+  };
+} QnnHtpGraph_CustomConfig_t;
+
+// clang-format on
+/// QnnHtpGraph_CustomConfig_t initializer macro
+#define QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT                            \
+  {                                                                 \
+    QNN_HTP_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/                 \
+    {                                                               \
+      QNN_HTP_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \
+    }                                                               \
+  }
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h
new file mode 100755
index 0000000000000..adc9ef2c52504
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h
@@ -0,0 +1,85 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef QNN_HTP_MEMORY_INFRASTRUCTURE_2_H
+#define QNN_HTP_MEMORY_INFRASTRUCTURE_2_H
+
+#include "QnnCommon.h"
+
+/**
+ *  @file
+ *  @brief QNN HTP Memory Infrastructure component API.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// VTCM
+//=============================================================================
+
+// clang-format off
+
+/**
+ * @brief Raw memory address that exists ONLY on the QURT
+ *        side.
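+ *        A QURT address is only meaningful to code running on the DSP;
+ *        host-side code should treat it as an opaque 32-bit value rather
+ *        than something it can dereference.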
+ */ +typedef uint32_t QnnHtpMem_QurtAddress_t; + +/** + * @brief Configuration for custom shared buffer memory type + * This shared buffer is a contiguous chunk of memory identified + * by a single file descriptor which will be used by multiple tensors + * based on the offset provided + * Each QnnMem_register call with different offset will return a + * unique memory handle + */ +typedef struct { + // File descriptor for memory, must be set to QNN_MEM_INVALID_FD if not applicable + int32_t fd; + // Offset to be used in contiguous shared buffer + uint64_t offset; +} QnnHtpMem_SharedBufferConfig_t; + +// clang-format off + +/** + * @brief QNN Memory Type + */ +typedef enum { + QNN_HTP_MEM_QURT = 0, + QNN_HTP_MEM_SHARED_BUFFER = 1, + QNN_HTP_MEM_UNDEFINED = 0x7FFFFFFF +} QnnHtpMem_Type_t; + +// clang-format off + +/** + * @brief descriptor used for the QNN API + */ +typedef struct { + // Memory type identified by QnnHtpMem_Type_t + QnnHtpMem_Type_t type; + // Total size of the buffer + // For memory type QURT, it would be size of a tensor + // For memory type SHARED BUFFER, it would be the total size of the buffer + uint64_t size; + + union { + QnnHtpMem_QurtAddress_t qurtAddress; + QnnHtpMem_SharedBufferConfig_t sharedBufferConfig; + }; +} QnnMemHtp_Descriptor_t; + +// clang-format on +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h new file mode 100755 index 0000000000000..f92317ac94bf2 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h @@ -0,0 +1,511 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** @file + * @brief QNN HTP component Performance Infrastructure API + * + * Provides interface to the client to control performance and system + * settings of the QNN HTP Accelerator + */ + +#ifndef QNN_HTP_PERF_INFRASTRUCTURE_H +#define QNN_HTP_PERF_INFRASTRUCTURE_H + +#include "QnnCommon.h" +#include "QnnTypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// max rpc polling time allowed - 9999 us +#define QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME 9999 + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief QNN HTP PerfInfrastructure API result / error codes. 
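+ *
+ *        All functions declared later in this file report status through
+ *        these codes via Qnn_ErrorHandle_t. A minimal checking sketch
+ *        (createPowerConfigId stands in for any of those function pointers;
+ *        it is an assumed local variable, not an SDK symbol):
+ *
+ * @code
+ * Qnn_ErrorHandle_t err = createPowerConfigId(deviceId, coreId, &powerConfigId);
+ * if (err != QNN_HTP_PERF_INFRASTRUCTURE_NO_ERROR) {
+ *   // inspect err against the QnnHtpPerfInfrastructure_Error_t values below
+ * }
+ * @endcode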
+ * + */ +typedef enum { + QNN_HTP_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE, + //////////////////////////////////////////////////////////////////////// + + QNN_HTP_PERF_INFRASTRUCTURE_NO_ERROR = QNN_SUCCESS, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 4, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_MEM_ALLOC = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 5, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_FAILED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 6, + + //////////////////////////////////////////////////////////////////////// + QNN_HTP_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE, + /// UNDEFINED value that must not be used by client + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNDEFINED = 0x7fffffff +} QnnHtpPerfInfrastructure_Error_t; + +/** + * @brief Allows client to consider (non-zero value) DCVS enable/disable + * and option parameters, otherwise (zero value) + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SetDcvsEnable_t; + +/** + * @brief Allows client to start (non-zero value) or stop (zero value) + * participating in DCVS + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_DcvsEnable_t; + +/** + * @brief Allows client to consider (non-zero value) latency parameter, + * otherwise (zero value) + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SetSleepLatency_t; + +/** + * @brief Allows client to set up the sleep latency in microseconds + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SleepLatency_t; + +/** + * @brief Allows client to consider (non-zero value) sleep disable + * parameter, otherwise (zero value) + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SetSleepDisable_t; + +/** + * @brief Allows client to disable sleep or low power modes. + * Pass a non-zero value to disable sleep in HTP + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SleepDisable_t; + +/** + * @brief Allows client to consider (non-zero value) bus clock + * params, otherwise (zero value) + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SetBusParams_t; + +/** + * @brief Allows client consider (non-zero value) core clock + * params, otherwise (zero value) + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SetCoreParams_t; + +/** + * @brief Allows client to set up the RPC control latency in microseconds + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_RpcControlLatency_t; + +/** + * @brief Allows client to set up the RPC polling time in microseconds + */ +typedef uint32_t QnnHtpPerfInfrastructure_RpcPollingTime_t; + +/** + * @brief Allows client to set up the adaptive polling time in microseconds + */ +typedef uint32_t QnnHtpPerfInfrastructure_AdaptivePollingTime_t; + +/** + * @brief Allows client to set up the HMX timeout interval in microseconds + */ +typedef uint32_t QnnHtpPerfInfrastructure_HmxTimeoutIntervalUs_t; + +/** + * @brief sets the minimum size by which user heap should grow + * when heap is exhausted. 
This API is expected to be
+ *        called only once per backend and has a process-wide impact.
+ *
+ *        The grow size is provided in bytes and defaults to 16MB.
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_MemGrowSize_t;
+
+/**
+ * @brief Allows client to set default values for HMX frequency.
+ *        If enabled (1), the HMX vote will scale with the DCVS corner;
+ *        if 0, the HMX vote needs to be specified manually.
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_HmxDefault_Vote_t;
+
+/**
+ * @brief Perf modes to specify the clock frequency level within the
+ *        target voltage corner; currently applies only to the HMX config.
+ */
+typedef enum {
+  // To select max frequency at target voltage corner.
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_HIGH = 0,
+  // To select min frequency at target voltage corner.
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_LOW,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_ClkPerfMode_t;
+
+/**
+ * @brief These are the different voltage corners that can
+ *        be requested by the client to influence the voting scheme
+ *        for DCVS
+ *
+ */
+typedef enum {
+  /// Maps to HAP_DCVS_VCORNER_DISABLE.
+  /// Disable setting up voltage corner
+  DCVS_VOLTAGE_CORNER_DISABLE = 0x10,
+  /// Maps to HAP_DCVS_VCORNER_SVS2.
+  /// Set voltage corner to minimum value supported on platform
+  DCVS_VOLTAGE_VCORNER_MIN_VOLTAGE_CORNER = 0x20,
+  /// Maps to HAP_DCVS_VCORNER_SVS2.
+  /// Set voltage corner to SVS2 value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS2 = 0x30,
+  /// Maps to HAP_DCVS_VCORNER_SVS.
+  /// Set voltage corner to SVS value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS = 0x40,
+  /// Maps to HAP_DCVS_VCORNER_SVS_PLUS.
+  /// Set voltage corner to SVS_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS_PLUS = 0x50,
+  /// Maps to HAP_DCVS_VCORNER_NOM.
+  /// Set voltage corner to NOMINAL value for the platform
+  DCVS_VOLTAGE_VCORNER_NOM = 0x60,
+  /// Maps to HAP_DCVS_VCORNER_NOM_PLUS.
+  /// Set voltage corner to NOMINAL_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_NOM_PLUS = 0x70,
+  /// Maps to HAP_DCVS_VCORNER_TURBO.
+  /// Set voltage corner to TURBO value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO = 0x80,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_PLUS.
+  /// Set voltage corner to TURBO_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_PLUS = 0x90,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_L2.
+  /// Set voltage corner to TURBO_L2 value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_L2 = 0x92,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_L3.
+  /// Set voltage corner to TURBO_L3 value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_L3 = 0x93,
+  /// Maps to HAP_DCVS_VCORNER_MAX.
+  /// Set voltage corner to maximum value supported on the platform
+  DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER = 0xA0,
+  /// UNKNOWN value that must not be used by client
+  DCVS_VOLTAGE_VCORNER_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_VoltageCorner_t;
+
+/**
+ * @brief These are the expanded voltage corners that can
+ *        be requested by the client to influence the voting scheme
+ *        for DCVS
+ *
+ */
+typedef enum {
+  /// Maps to HAP_DCVS_EXP_VCORNER_DISABLE.
+  /// Disable setting up voltage corner
+  DCVS_EXP_VCORNER_DISABLE = 0,
+  /// Maps to HAP_DCVS_EXP_VCORNER_MIN.
+  /// Set voltage corner to minimum value supported on platform
+  DCVS_EXP_VCORNER_MIN = 0x100,
+  /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS_D2.
+  /// Set voltage corner to LOWSVS_D2 value for the platform
+  DCVS_EXP_VCORNER_LOW_SVS_D2 = 0x134,
+  /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS_D1.
+ /// Set voltage corner to LOWSVS_D1 value for the platform + DCVS_EXP_VCORNER_LOW_SVS_D1 = 0x138, + /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS. + /// Set voltage corner to LOWSVS value for the platform + DCVS_EXP_VCORNER_LOW_SVS = 0x140, + /// Maps to HAP_DCVS_EXP_VCORNER_SVS. + /// Set voltage corner to SVS value for the platform + DCVS_EXP_VCORNER_SVS = 0x180, + /// Maps to HAP_DCVS_EXP_VCORNER_SVS_L1. + /// Set voltage corner to SVS_L1 value for the platform + DCVS_EXP_VCORNER_SVS_L1 = 0x1C0, + /// Maps to HAP_DCVS_EXP_VCORNER_NOM. + /// Set voltage corner to NOM value for the platform + DCVS_EXP_VCORNER_NOM = 0x200, + /// Maps to HAP_DCVS_EXP_VCORNER_NOM_L1. + /// Set voltage corner to NOM_L1 value for the platform + DCVS_EXP_VCORNER_NOM_L1 = 0x240, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR. + /// Set voltage corner to TURBO value for the platform + DCVS_EXP_VCORNER_TUR = 0x280, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L1. + /// Set voltage corner to TURBO_L1 value for the platform + DCVS_EXP_VCORNER_TUR_L1 = 0x2A0, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L2. + /// Set voltage corner to TURBO_L2 value for the platform + DCVS_EXP_VCORNER_TUR_L2 = 0x2B0, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L3. + /// Set voltage corner to TURBO_L3 value for the platform + DCVS_EXP_VCORNER_TUR_L3 = 0x2C0, + /// Maps to HAP_DCVS_EXP_VCORNER_MAX. + /// Selects the maximum voltage corner defined for the chipset + DCVS_EXP_VCORNER_MAX = 0xFFFF, + /// UNKNOWN value that must not be used by client + DCVS_EXP_VCORNER_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_ExpVoltageCorner_t; + +/** + * @brief This enum defines all the possible power mode + * that a client can set to influence DCVS mode + */ +typedef enum { + /// Maps to HAP_DCVS_V2_ADJUST_UP_DOWN. + /// Allows for DCVS to adjust up and down + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN = 0x1, + /// Maps to HAP_DCVS_V2_ADJUST_ONLY_UP. + /// Allows for DCVS to adjust up only + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_ONLY_UP = 0x2, + /// Maps to HAP_DCVS_V2_POWER_SAVER_MODE. + /// Higher thresholds for power efficiency + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_MODE = 0x4, + /// Maps to HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE. + /// Higher thresholds for power efficiency with faster ramp down + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_AGGRESSIVE_MODE = 0x8, + /// Maps to HAP_DCVS_V2_PERFORMANCE_MODE. + /// Lower thresholds for maximum performance + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE = 0x10, + /// Maps to HAP_DCVS_V2_DUTY_CYCLE_MODE. + /// The below value applies only for HVX clients: + /// - For streaming class clients: + /// - detects periodicity based on HVX usage + /// - lowers clocks in the no HVX activity region of each period. + /// - For compute class clients: + /// - Lowers clocks on no HVX activity detects and brings clocks up on detecting HVX activity + /// again. + /// - Latency involved in bringing up the clock will be at max 1 to 2 ms. 
+  QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_DUTY_CYCLE_MODE = 0x20,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_PowerMode_t;
+
+/**
+ * @brief This struct provides performance infrastructure configuration
+ *        associated with setting up DCVS v3, which allows selecting the
+ *        bus and core operating corners separately
+ */
+typedef struct {
+  uint32_t contextId;
+  QnnHtpPerfInfrastructure_SetDcvsEnable_t setDcvsEnable;
+  QnnHtpPerfInfrastructure_DcvsEnable_t dcvsEnable;
+  QnnHtpPerfInfrastructure_PowerMode_t powerMode;
+  QnnHtpPerfInfrastructure_SetSleepLatency_t setSleepLatency;
+  QnnHtpPerfInfrastructure_SleepLatency_t sleepLatency;
+  QnnHtpPerfInfrastructure_SetSleepDisable_t setSleepDisable;
+  QnnHtpPerfInfrastructure_SleepDisable_t sleepDisable;
+  QnnHtpPerfInfrastructure_SetBusParams_t setBusParams;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerMin;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerMax;
+  QnnHtpPerfInfrastructure_SetCoreParams_t setCoreParams;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerMin;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerMax;
+} QnnHtpPerfInfrastructure_DcvsV3_t;
+
+/**
+ * @brief This struct provides performance infrastructure configuration
+ *        associated with setting up HMX v2, which allows selecting the
+ *        HMX corner separately. If hmxPickDefault is 1, all voltage-corner
+ *        params will be ignored. Be sure to use the same contextId as used
+ *        for the DCVS vote.
+ */
+typedef struct {
+  QnnHtpPerfInfrastructure_HmxDefault_Vote_t hmxPickDefault;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerMin;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerMax;
+  QnnHtpPerfInfrastructure_ClkPerfMode_t hmxPerfMode;
+} QnnHtpPerfInfrastructure_HmxV2_t;
+
+/**
+ * @brief This enum defines all the possible performance
+ *        options in Htp Performance Infrastructure that
+ *        relate to setting up power levels
+ */
+typedef enum {
+  /// config enum implies the usage of Dcvs v3
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3 = 1,
+  /// config enum implies the usage of rpcControlLatencyConfig struct
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY = 2,
+  /// config enum implies the usage of rpcPollingTimeConfig struct
+  /// this config is only supported on V69 and later
+  /// if enabled, this config is applied to entire process
+  /// max allowed is QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME us
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME = 3,
+  /// config HMX timeout interval in us. The HMX is turned off once the set
+  /// interval elapses with no interaction with it after an inference is finished.
+ QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_TIMEOUT_INTERVAL_US = 4, + /// config HMX V2 voting parameters only on supported chips + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_V2 = 5, + /// config enum implies the usage of adaptivePollingTime struct + /// this config can only be enabled in the RPC polling mode + /// if enabled, this config is applied to the entire process + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_ADAPTIVE_POLLING_TIME = 6, + /// UNKNOWN config option which must not be used + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_PowerConfigOption_t; + +/** + * @brief This struct provides performance infrastructure configuration + * associated with setting up of power levels + */ +typedef struct { + QnnHtpPerfInfrastructure_PowerConfigOption_t option; + union UNNAMED { + QnnHtpPerfInfrastructure_DcvsV3_t dcvsV3Config; + QnnHtpPerfInfrastructure_RpcControlLatency_t rpcControlLatencyConfig; + QnnHtpPerfInfrastructure_RpcPollingTime_t rpcPollingTimeConfig; + QnnHtpPerfInfrastructure_HmxTimeoutIntervalUs_t hmxTimeoutIntervalUsConfig; + QnnHtpPerfInfrastructure_HmxV2_t hmxV2Config; + QnnHtpPerfInfrastructure_AdaptivePollingTime_t adaptivePollingTimeConfig; + }; +} QnnHtpPerfInfrastructure_PowerConfig_t; + +/// QnnHtpPerfInfrastructure_PowerConfig_t initializer macro +#define QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT \ + { \ + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*dcvsV3Config*/ \ + } \ + } + +/** + * @brief This enum defines all the possible performance + * options in Htp Performance Infrastructure that + * relate to system memory settings + */ +typedef enum { + /// sets memory grow size + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE = 1, + /// UNKNOWN config option that must not be used + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_MemoryConfigOption_t; + +/** + * @brief Provides performance infrastructure configuration + * options that are memory specific + */ +typedef struct { + QnnHtpPerfInfrastructure_MemoryConfigOption_t option; + union UNNAMED { + QnnHtpPerfInfrastructure_MemGrowSize_t memGrowSizeConfig; + }; +} QnnHtpPerfInfrastructure_MemoryConfig_t; + +/// QnnHtpPerfInfrastructure_MemoryConfig_t initializer macro +#define QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT \ + { \ + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*memGrowSizeConfig*/ \ + } \ + } + +//============================================================================= +// API Methods +//============================================================================= + +/** + * @brief This API allows client to create power configuration id that + * has to be used to set different performance modes. + * Power configuration id has to be destroyed by client when not needed. + * + * @param[in] deviceId Hardware Device on which this config id needs to be created. + * + * @param[in] coreId Core/NSP on which this config id needs to be created. + * + * @param[out] powerConfigId Pointer to power configuration id to be created. 
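+ *
+ * A typical sequence, sketched under the assumption that createPowerConfigId
+ * and setPowerConfig are local variables holding function pointers of the
+ * QnnHtpPerfInfrastructure_CreatePowerConfigIdFn_t and
+ * QnnHtpPerfInfrastructure_SetPowerConfigFn_t types obtained from the HTP
+ * device infrastructure (error checks omitted):
+ *
+ * @code
+ * uint32_t powerConfigId = 0;
+ * createPowerConfigId(deviceId, coreId, &powerConfigId);
+ *
+ * QnnHtpPerfInfrastructure_PowerConfig_t cfg = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
+ * cfg.option                     = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
+ * cfg.dcvsV3Config.contextId     = powerConfigId;
+ * cfg.dcvsV3Config.setDcvsEnable = 1;
+ * cfg.dcvsV3Config.dcvsEnable    = 0;  // opt out of DCVS scaling
+ * cfg.dcvsV3Config.powerMode     = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
+ *
+ * const QnnHtpPerfInfrastructure_PowerConfig_t *configs[] = {&cfg, NULL};
+ * setPowerConfig(powerConfigId, configs);
+ * @endcode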
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if deviceId/coreId
+ *            or power configuration id is NULL
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_CreatePowerConfigIdFn_t)(
+    uint32_t deviceId, uint32_t coreId, uint32_t* powerConfigId);
+
+/**
+ * @brief This API allows client to destroy power configuration id.
+ *
+ * @param[in] powerConfigId A power configuration id to be destroyed.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ *            id does not exist
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_DestroyPowerConfigIdFn_t)(
+    uint32_t powerConfigId);
+
+/**
+ * @brief This API allows client to set up system power configuration that
+ *        will enable different performance modes. This API uses the
+ *        HAP_power_dcvs_v3_payload struct to configure HAP power parameters.
+ *        For a detailed description of the HAP power parameters, please refer
+ *        to the Hexagon SDK HAP_power_dcvs_v3_payload documentation.
+ *
+ * @param[in] powerConfigId A power client id to associate calls to system
+ *            power settings. A value of 0 implies NULL power client id
+ *            and can override every other setting in the user process. To
+ *            enable power settings for multiple clients in the same
+ *            process, use a non-zero power client id.
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config option for performance configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ *            does not exist
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t powerConfigId, const QnnHtpPerfInfrastructure_PowerConfig_t** config);
+
+/**
+ * @brief This API allows clients to set up configuration associated with
+ *        system memory on a specific device
+ *
+ * @param[in] deviceId Hardware Device on which this config needs to be applied.
+ *
+ * @param[in] coreId Core/NSP on which this config needs to be applied.
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config option for system memory configuration.
+ *            NULL is allowed and indicates no config options are provided.
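+ *
+ * A sketch (the 32MB value is an illustrative assumption, and setMemoryConfig
+ * stands for a QnnHtpPerfInfrastructure_SetMemoryConfigFn_t pointer obtained
+ * from the HTP device infrastructure):
+ *
+ * @code
+ * QnnHtpPerfInfrastructure_MemoryConfig_t memCfg = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT;
+ * memCfg.option            = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE;
+ * memCfg.memGrowSizeConfig = 32 * 1024 * 1024;  // grow the DSP-side heap in 32MB steps
+ *
+ * const QnnHtpPerfInfrastructure_MemoryConfig_t *memConfigs[] = {&memCfg, NULL};
+ * setMemoryConfig(deviceId, coreId, memConfigs);
+ * @endcode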
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if deviceId/coreId
+ *            or memory configuration does not exist
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_SetMemoryConfigFn_t)(
+    uint32_t deviceId, uint32_t coreId, const QnnHtpPerfInfrastructure_MemoryConfig_t** config);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // QNN_HTP_PERF_INFRASTRUCTURE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h
new file mode 100755
index 0000000000000..92381d17b0440
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h
@@ -0,0 +1,567 @@
+//==============================================================================
+//
+//  Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ *  @file
+ *  @brief QNN HTP Profile component API.
+ *
+ *         Requires the HTP backend to be initialized.
+ *         Should be used with the QnnProfile API, but has HTP-backend-specific
+ *         definitions for different QnnProfile data structures
+ *
+ */
+
+#ifndef QNN_HTP_PROFILE_H
+#define QNN_HTP_PROFILE_H
+
+#include "QnnProfile.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note  context load binary host rpc time may be available on both
+ *        QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_RPC_TIME_MICROSEC 1002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTP processor
+ *        when client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note  context load binary htp rpc time may be available on both
+ *        QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HTP_RPC_TIME_MICROSEC 1003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to create the context on the
+ *        accelerator when client invokes QnnContext_createFromBinary.
+ *        The value returned is time in microseconds.
+ *
+ * @note  context load binary accelerator time may be available on both
+ *        QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ * + * @note graph finalize host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_RPC_TIME_MICROSEC 2001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnGraph_finalize. + * The value returned is time in microseconds. + * + * @note graph finalize htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HTP_RPC_TIME_MICROSEC 2002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to finalize the graph on the accelerator + * when client invokes QnnGraph_finalize. + * The value returned is time in microseconds. + * + * @note graph finalize accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003 + +/* Graph Performance Estimate Support + * + **/ +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to Performance Estimates for the graph + * when client invokes QnnGraph_finalize. + * This is just a dummy event which will print only the heading + * with no value or unit. + * @note HTP Performance Estimates maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE 2004 + +/** + * @brief QnnProfile_EventType_t definition to get perf mode at which + * the perf estimates are collected during QnnGraph_finalize. + * The value returned is the perf mode in string with no unit. + * + * @note Perf mode maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MODE 2005 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to simulated execution cycles during + * QnnGraph_finalize. + * The value returned is number of cycles. + * + * @note Simulated execution cycles maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_CYCLES 2006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to a lower estimate of simulated execution + * cycles during QnnGraph_finalize. + * The value returned is number of cycles. + * + * @note Simulated execution cycles lower estimate maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_LOWER_CYCLES 2007 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to a upper estimate of simulated execution + * cycles during QnnGraph_finalize. + * The value returned is number of cycles. 
+ * + * @note Simulated execution cycles upper estimate maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_UPPER_CYCLES 2008 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to DDR information for each HTP during + * QnnGraph_finalize. + * This is just a dummy event which will print only the heading + * with no value or unit. + * + * @note DDR Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_BANDWIDTH_STATS 2009 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the HTP ID on chip during QnnGraph_finalize. + * The value returned is the HTP ID with no unit. + * + * @note HTP ID's maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_BANDWIDTH_STATS_HTP_ID 2010 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the Graph defined inputs or the total reads + * (in bytes) from DDR for graph input related tensors (weights, + * bias, activations) which do not have predecessors. + * The value returned is the num of blocks in bytes. + * + * @note Graph defined inputs for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INPUT_FILL 2011 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total reads (in bytes) from DDR for + * compiler generated fill operators which have predecessors and + * successors and originate on the same HTP. + * The value returned is the num of blocks in bytes. + * + * @note Intermediate Fill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTERMEDIATE_FILL 2012 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) from DDR for + * compiler generated fill operators which have predecessors and + * successors and originate on the same HTP. + * The value returned is the num of blocks in bytes. + * + * @note Intermediate Spill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTERMEDIATE_SPILL 2013 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total reads (in bytes) from DDR for + * fills which were generated by a different HTP core and do not + * have a predecessor, but have a successor. + * The value returned is the num of blocks in bytes. 
+ * + * @note Inter HTP Fill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTER_HTP_FILL 2014 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) from DDR for + * fills which were generated by a different HTP core and do not + * have a successor, but have a predecessor. + * The value returned is the num of blocks in bytes. + * + * @note Inter HTP Spill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTER_HTP_SPILL 2015 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) to DDR for + * graph output related tensors which do not have successors. + * The value returned is the num of blocks in bytes. + * + * @note Graph output related tensors for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_OUTPUT_SPILL 2016 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total number of missing ops which do + * not have any cost associated with them while getting the graph + * performance estimates. + * The value returned is the num of missing ops with no unit. + * + * @note Number of missing cost ops maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MISSING_COST_OPS 2017 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the op ids of the missing ops which do + * not have any cost associated with them while getting the graph + * performance estimates. + * The value returned is the opname along with the op id (decimal + * format) of the ops which does not have any costs associated + * with them. + * + * @note Opname and Op ids of missing cost ops are available only with + * QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MISSING_COST_OPID 2018 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the ARM processor + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time in microseconds. + * + * @note graph execute host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_RPC_TIME_MICROSEC 3001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time in microseconds. + * + * @note graph execute htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HTP_RPC_TIME_MICROSEC 3002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. 
+ * The value returned is number of processor cycles taken. + * + * @note graph execute accelerator time maybe available only on + * QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE. + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value indicates execute including wait/resource acquisition + * time on the accelerator, if applicable in multi-threaded scenarios. + * The value returned is time taken in microseconds + * + * @note graph execute accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time taken for miscellaneous work i.e. time + * that cannot be attributed to a node but are still needed to + * execute the graph on the accelerator. This occurs when client invokes + * QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time taken in microseconds + * + * @note graph execute misc accelerator time is available only on + * QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time taken for a graph yield instance to + * release all its resources to the other graph. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RELEASE_TIME 3006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time a graph spends waiting for a higher + * priority graph to finish execution. + * The value returned is time taken in microseconds + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_WAIT_TIME 3007 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time a graph spends re-acquiring resources + * and restoring vtcm. + * The value returned is time taken in microseconds + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RESTORE_TIME 3008 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the number of times that a yield occured + * during execution + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_COUNT 3009 + +/** + * @brief QnnProfile_EventType_t definition for time a graph waits to get + * VTCM. This should be constant UNLESS we need another graph to yield. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_VTCM_ACQUIRE_TIME 3010 + +/** + * @brief QnnProfile_EventType_t definition for time a graph waits to get + * HMX + HVX, and turn them all on. + * The value returned is time taken in microseconds. 
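+ *
+ * As a sketch of how these event-type codes are typically consumed
+ * (retrieving eventType and value through the generic QnnProfile event
+ * APIs is assumed and omitted here):
+ *
+ * @code
+ * uint64_t htpRpcTimeUs = 0, accelTimeUs = 0;
+ * switch (eventType) {
+ *   case QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HTP_RPC_TIME_MICROSEC:
+ *     htpRpcTimeUs = value;
+ *     break;
+ *   case QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC:
+ *     accelTimeUs = value;
+ *     break;
+ *   default:
+ *     break;  // ignore event types this client does not track
+ * }
+ * @endcode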
+ */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_RESOURCE_POWER_UP_TIME 3011 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value indicates execute excluding wait/resource acquisition + * time on the accelerator, if applicable in multi-threaded scenarios. + * The value returned is time taken in microseconds + * + * @note graph execute accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_EXCL_WAIT_TIME_MICROSEC 3012 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the ARM processor + * when client invokes QnnContext_free which in consequence deinit graph. + * The value returned is time in microseconds. + * + * @note graph deinit host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_RPC_TIME_MICROSEC 4001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnContext_free which in consequence deinit graph. + * The value returned is time in microseconds. + * + * @note graph deinit htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HTP_RPC_TIME_MICROSEC 4002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the time taken to deinit graph on the + * accelerator when client invokes QnnContext_free which in consequence + * deinit graph. The value returned is time in microseconds. + * + * @note graph deinit accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the amount of time an op spends + * waiting for execution on the main thread since the last op on the main + * thread due to scheduling and can be interpreted appropriately in + * conjunction with the unit. + * + * @note node wait information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT 5001 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the amount of time at least one + * background op is running during the execution of an op on the main thread + * and can be interpreted appropriately in conjunction with the unit. + * + * @note node overlap information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_OVERLAP 5002 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. 
This value represents the amount of time at least one + * background op that is not being waited upon to finish is running during + * the wait period of an op on the main thread and can be interpreted + * appropriately in conjunction with the unit. + * + * @note node wait overlap information is available on QNN_HTP_PROFILE_LEVEL_LINTING + * level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT_OVERLAP 5003 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents a bitmask denoting the resources + * an op uses. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_RESOURCEMASK 5004 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the ID of an op running in parallel to + * an op running on the main thread or on HMX. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_CRITICAL_BG_OP_ID 5005 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the ID of an op running on threads other + * than the main or the HMX thread when the main and the HMX threads are not + * executing any op. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT_BG_OP_ID 5006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph's critical path on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is number of processor cycles taken. + * + * @note graph execute accelerator time maybe available only on + * QNN_HTP_PROFILE_LEVEL_LINTING levels + * + * @note When QNN_HTP_PROFILE_LEVEL_LINTING is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE. + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_CRITICAL_ACCEL_TIME_CYCLE 6001 + +/** + * @brief Linting QnnProfile_Level_t definition that allows collecting in-depth + * performance metrics for each op in the graph including main thread + * execution time and time spent on parallel background ops. + */ +#define QNN_HTP_PROFILE_LEVEL_LINTING 7001 + +/** + * @brief QnnProfile_EventType_t definition to get number of HVX threads + * configured by a graph. Different graphs can have a different + * value. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_NUMBER_OF_HVX_THREADS 8001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the total time the entire API takes. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_QNN 9001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the time of callTransport. + * The value returned is time taken in microseconds. 
+ */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_RPC 9002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the remote procedure call on the HTP processor. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_QNN_ACC 9003 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the Hexnn call + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_ACC 9004 + + + +#ifdef __cplusplus +} +#endif + +#endif // QNN_HTP_PROFILE_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h new file mode 100755 index 0000000000000..51440061dc611 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h @@ -0,0 +1,30 @@ +//============================================================================== +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef QNN_HTP_PROPERTY_H +#define QNN_HTP_PROPERTY_H + +#include "QnnProperty.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= +/** + * @brief Property key for determining whether a backend supports unsigned pd. + */ +#define QNN_PROPERTY_CUSTOM_HTP_UNSIGNED_PD_SUPPORT QNN_PROPERTY_GROUP_CUSTOM + 1 + +#ifdef __cplusplus +} +#endif + +#endif // QNN_HTP_PROPERTY_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h new file mode 100755 index 0000000000000..dcfedcb3f6450 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h @@ -0,0 +1,119 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All rights reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief QNN HTP component System Context API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnSystemContext.h for HTP backend + */ + +#ifndef QNN_HTP_SYSTEM_CONTEXT_H +#define QNN_HTP_SYSTEM_CONTEXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= +typedef enum { + // Following version with hwInfoBlobVersion as: + // - Major 0, Minor: 0, Patch: 1 + QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1 = 0x01, + // Unused, present to ensure 32 bits. 
+  QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_UNDEFINED = 0x7FFFFFFF
+} QnnHtpSystemContext_HwInfoBlobVersion_t;
+
+// This struct gets populated within a binary blob as part of hwInfoBlob in
+// QnnSystemContext_BinaryInfoV#_t struct in QnnSystemContext.h
+typedef struct QnnHtpSystemContext_HwBlobInfoV1 {
+  // This value represents the index of the list of graphs registered
+  // to this context as specified in QnnSystemContext_GraphInfo_t*
+  uint32_t graphListIndex;
+  // Stores the spill-fill buffer size used by each of the graphs
+  uint64_t spillFillBufferSize;
+} QnnHtpSystemContext_HwBlobInfoV1_t;
+
+typedef struct {
+  QnnHtpSystemContext_HwInfoBlobVersion_t version;
+  union UNNAMED {
+    QnnHtpSystemContext_HwBlobInfoV1_t contextBinaryHwInfoBlobV1_t;
+  };
+} QnnHtpSystemContext_HwBlobInfo_t;
+
+typedef enum {
+  // Following version with GraphInfoBlobVersion as:
+  // - Major 0, Minor: 0, Patch: 1
+  QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_VERSION_V1 = 0x01,
+  // Unused, present to ensure 32 bits.
+  QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_UNDEFINED = 0x7FFFFFFF
+} QnnHtpSystemContext_GraphInfoBlobVersion_t;
+
+// This struct gets populated within a binary blob as part of GraphInfoBlob in
+// QnnSystemContext_BinaryInfoV#_t struct in QnnSystemContext.h
+typedef struct {
+  // Stores the spill-fill buffer size used by each of the graphs
+  uint64_t spillFillBufferSize;
+  // HTP vtcm size (MB)
+  uint32_t vtcmSize;
+  // Optimization level
+  uint32_t optimizationLevel;
+  // Htp Dlbc
+  uint8_t htpDlbc;
+  // Number of HVX threads to reserve
+  uint64_t numHvxThreads;
+} QnnHtpSystemContext_GraphBlobInfoV1_t;
+
+typedef struct {
+  QnnHtpSystemContext_GraphInfoBlobVersion_t version;
+  union UNNAMED {
+    QnnHtpSystemContext_GraphBlobInfoV1_t contextBinaryGraphBlobInfoV1;
+  };
+} QnnHtpSystemContext_GraphBlobInfo_t;
+
+typedef enum {
+  // Following version with ContextInfoBlobVersion as:
+  // - Major 0, Minor: 0, Patch: 1
+  QNN_SYSTEM_CONTEXT_HTP_CONTEXT_INFO_BLOB_VERSION_V1 = 0x01,
+  // Unused, present to ensure 32 bits.
+ QNN_SYSTEM_CONTEXT_HTP_CONTEXT_INFO_BLOB_UNDEFINED = 0x7FFFFFFF +} QnnHtpSystemContext_ContextInfoBlobVersion_t; + +typedef struct{ + /// An integer representation of SocUtility::DspArch + uint32_t dspArch; +} QnnHtpSystemContext_ContextBlobInfoV1_t; + +typedef struct { + QnnHtpSystemContext_ContextInfoBlobVersion_t version; + union UNNAMED { + QnnHtpSystemContext_ContextBlobInfoV1_t contextBinaryContextBlobInfoV1; + }; +} QnnHtpSystemContext_ContextBlobInfo_t; + +//============================================================================= +// Data Types +//============================================================================= + +//============================================================================= +// Public Functions +//============================================================================= + +//============================================================================= +// Implementation Definition +//============================================================================= + +// clang-format on +#ifdef __cplusplus +} // extern "C" +#endif + +#endif \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h new file mode 100755 index 0000000000000..28b5685f29750 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h @@ -0,0 +1,338 @@ +//============================================================================== +// +// Copyright (c) 2018, 2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef AFUNCS_H +#define AFUNCS_H 1 + +#include +#include +#include "dtype.h" +#ifndef __hexagon__ +#include // for memcpy etc +#endif +// #include "asm_define.h" +#include "builtin_intrinsics.h" +#include "macros_attribute.h" + +struct tile_data { + uint8_t **addr; + uint32_t offset_t_col; + uint32_t offset_t_row; + uint32_t width; + uint32_t height; + uint32_t depth; +}; + +// Define order: .addr, .offset_t_col, .offset_t_row, .width, .height, .depth +#define TILEDATA(adrtab, next_tab_col, next_tab_row, h, w, d) \ + { \ + (uint8_t **)(adrtab), static_cast(next_tab_col), static_cast(next_tab_row), \ + static_cast(w), static_cast(h), static_cast(d) \ + } + +/*=======================================*/ +/* Auxiliary functions */ +/*=======================================*/ +#if defined(__hexagon__) +inline int32_t max_i32(int32_t a, int32_t b) +{ + return Q6_R_max_RR(a, b); +} +inline int32_t min_i32(int32_t a, int32_t b) +{ + return Q6_R_min_RR(a, b); +} +inline uint32_t max_u32(uint32_t a, uint32_t b) +{ + return Q6_R_maxu_RR(a, b); +} +inline uint32_t min_u32(uint32_t a, uint32_t b) +{ + return Q6_R_minu_RR(a, b); +} +#else +inline int32_t max_i32(int32_t a, int32_t b) +{ + return (a < b) ? b : a; +} +inline int32_t min_i32(int32_t a, int32_t b) +{ + return (a < b) ? a : b; +} +inline uint32_t max_u32(uint32_t a, uint32_t b) +{ + return (a < b) ? b : a; +} +inline uint32_t min_u32(uint32_t a, uint32_t b) +{ + return (a < b) ? a : b; +} +#endif + +[[maybe_unused]] inline ALWAYSINLINE int64_t roundf_i64(float val) +{ + // add 0.5 (with same sign as val) and then conversion to int truncates toward 0. + // values exactly halfway will round away from 0 (like roundf). 
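+    // Illustrative values (editor's addition, not part of the original SDK header):
+    //   roundf_i64( 2.5f) -> (int64_t)( 2.5f + 0.5f) ==  3   (half rounds away from zero)
+    //   roundf_i64(-2.5f) -> (int64_t)(-2.5f - 0.5f) == -3
+    //   roundf_i64( 2.4f) -> (int64_t)( 2.9f)        ==  2   (truncation toward zero)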
+
+    return (int64_t)(val + copysignf(0.5f, val));
+}
+
+[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T roundf_i32(float val)
+{
+    // add 0.5 (with same sign as val) and then conversion to int truncates toward 0.
+    // values exactly halfway will round away from 0 (like roundf).
+
+    return (int)(val + copysignf(0.5f, val));
+}
+// same thing for rounding to unsigned range; -ve inputs will give 0.
+//
+[[maybe_unused]] inline ALWAYSINLINE uint32_t roundf_u32(float val)
+{
+    // add 0.5f and then convert to uint (trunc towards 0; -ve values are clipped to 0).
+#ifdef __hexagon__
+    // use intrinsic since conv of -ve float to unsigned is 'undefined behaviour' in C.
+    return Q6_R_convert_sf2uw_R_chop(val + 0.5f);
+#else
+    return (val < 0.5f) ? 0 : (uint32_t)(val + 0.5f);
+#endif
+}
+
+[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T roundd_i32(double val)
+{
+    // add 0.5 (with same sign as val) and then conversion to int truncates toward 0.
+    // values exactly halfway will round away from 0 (like round).
+
+    return (int)(val + copysign(0.5, val));
+}
+
+[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T saturate_u8(NN_INT32_T val)
+{
+#ifdef __hexagon__
+    return Q6_R_satub_R(val);
+#else
+    return (val < 0) ? 0 : ((val > 255) ? 255 : val);
+#endif
+}
+
+[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T saturate_u16(NN_INT32_T val)
+{
+#ifdef __hexagon__
+    return Q6_R_satuh_R(val);
+#else
+    return (val < 0) ? 0 : ((val > 65535) ? 65535 : val);
+#endif
+}
+
+[[maybe_unused]] static inline ALWAYSINLINE NN_INT32_T saturate_i16(NN_INT32_T val)
+{
+#ifdef __hexagon__
+    return Q6_R_sath_R(val);
+#else
+    return (val < -32768) ? -32768 : ((val > 32767) ? 32767 : val);
+#endif
+}
+
+/**
+ * @brief low-cost frexpf (but only the exponent result);
+ * Generates only a few instructions on hexagon.
+ *
+ * Input must not be inf, nan, zero, or denormal.
+ *
+ * returns:
+ *   -1 if abs(x) is in range 0.25 ... 0.49999
+ *    0 if abs(x) is in range 0.5 ... 0.99999
+ *    1 if abs(x) is in range 1.0 ... 1.9999
+ *   etc.
+ *
+ * If the value -126 is returned, x is a zero or denormal;
+ * 129 is returned for inf or NaN. For other cases the value is the same
+ * as what frexpf (in math.h) generates for the exponent.
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr int flt_getexp(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } const uu = {x};
+    return ((uu.u32 >> 23u) & 0xFFu) - 126;
+}
+/**
+ * @brief low-cost frexpf (but only the 'fraction' result);
+ * Generates only a few instructions on hexagon.
+ *
+ * Input must not be inf, nan, zero, or denormal.
+ *
+ * returns a value in the range [0.5, 1.0) (or in (-1.0,-0.5] when x < 0)
+ * such that x = flt_getmant(x) * powf(2.0, flt_getexp(x))
+ *
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_getmant(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } uu = {x};
+    uu.u32 = (uu.u32 & 0x807fffffu) | (uint32_t(126) << 23u); // force exponent = 126
+    return uu.f;
+}
+
+/**
+ * @brief returns the mantissa of x, as a 24-bit number
+ * in the range 0x800000 .. 0xFFFFFF
+ *
+ * Input must not be inf, nan, zero, or denormal.
+ *
+ * Sign is discarded. Same as powf(2,24) * flt_getmant(fabsf(x)).
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr int32_t flt_getfrac(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } const uu = {x};
+    int32_t const m = (uu.u32 & 0x007fffffu) | (uint32_t(1) << 23u);
+    return m;
+}
+
+//
+// This 'normalizes' a float to 0.5 .. 0.9999 (sign is retained)
+// Same result as the return value from frexpf, without using a function call.
+// Results are not valid if x is 0, denormal, or inf/nan
+//
+[[maybe_unused]] inline ALWAYSINLINE float flt_getfrac_norm(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } uu = {x};
+    uu.u32 = (uu.u32 & 0x807fffffu) | (uint32_t(126) << 23u); // force exponent = 126
+    return uu.f;
+}
+/**
+ * @brief low-cost 2.0^n for integer n.
+ * Same as powf(2.0f, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -126..127
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_power2(uint32_t const iexpo)
+{
+    uint32_t const a = (iexpo + 127) & 0xFFu;
+    union {
+        uint32_t u32;
+        float f;
+    } const uu = {a << 23u};
+    return uu.f;
+}
+/**
+ * @brief low-cost ldexpf
+ * Same as ldexpf(val, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -126..127
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_ldexp(float val, int iexpo)
+{
+    return val * flt_power2(iexpo);
+}
+/**
+ * @brief low-cost 2.0^n for integer n.
+ * Same as pow(2.0, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -1022..1023
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr double double_power2(uint32_t const iexpo)
+{
+    uint64_t const a = (iexpo + 1023) & 0x7FFu;
+    union {
+        uint64_t u64;
+        double d;
+    } const uu = {a << 52u};
+    return uu.d;
+}
+/**
+ * @brief low-cost ldexp
+ * Same as ldexp(val, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -1022..1023
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr double double_ldexp(double val, int iexpo)
+{
+    return val * double_power2(iexpo);
+}
+
+/**
+ * @brief returns the exponent and mantissa of x, as an n-bit number
+ *
+ * Constraint: the exponent of x must be in range -126..127
+ * Input must not be negative, inf, nan, zero, or denormal.
+ */
+template <unsigned MBITS> inline constexpr std::pair<int32_t, uint32_t> get_scalefactor(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } const uu = {x};
+
+    uint32_t inval = uu.u32;
+    uint32_t const mask = hnnx::safe_lshift(1, MBITS) - 1;
+    inval = hnnx::safe_rshift(inval + hnnx::safe_lshift(1, (24 - MBITS - 1)),
+                              (24 - MBITS)); // possibly overflows into exponent, but that's OK.
+    uint32_t const m = ((inval & mask) | hnnx::safe_lshift(1u, (MBITS - 1)));
+    int32_t const e = int32_t(hnnx::safe_rshift(inval, (MBITS - 1)) & 0xFFu) - 126;
+    return {e, m};
+}
+
+/**
+ * @brief returns the parameters for scaling.
+ *   bit 31-24: left shift amount
+ *   bit 23-16: right shift amount
+ *   bit 15- 0: scale factor
+ *
+ * Input must not be inf, nan, zero, negative or denormal.
+ *
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr uint32_t get_scaling_params(float x, int max_sl, int max_sr)
+{
+    auto [e, m] = get_scalefactor<15>(x);
+    // Set a sl or sr amount to perform a multiply of 2^exponent by mantissa.
+    int sl = (e > 0) ? e : 0;
+    int sr = (e > 0) ? 0 : -e;
+    // The max_sl allows the addition of extra left shifts when working with small numbers having negative exponents.
+    // For every extra left shift, there is an offsetting right shift added so that the net right shift amount
+    // required from the exponent stays the same. The max_sr parameter provides a ceiling to the required offsetting
+    // right shifts, preventing the total right shift requirement from being large enough to erase data through shifting.
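+    // Worked example (editor's addition; the max_sl/max_sr values are hypothetical):
+    //   x = 0.375f -> get_scalefactor<15>(x) = {e = -1, m = 24576}  (0.75 in Q15)
+    //   so initially sl = 0, sr = 1; with max_sl = 4, max_sr = 3 the branch below
+    //   gives sl = min(4, max(3 - 1, 0)) = 2 and sr = 1 + 2 = 3, packed as 0x02036000,
+    //   i.e. multiply by 0.75 in Q15, shift left 2, shift right 3 -> net x * 0.375.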
+ if (sl == 0 && sr > 0) { + sl = min_i32(max_sl, max_i32(max_sr - sr, 0)); + sr = sr + sl; + } + return ((uint32_t(sl) & 0x0FFu) << 24u) | ((uint32_t(sr) & 0x0FFu) << 16u) | uint32_t(m); +} + +/** + * @brief given a scale in float and a recip shift amount + * return a quantized scale multiplier and change recip shamt inplace + * + */ +inline uint32_t get_quantized_multipiler(const float scale_f, int &recip_shamt) +{ + recip_shamt = (scale_f <= 1.0f) ? 0 : flt_getexp(scale_f); + uint32_t scale = static_cast(roundf(flt_ldexp(scale_f, (31 - recip_shamt)))); + scale = (scale < 0x7fffffffu) ? scale : 0x7FFFFFFFu; + return scale; +} + +/** + * @brief given a scale in float and a recip shift amount + * return a quantized scale multiplier and change recip shamt inplace + * + */ +//Now with corrected spelling +inline uint32_t get_quantized_multiplier(const float scale_f, int &recip_shamt) +{ + return get_quantized_multipiler(scale_f, recip_shamt); +} +#endif /*AFUNCS_H*/ diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h new file mode 100755 index 0000000000000..844bcf4c7ec50 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h @@ -0,0 +1,236 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef ALLOCATOR_H +#define ALLOCATOR_H 1 + +#include +#include +#include +#include "dtype_enum.h" +#include "weak_linkage.h" +#include "macros_attribute.h" +#include "forward_classes.h" +#include "hexagon_nn_types.h" + +enum class MemoryClass { + Plain, + TCM, + UnCached, // for spill/fill DDR + XXX_LAST_MEMORY_TYPE, + Default = Plain +}; + +PUSH_VISIBILITY(default) + +extern bool TrackedAllocError; + +class Graph; +class HexagonNNEnv; +namespace fa { +struct PoolDesc; +struct BigBuff; +struct RuntimeAllocator; +} // namespace fa +namespace hnnx { + +class Serializer; +class Deserializer; + +// some options flags (powers of 2) for calls to Tensor::allocate +enum AllocOptions { + uncached_int8 = 0x1, // override MemoryClass to UnCached. + uncached_int16 = 0x2, + uncached_fp16 = 0x4 +}; + +/* + * Maybe FIXME: It seems like FancyAllocator has just about all the same interfaces as Allocator, + * is all this pimpl stuff needed, or could we just inherit Allocator and have a unique_ptr + * in our graph? + */ + +class Allocator { + public: + // MIN_ALIGN, MAX_ALIGN: + // - both must be powers of 2 + // - 8 <= MIN_ALIGN <= MAX_ALIGN + // All allocations will be aligned to at least MIN_ALIGN, both start and end of each region. + // This includes sub-allocations in memory pools. + // Alignment requests > MAX_ALIGN may be treated as MAX_ALIGN if allocated in DDR. + // + static constexpr unsigned MIN_ALIGN = 256; + static constexpr unsigned MAX_ALIGN = 256; + + // The alignment used by TCM allocation; >= MIN_ALIGN + static constexpr unsigned TCM_ALLOC_ALIGN = 2048; + + static void *vacant() { return (void *)2; } // special value for 'vacant' slot. 
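+    // Editor's illustration of the constants above (not original SDK text): with
+    // MIN_ALIGN == MAX_ALIGN == 256, every DDR allocation is 256-byte aligned at both
+    // start and end, a request for e.g. 4096-byte alignment may be treated as 256,
+    // and TCM allocations are aligned to TCM_ALLOC_ALIGN (2048).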
+    enum Mode { AllocVirtual, AllocPhysical, AllocTemp, AllocTempEnd, AllocComplete, LastMode = AllocComplete };
+
+    // AllocTemp/AllocTempEnd are used in Virtual mode, to set a 'Temp Physical' mode
+    // where allocation is done to physical memory, but into memory blocks which
+    // are discarded when we return via AllocTempEnd (so AllocTempEnd is not possible as an actual
+    // current mode).
+    // This is intended to support nesting (multiple levels of AllocTemp, where each
+    // AllocTempEnd discards all allocs since the matching AllocTemp), but
+    // currently nesting is not supported, so AllocTemp must be followed by AllocTempEnd,
+    // which actually takes you back to AllocVirtual.
+    // AllocComplete allows no further allocations. A deserialized allocator
+    // is in this state.
+
+    API_EXPORT Allocator(Mode mode_in, Graph &graph_in) : graph(graph_in), mode(mode_in){};
+    API_EXPORT virtual ~Allocator() = 0;
+
+    Graph &graph;
+
+    // Either allocates enough, or dips into a buffer (and changes the buffer pointer and size parameter accordingly).
+    // al is an alignment parameter; it must be a power of 2 or the code below won't work.
+    API_EXPORT void *tracked_aligned_alloc(size_t al, size_t bytes, fa::BigBuff *const bb = nullptr);
+    API_EXPORT void tracked_free(void *aligned_ptr) noexcept;
+
+    API_EXPORT virtual void allocate_n(void **arrp, size_t n, size_t block_size, size_t alignment, MemoryClass memclass,
+                                       unsigned options, DType dtype);
+
+    // options for allocate_persistent_blocks.
+    // if 'allnew' is *not* present, it is assumed that all of the pointers
+    // are either null, or point to existing persistent blocks. The 'null' ones
+    // are replaced with new allocations, and the ref counts are increased in both cases.
+    // with 'allnew': pointers are assumed to contain garbage. Equivalent to zeroing the
+    // pointer table first.
+    //
+    // zoneB: with this, ref counts are updated in the 'B' zone instead of 'A'.
+    //
+    // incref: overrides 'allnew'; all of the existing pointers are required to be valid persistent
+    //    blocks; the ref counts are increased by 1.
+    // decref: overrides 'incref' and 'allnew'; all of the pointers are required to be valid persistent
+    //    blocks; the ref counts are reduced by 1. If total refs are zero, the block is freed.
+    //    the pointer table is not updated.
+    //
+    // infinite: newly alloc'd blocks get refcount set to a huge number, instead of 1.
+    //    Currently this is used when deserializing, since we can't free things immediately when in Crate.
+    //
+    enum persistent_options {
+        allnew = 1u, // assume existing pointers are garbage, allocate them all.
+        zoneB = 2u, // reference count in zone B instead of A.
+        incref = 4u, // enforce that all existing are persistent; incref them.
+        decref = 8u,
+        infinite = 16u, // refcounts on new blocks, set to a huge # instead of 1.
+    };
+
+    // allocate n 'persistent' blocks of the given size/alignment, and update the table.
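+    // Editor's sketch of a call sequence (hypothetical sizes; 'alloc' and 'tbl' are illustrative):
+    //   void *tbl[4] = {};                                              // all null
+    //   alloc.allocate_persistent_blocks(tbl, 4, 1024, 256, 0);         // alloc, refcount 1
+    //   alloc.allocate_persistent_blocks(tbl, 4, 1024, 256, incref);    // refcount -> 2
+    //   alloc.allocate_persistent_blocks(tbl, 4, 1024, 256, decref);    // refcount -> 1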
+    API_EXPORT virtual void allocate_persistent_blocks(void **table, size_t nblocks, size_t block_size,
+                                                       size_t alignment, unsigned options);
+
+    API_EXPORT inline void *allocate(const void *oldval, size_t block_size, size_t alignment, MemoryClass memclass,
+                                     unsigned options, DType dtype)
+    {
+        PUSH_WARNING()
+        DISABLE_WARNING("-Wcast-qual", MSVC_NO_EQUIV)
+        void *tmp = const_cast<void *>(oldval);
+        POP_WARNING()
+        allocate_n(&tmp, 1, block_size, alignment, memclass, options, dtype);
+        return tmp;
+    }
+
+    API_EXPORT Mode get_mode() const { return mode; }
+    API_EXPORT virtual void set_mode(Mode new_mode);
+
+    API_EXPORT virtual void set_tcm_pool(void *base, size_t size);
+
+    API_EXPORT virtual void set_largest_memory_alloc_size(size_t size);
+
+    /*
+     * Serialize all the internal data for the allocator.
+     * Memory regions / pools, etc.
+     */
+    API_EXPORT virtual void serialize(Serializer &) const;
+    /*
+     * Deserialize the allocator, restore internal data from buffer.
+     */
+    API_EXPORT virtual void deserialize(HexagonNNEnv &env, Deserializer &dctx,
+                                        hexagon_nn_wide_address_const_t params_weights = 0U,
+                                        const size_t params_weights_length = 0,
+                                        hexagon_nn_wide_iovec_t const &weights = NULL_IOVEC);
+
+    API_EXPORT virtual int find_replaceable_mempool(unsigned const replaceable_pool_seq,
+                                                    fa::PoolDesc &found_pool) const;
+
+    // LCOV_EXCL_START [SAFTYSWCCB-1542]
+    API_EXPORT static inline constexpr size_t fixup_alignment(size_t align)
+    {
+        static_assert(MIN_ALIGN >= 8 && (MIN_ALIGN & (MIN_ALIGN - 1)) == 0, "bad MIN_ALIGN");
+        static_assert(MAX_ALIGN >= MIN_ALIGN && (MAX_ALIGN & (MAX_ALIGN - 1)) == 0, "bad MAX_ALIGN");
+        if (MIN_ALIGN < MAX_ALIGN) {
+            return std::max(MIN_ALIGN, std::min(MAX_ALIGN, align));
+        } else {
+            return MIN_ALIGN;
+        }
+    }
+    // LCOV_EXCL_STOP
+
+    API_EXPORT static inline constexpr size_t round_up_align(size_t n, size_t align)
+    {
+        return (n + (align - 1)) & ~(align - 1);
+    }
+    template <typename T> API_EXPORT static inline T *round_up_align(T *p, size_t align)
+    {
+        return (T *)round_up_align((size_t)p, align);
+    }
+
+  protected:
+    Mode mode = AllocVirtual;
+};
+
+//
+// this is a 'shim' class to help in making dummy allocators. It defines overrides
+// for all of the pure-virtual methods, so you don't need to.
+//
+class FakeAllocator : public Allocator {
+  public:
+    API_EXPORT FakeAllocator(Allocator::Mode mode_in, Graph &graph_in) : Allocator(mode_in, graph_in){};
+    API_EXPORT virtual ~FakeAllocator();
+};
+
+// this is an accessor which is used by the Dma 'Fill' operation
+// to get a source pointer for reading const, based on (pool_id, offset).
+// It also holds the base pointer for ddr spill area.
+// Maybe other things could be added later.
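+// Editor's sketch of intended use (local variable names are hypothetical):
+//   MemPoolRunTimeAccessor acc(spill_base, pool_table, n_pools);
+//   auto [base, is_weights] = acc.get_persistent_pool_base_iswts(pool_id); // 1 <= pool_id <= acc.num_pools()
+//   // 'base + offset' then serves as the DMA 'Fill' source pointer for (pool_id, offset).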
+ +class MemPoolRunTimeAccessor { + hexagon_nn_wide_address_t spill_area; + fa::PoolDesc const *pool_table; // pool_table[0] is for poolid=1 + unsigned max_pool_id; + + public: + API_EXPORT MemPoolRunTimeAccessor(hexagon_nn_wide_address_const_t spill_area_in, fa::PoolDesc const *const pt, + unsigned const pt_size) + : spill_area(spill_area_in), pool_table(pt), max_pool_id(pt_size) + { + } + API_EXPORT MemPoolRunTimeAccessor() : spill_area(0), pool_table(nullptr), max_pool_id(0) {} + API_EXPORT MemPoolRunTimeAccessor(MemPoolRunTimeAccessor const &) = default; + API_EXPORT MemPoolRunTimeAccessor &operator=(MemPoolRunTimeAccessor const &) = default; + + // pool ids are >= 1, <= num_pools + API_EXPORT constexpr unsigned num_pools() const { return max_pool_id; } //LCOV_EXCL_LINE [SAFTYSWCCB-1542] + // map pool_id to base address of the data, for persistent pool; also get 'is_weights' flag. + // implementation in runtime_alloc.h + std::pair get_persistent_pool_base_iswts(unsigned pool_id) const; + API_EXPORT hexagon_nn_wide_address_t get_spill_area() const { return spill_area; } + + // used to construct the ConstExtentDescriptor during prep + // implementation in fa_alloc.h + API_EXPORT fa::PoolDesc const *get_descriptor(unsigned pool_id) const; +}; + +} // namespace hnnx + +POP_VISIBILITY() + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h new file mode 100755 index 0000000000000..11d01bcb31b95 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h @@ -0,0 +1,244 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef BAKE_DEFS +#define BAKE_DEFS 1 +#include +#include +#include +#include + +#include "executable.h" + +// Contains defs for host-side and target side, so try not +// to add too many 'host only' things. + +#ifdef __hexagon__ +#define HNNX_ARCH_CAN_RUN_BAKED 1 +#endif + +namespace hnnx { + +namespace bake { + +using tgt_ptr_word = unsigned; +using tgt_sizet_word = unsigned; +static constexpr unsigned tgt_ptr_bytes = sizeof(tgt_ptr_word); +static constexpr unsigned tgt_sizet_bytes = sizeof(tgt_sizet_word); +static constexpr bool op_has_graphp = false; +static constexpr unsigned tensor_uptr_ptrs = 2; +static constexpr unsigned max_opaquet_align = 1024; // must be power of 2 + +// This should be OK as a first approx: includes hexagon and x86-32 +static constexpr bool host_can_run_baked = sizeof(void *) == tgt_ptr_bytes; + +inline unsigned constexpr round_up(unsigned x, unsigned m) +{ + return ((x + (m - 1)) / m) * m; +} + +// functions to calculate size, align of various things. They +// are included in target build so we can static_assert that sizes are what we think they are. +// (all must be constexpr). + +// {size, alignment} of typical_op +inline constexpr std::pair typical_op_tgt_size_align(unsigned n_in, unsigned n_out) +{ + // 1 pointer per input, plus tensor_uptr_ptrs per output; but if n_in = n_out == 0, it's 1 pointer. + // (for a 'fill' byte). + unsigned num_io_ptrs = n_in + n_out * tensor_uptr_ptrs; + if (num_io_ptrs == 0) num_io_ptrs = 1; // n_in = n_out = 0 case + return {tgt_ptr_bytes * ((op_has_graphp ? 
2 : 1) // vptr, and maybe Graph * + + num_io_ptrs), // inputs and outputs + tgt_ptr_bytes}; // align +} + +// 'tensor_op_tgt_size_align is used for crate accounting of ShapeWrapperOp, ConstWrapperOp, DummyOp +// In a proper 'baked graph' we don't need to insert these, just the tensors... + +inline constexpr std::pair tensor_op_tgt_size_align(unsigned n_out) +{ + // happens to be the same as TypicalOp with no inputs... + return typical_op_tgt_size_align(0, n_out); +} + +// {size, alignment, extra} of typical_op_with_compiler +// extra_len is the len of the extra data +// extra_align is its alignment. +// The 3rd return value is the offset of the 'extra' within the image. +// +inline constexpr std::tuple +typical_op_extra_tgt_size_align(unsigned n_in, unsigned n_out, unsigned extra_len, unsigned extra_align) +{ + std::pair base_size = typical_op_tgt_size_align(n_in, n_out); + unsigned extra_offs = base_size.first; + if (extra_len > 0) { + extra_align = std::max(extra_align, base_size.second); + extra_len = round_up(extra_len, extra_align); + extra_offs = round_up(extra_offs, extra_align); + base_size.first = extra_offs + extra_len; + base_size.second = extra_align; + } + return {base_size.first, base_size.second, extra_offs}; +} + +// {size, alignment} of variadic op (without the in, out array contents)! +constexpr std::pair variadic_op_tgt_size_align(unsigned n_in, unsigned n_out) +{ + const unsigned cratevec_words = 2; + return {tgt_ptr_bytes * (1 // vptr + + (op_has_graphp ? 1 : 0) // Graph * + + 2 * cratevec_words), // two cratevecs + tgt_ptr_bytes}; // align +} +// {size, alignment} of simple_op_wrapper (without the in, out array contents)! +constexpr std::pair simplewrap_op_tgt_size_align(unsigned n_in, unsigned n_out) +{ + // this is just one more pointer than a variadic op... + const auto var_result = variadic_op_tgt_size_align(n_in, n_out); + return {var_result.first + tgt_ptr_bytes, var_result.second}; +} + +// {size, alignment} of a ChunkPreloadOp +constexpr std::pair chunk_preload_op_tgt_size_align() +{ + return {tgt_ptr_bytes * (1 // vptr + + (op_has_graphp ? 1 : 0) // Graph * + + 2), // ptr, len; + tgt_ptr_bytes}; // align +} + +// +// {size_align} of Shape object +// +constexpr std::pair shape_tgt_size_align(unsigned rank) +{ + // tgt_sizet_bytes * (1 + 1 + 2 * rank) = + // vtable ptr + // shapeflag flags + padding[] + // std::array dims + // std::array max_dims + // + rank = std::array pad + return {round_up(tgt_sizet_bytes * (1 + 1 + 1 + 2 * rank) + rank, tgt_sizet_bytes), tgt_sizet_bytes}; +} + +// +// {size_align} of DynamicShape object +// +constexpr std::pair dynamic_shape_tgt_size_align(const unsigned rank) +{ + // std::array dims == tgt_sizet_bytes * rank + // (shapeflag flags + padding[]) + vtable ptr + dynamic_state = (3 * tgt_sizet_bytes) + return {round_up(tgt_sizet_bytes * rank + (4 * tgt_sizet_bytes), tgt_sizet_bytes), tgt_sizet_bytes}; +} + +// +// {size_align} of interface object (may or may not be quantized) +// +constexpr std::pair interface_tgt_size_align(bool is_quantized) +{ + return {tgt_sizet_bytes + (is_quantized ? round_up(3 * 4, tgt_sizet_bytes) : 0), tgt_sizet_bytes}; +} + +// {size_align} of Tensors, of three different forms: +// +// 'general' tensor +// +constexpr std::pair tensor_general_tgt_size_align() +{ + return {tgt_sizet_bytes * 4 + 2 * tgt_ptr_bytes, tgt_sizet_bytes}; +} + +// 'shape' tensor, of given rank. +// +constexpr std::pair tensor_shape_tgt_size_align(unsigned rank) +{ + return {tgt_sizet_bytes * ((rank == 0 ? 
1 : rank) + 1), tgt_sizet_bytes}; +} + +// 'scalar' tensor, need to know if the interface is 'quantized' or not +// Note, this assumes all value are <= size_t bytes. +// +constexpr std::pair tensor_scalar_tgt_size_align(bool is_quantized) +{ + const unsigned ifc_size = interface_tgt_size_align(is_quantized).first; + return {tgt_sizet_bytes * 2 + ifc_size, tgt_sizet_bytes}; +} +// sizeof OpExtraInfo on target: {long long, 2 * unsigned, char *, 4 * padbyte} +constexpr std::pair OpExtraInfo_size_align = {24, 8}; + +// The size of a SliceDispatchOp for the given number of slices. +// Currently it's always the same regardless of 'nslices'; We may introduce 'right-sized' +// value, in which case 'exact=true' will get the 'real' size; but exact = false will always +// give the full size. +constexpr std::pair slice_dispatch_op_size_align(unsigned const nslices, bool const exact = false) +{ + return {tgt_sizet_bytes * ((op_has_graphp ? 5 : 4) + 3 * Executable::MAX_OP_SLICES), tgt_sizet_bytes}; +} + +// The size of a Predicated Op +constexpr std::pair pred_op_size_align() +{ + return {tgt_sizet_bytes * ((op_has_graphp ? 5 : 4) + 3), tgt_sizet_bytes}; +} + +// this is used in e.g. +// if constexpr(host_can_run_baked) static_assert(size_align_matches(N_IN, N_OUT)); + +template constexpr bool size_align_matches(SZAL sz) +{ + return sizeof(T) == std::get<0>(sz) && alignof(T) == std::get<1>(sz); +} + +// This is a utility to check that a type T has a given size and aligment, using static_assert; +// Just need to include a call to 'do-nothing' bake::check_size_align::template check(); +// The static assert is *disabled* unless compiling on hexagon (or compatible host). +// +// It's more complex than it needs to be, since it's designed to make sure the type and +// numbers wind up in the error message, e.g. you could end up with +// error: static_assert failed due to requirement 'claimed(40) == actual(48)' "size not as claimed" +// static_assert(claimed(CLAIMED_SIZE) == actual(ACTUAL_SIZE), "size not as claimed"); +// ... note: in instantiation of function template specialization 'check_szal::check_size_align<..., ...>' +// +template struct check_size_align { + static constexpr int claimed(int K) { return K; } + static constexpr int actual(int K) { return K; } + template static constexpr bool check_size() + { + static_assert(claimed(CLAIMED_SIZE) == actual(ACTUAL_SIZE), "size not as claimed"); + return CLAIMED_SIZE == ACTUAL_SIZE; + } + template static constexpr bool check_align() + { + static_assert(claimed(CLAIMED_ALIGN) == actual(ACTUAL_ALIGN), "align not as claimed"); + return CLAIMED_ALIGN == ACTUAL_ALIGN; + } + + template static constexpr bool check() + { + bool result = true; + if constexpr (host_can_run_baked) { + result = check_size() && check_align(); + } + return result; + } +}; + +} // namespace bake + +// +// op_opaque_tgt_info must be specialized for each OpaqueT used in TypicalOpWithCompiler +// +template struct op_opaque_tgt_info { + // static constexpr unsigned length = ..; // length of the struct on target CPU + // static constexpr unsigned alignment = ... 
// aligbment on target CPU +}; + +} // namespace hnnx + +#endif // BAKE_DEFS diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h new file mode 100755 index 0000000000000..3496b792f25aa --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h @@ -0,0 +1,247 @@ +//============================================================================== +// +// Copyright (c) 2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +// Compiler builtin intrinsic functions should be specified in this file + +#ifndef BUILTIN_INTRINSICS_H_ +#define BUILTIN_INTRINSICS_H_ + +#include +#include +#include +#include + +// Branch prediction +#if defined(__clang__) + +#define HEX_LIKELY(x) __builtin_expect(!!(x), 1) +#define HEX_UNLIKELY(x) __builtin_expect(!!(x), 0) + +#define HEX_ASSUME __builtin_assume +#define HEX_UNREACHABLE __builtin_unreachable + +#elif defined(_MSC_VER) + +#define HEX_LIKELY(x) (x) +#define HEX_UNLIKELY(x) (x) + +#define HEX_ASSUME __assume +#define HEX_UNREACHABLE() __assume(0) + +#elif defined(__GNUC__) +//No equivalent __builtin_assume in GNUC. Hence leaving empty. +#define HEX_ASSUME(cond) + +#define HEX_LIKELY(x) __builtin_expect(!!(x), 1) +#define HEX_UNLIKELY(x) __builtin_expect(!!(x), 0) +#define HEX_UNREACHABLE __builtin_unreachable + +#endif // defined(__clang__) + +// Overflow detection +#if defined(__clang__) || defined(__GNUC__) + +#define HEX_ADD_OVERFLOW __builtin_add_overflow +#define HEX_MUL_OVERFLOW __builtin_mul_overflow + +#elif defined(_MSC_VER) + +#include + +template static inline bool HEX_ADD_OVERFLOW(_T a, _T b, _T *out) +{ + *out = a + b; + return ((b > 0) && (a > std::numeric_limits<_T>::max() - b)) || + ((b < 0) && (a < std::numeric_limits<_T>::min() - b)); +} + +template static inline bool HEX_MUL_OVERFLOW(_T a, _T b, _T *out) +{ + *out = a * b; + return ((b > 0) && (a > std::numeric_limits<_T>::max() / b || a < std::numeric_limits<_T>::min() / b)) || + ((b < 0) && (a > std::numeric_limits<_T>::min() / b || a < std::numeric_limits<_T>::max() / b)); +} + +#endif // __clang__ + +// Count bits + +#include + +template static inline int HEX_COUNT_ONE_BIT(_T x) +{ + return std::bitset(x).count(); +} + +#define HEX_COUNT_ONE_BIT_ULL HEX_COUNT_ONE_BIT +#define HEX_COUNT_ONE_BIT_UL HEX_COUNT_ONE_BIT + +#if defined(__clang__) || defined(__GNUC__) + +#define HEX_COUNT_LEADING_ZERO __builtin_clz +#define HEX_COUNT_LEADING_ZERO_UL __builtin_clzl +#define HEX_COUNT_LEADING_ZERO_ULL __builtin_clzll + +#define HEX_COUNT_TRAILING_ZERO __builtin_ctz +#define HEX_COUNT_TRAILING_ZERO_UL __builtin_ctzl +#define HEX_COUNT_TRAILING_ZERO_ULL __builtin_ctzll + +#elif defined(_MSC_VER) + +#include + +// Returns the number of leading 0-bits in x, starting at the most significant +// bit position. If x is 0, the result is undefined. +static inline int HEX_COUNT_LEADING_ZERO_ULL(unsigned long long x) +{ + unsigned long where; + if (_BitScanReverse64(&where, x)) return static_cast(63 - where); + return 64; // Undefined behavior +} + +static inline int HEX_COUNT_LEADING_ZERO(unsigned int x) +{ + unsigned long where; + if (_BitScanReverse(&where, x)) return static_cast(31 - where); + return 32; // Undefined Behavior. 
+} + +static inline int HEX_COUNT_LEADING_ZERO_UL(unsigned long x) +{ + return sizeof(x) == 8 ? HEX_COUNT_LEADING_ZERO_ULL(x) : HEX_COUNT_LEADING_ZERO(static_cast(x)); +} + +// Returns the number of trailing 0-bits in x, starting at the least significant +// bit position. If x is 0, the result is undefined. +static inline int HEX_COUNT_TRAILING_ZERO_ULL(unsigned long long x) +{ + unsigned long where; + if (_BitScanForward64(&where, x)) return static_cast(where); + return 64; // Undefined Behavior. +} + +static inline int HEX_COUNT_TRAILING_ZERO(unsigned int x) +{ + unsigned long where; + if (_BitScanForward(&where, x)) return static_cast(where); + return 32; // Undefined Behavior. +} + +static inline int HEX_COUNT_TRAILING_ZERO_UL(unsigned long x) +{ + return sizeof(x) == 8 ? HEX_COUNT_TRAILING_ZERO_ULL(x) : HEX_COUNT_TRAILING_ZERO(static_cast(x)); +} + +#endif // defined(__clang__) + +// Atomic operation + +#if defined(__clang__) || defined(__GNUC__) + +#define HEX_ATOMIC_FETCH_AND_ADD __sync_fetch_and_add + +#define HEX_ATOMIC_FETCH_AND_AND __sync_fetch_and_and +#define HEX_ATOMIC_FETCH_AND_OR __sync_fetch_and_or + +#define HEX_ATOMIC_VAL_COMPARE_AND_SWAP __sync_val_compare_and_swap +#define HEX_ATOMIC_BOOL_COMPARE_AND_SWAP __sync_bool_compare_and_swap + +#elif defined(_MSC_VER) + +#include + +#define HEX_ATOMIC_FETCH_AND_ADD(_p, _v) \ + (sizeof *(_p) == sizeof(__int64) ? _InterlockedExchangeAdd64((__int64 *)(_p), (__int64)(_v)) \ + : _InterlockedExchangeAdd((long *)(_p), (long)(_v))) + +template static inline _T HEX_ATOMIC_FETCH_AND_AND(_T volatile *_p, _T _v) +{ + _InterlockedAnd((long *)_p, (long)_v); + return static_cast<_T>(*_p); +} + +template static inline _T HEX_ATOMIC_FETCH_AND_OR(_T volatile *_p, _T _v) +{ + _InterlockedOr((long *)_p, (long)_v); + return static_cast<_T>(*_p); +} + +#define HEX_ATOMIC_VAL_COMPARE_AND_SWAP(_p, _old, _new) \ + (sizeof *(_p) == sizeof(__int64) \ + ? _InterlockedCompareExchange64((__int64 *)(_p), (__int64)(_new), (__int64)(_old)) \ + : _InterlockedCompareExchange((long *)(_p), (long)(_new), (long)(_old))) + +#define HEX_ATOMIC_BOOL_COMPARE_AND_SWAP(_p, _old, _new) (HEX_ATOMIC_VAL_COMPARE_AND_SWAP(_p, _old, _new) == (_old)) + +#endif // defined(__clang__) + +namespace hnnx { + +/** + * @brief promote_shift_operand reflects the integral promotions for small integer types. + * safe_lshift/safe_rshift must be aware of these promotions, since the C++ standard only + * defines the behavior for shift operations where the RHS is between 0 and + * 1 less than the bit-width of the *promoted* type of the LHS. + */ +template struct promote_shift_operand { + typedef T type; +}; + +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; + +template using promote_shift_operand_t = typename promote_shift_operand::type; + +// The following portable template functions are replacements for the +// built-in shift operations, << and >>, that provide the following guarantees: +// +// 1. Both the left and right operands of the shift will be treated as unsigned. +// This, by construction, prevents any undefined or implementation-defined +// behavior that may arise when shifting negative-valued expressions. +// 2. 
The right operand will be bit-masked in a way that guarantees +// that its value is in the range [0, bitwidth(promoted_left_operand) - 1] + +template constexpr unsigned get_safe_shift_mask() +{ + return unsigned(CHAR_BIT * sizeof(promote_shift_operand_t>>) - 1); +} + +template ()> +constexpr auto safe_lshift(T const value, S const shift_amount) +{ + static_assert(std::is_integral::value && std::is_integral::value, + "safe_lshift only makes sense for integral parameters"); + assert((static_cast(shift_amount) & ~mask) == 0 && "shift_amount is out of range"); + return value << shift_amount; +} + +template ()> +constexpr auto safe_rshift(T const value, S const shift_amount) +{ + static_assert(std::is_integral::value && std::is_integral::value, + "safe_rshift only makes sense for integral parameters"); + assert((static_cast(shift_amount) & ~mask) == 0 && "shift_amount is out of range"); + return value >> shift_amount; +} + +} // namespace hnnx + +#endif /* BUILTIN_INTRINSICS_H_ */ diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h new file mode 100755 index 0000000000000..0531625039312 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h @@ -0,0 +1,21 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef C_TRICKS_H +#define C_TRICKS_H 1 + +#define CTRICKS_PASTER2(A, B) A##B +#define CTRICKS_PASTER(A, B) CTRICKS_PASTER2(A, B) + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + +#define PROBABLY(x) __builtin_expect(!(!(x)), 1) +#define YEAHRIGHT(x) __builtin_expect(!(!(x)), 1) + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h new file mode 100755 index 0000000000000..c4363d8cb3e6f --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h @@ -0,0 +1,26 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CC_PP_H +#define CC_PP_H 1 + +/* + * C++ Preprocessor Definitions + */ + +#ifdef __cplusplus +#define EXTERN_C_BEGIN extern "C" { +#define EXTERN_C_END \ + } \ + ; +#else +#define EXTERN_C_BEGIN /* NOTHING */ +#define EXTERN_C_END /* NOTHING */ +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h new file mode 100755 index 0000000000000..bd12354b0a314 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h @@ -0,0 +1,35 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// +//============================================================================== + +#include "cc_pp.h" +#include "macros_attribute.h" +#include "weak_linkage.h" + +#ifndef CHECK_HVX_H +#define CHECK_HVX_H 1 + +// +// This makes sure that we have an HVX context (or not). Does nothing on H2 or +// QuRT, but on x86, makes use of a TLS variable to do the check. +// + +#ifdef __hexagon__ + +static inline void check_hvx() {} +static inline void check_not_hvx() {} + +#else + +PUSH_VISIBILITY(default) +API_EXPORT void check_hvx(); +API_EXPORT void check_not_hvx(); +POP_VISIBILITY() + +#endif + +#endif // CHECK_HVX_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h new file mode 100755 index 0000000000000..a7f50569eb471 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h @@ -0,0 +1,207 @@ +//============================================================================== +// +// Copyright (c) 2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONST_EXTENT_DESCRIPTOR_H +#define CONST_EXTENT_DESCRIPTOR_H 1 + +#include +#include +#include +#include +#include "forward_classes.h" +#include "serialize_defs.h" +#include "pickle_header_tags.h" +#include "const_extent_shared.h" + +namespace hnnx { + +// This class is used, on both encoder and decoder, to contain a 'const extent descriptor' in its raw form, (just an array of uint32) +// and provide higher-level access to the contents. + +class ConstExtentDesc { + protected: + using table_t = std::vector; + // The 'table' may or may not contain the 'padding' section at the end; this is not accessed, + // and the serialize method will always generate the required padding. + table_t table; + // some values broken out from the header... + unsigned extab_n = 0, extab_idx = 0; // number of extents, and word index where they start + unsigned mptab_n = 0, mptab_idx = 0; // number of memory pools, and word index where they start. + unsigned desc_len = 0; // length of the entire descriptor in bytes (0 if invalid descriptor) + + bool scan_table(); // sanity check, and unpacks the above; returns true if OK. + + public: + static uint8_t constexpr EXTENT_FLAGS_BITFIELD_LSB = 8; + static uint8_t constexpr EXTENT_FLAGS_BITFIELD_WIDTH = 8; + + /// + /// @brief Values for 8b flags in extent record + /// + static uint8_t constexpr EXTENT_FLAG_RESERVED_0 = (1 << 0); + static uint8_t constexpr EXTENT_FLAG_RESERVED_1 = (1 << 1); + static uint8_t constexpr EXTENT_FLAG_RESERVED_2 = (1 << 2); + static uint8_t constexpr EXTENT_FLAG_RESERVED_3 = (1 << 3); + static uint8_t constexpr EXTENT_FLAG_IS_FAR_HINT = (1 << 4); ///< Contents maybe far + static uint8_t constexpr EXTENT_FLAG_RESERVED_5 = (1 << 5); + static uint8_t constexpr EXTENT_FLAG_RESERVED_6 = (1 << 6); + static uint8_t constexpr EXTENT_FLAG_RESERVED_7 = (1 << 7); + + // Return from 'extent_info'. + struct extab_entry { + uint32_t extent_flags; + uint32_t align; // a power of 2, >= 64 + uint64_t offset; // offset, in bytes, from the start of the descriptor, to where the data is. + uint64_t length; // length of the data in bytes. + }; + // Return from 'mempool_info'. 
+ // Note: if 'adjust_offset' is true, the 'offset' field from the containing extent will be added to offset, + // so that the offset is from the start of the descriptor, instead of the start of the containing extent. + struct mempool_entry { + uint32_t mempool_id; // a mempool id >=2 indicating a const mempool + uint32_t extent_id; // an extent_id, >=1 + uint64_t offset; // offset in bytes of the data from the start of the extent (see note above) + uint64_t length; // length in bytes of the data + }; + // optional name of the const_extent this descriptor corresponds to. Used for matching in weight_sharing. + std::string name = std::string{}; + + ConstExtentDesc() {} + ConstExtentDesc(table_t &&table_in); + void serialize(Serializer &) const; + inline bool load_table(table_t &&table_in) + { + table = std::move(table_in); + return scan_table(); + } + + constexpr bool is_valid() const { return desc_len != 0; } + + constexpr unsigned descriptor_length() const { return desc_len; } + + constexpr unsigned num_extents() const { return extab_n; } + constexpr unsigned num_mempools() const { return mptab_n; } + + // unpack a row of the extent table + // NOTE: extent_id is 1-based, must be 1 .. num_extents() + extab_entry extent_info(unsigned extent_id) const; + + // unpack a row of the mempool table. + // note: idx is not a mempool idx, it is a 1-based row in range 1...num_mempools(); + // if adjust_offset, the offset of the containing extent is added to the offset + // of the mempool in the returned value. + mempool_entry mempool_info(unsigned idx, bool adjust_offset = false) const; + + // The ordering of the data and the descriptors is such that: + // + // (1) extent_info(1).offset >= descriptor_length() + // mempool_info(1,true).offset >= descriptor_length() + // (2) for i >=2, + // extent_info(i).offset >= extent_info(i+1).offset + extent_info(i+1).length + // mempool_info(i,true).offset >= mempool_info(1-1,true).offset + mempool_info(1-1).length + // + +#if !defined(PREPARE_DISABLED) + /// + /// @brief Memory pool record iterator + /// @details Use to iterator over records in memory pool table in constant + /// extent descriptor + /// + class mempool_iterator { + public: + using iterator_category = std::input_iterator_tag; + using value_type = ConstExtentDesc::mempool_entry; + using difference_type = std::ptrdiff_t; + using pointer = value_type *; + using reference = value_type &; + + /// + /// @brief Constructor + /// @param [in] cedesc A valid constant extent descriptor instance + /// @param [in] index Record index (zero-based!) + /// + explicit mempool_iterator(ConstExtentDesc const &cedesc, uint32_t index) : _cedesc(cedesc), _index(index) {} + + /// + /// @brief Increment record + /// @return Iterator + /// + mempool_iterator &operator++() + { + // Increment IFF valid constant extent descriptor and mempool record + // index within range + _index += (_cedesc.is_valid() && (_index < _cedesc.mptab_n)) ? 
1 : 0; + return *this; + } + + /// + /// @brief Equality operator + /// @return true if iterators are equal + /// + bool operator==(mempool_iterator const &other) const { return _index == other._index; } + + /// + /// @brief Inequality operator + /// @return true if iterators are not equal + /// + bool operator!=(mempool_iterator const &other) const { return !(*this == other); } + + /// + /// @brief Dereference iterator + /// + reference operator*(); + + private: + /// + /// @brief Reference to a constant extent descriptor instance + /// @details It contains the blob representing constant extent segment + /// + ConstExtentDesc const &_cedesc; + + /// + /// @brief Current index + /// + uint32_t _index; + + /// + /// @brief Mempool record entry + /// @details It is assigned when on iterator dereference + /// + value_type _entry; + }; + + /// + /// @brief Return mempool iterator initialized to the first record + /// @return Mempool iterator + /// + mempool_iterator begin() { return mempool_iterator(*this, 0); } + + /// + /// @brief Return mempool iterator beyond the last record + /// @warning Intended to be used as a sentinel + /// @return Mempool iterator + /// + mempool_iterator end() { return mempool_iterator(*this, mptab_n); } +#endif +}; +#ifndef PREPARE_DISABLED +// Called at the end of serializing a graph, if 'const extent' mode is enabled. +// See comment in const_extent_descriptor.cc for full details. +// LCOV_EXCL_START [SAFTYSWCCB-1542] +size_t write_aligned_const_info(Graph const &gr, Serializer &sctx, unsigned buried_aux_n_words = 0); +#else +inline constexpr size_t write_aligned_const_info(Graph const &gr, Serializer const &sctx, unsigned = 0) +{ + return 0; +} +// LCOV_EXCL_STOP +#endif + +} // namespace hnnx + +#endif // CONST_EXTENT_DESCRIPTOR_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h new file mode 100755 index 0000000000000..39c95e26ed561 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h @@ -0,0 +1,81 @@ +//============================================================================== +// +// Copyright (c) 2024 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONST_EXTENT_SHARED_H_ +#define CONST_EXTENT_SHARED_H_ + +namespace hnnx { +// definitions pertaining to the 'const extent descriptor'. + +constexpr unsigned CONST_EXTENT_DESC_MAGIC = 0x71c43c9b; +// if a const extent descriptor has a 'cbname' in it, the last 32-bit slot +// is this value. The 0x3e, 0x00 is the ">\0" at the end of the cbname +constexpr unsigned CONST_EXTENT_CBNAME_TAG = 0xebbe003e; + +// This must be a power of 2, and >= 64. +// This is effectively a 'quiet' minimum on options.serialize_const_alignment, which sets +// the actual alignment. +// It is not necessary for the decoder to know what value of alignment was used in the encoder. 
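+// Editor's illustration (not original SDK text): a serialize_const_alignment of 64
+// is quietly raised to CONST_EXTENT_MIN_ALIGN (256), while a value above
+// CONST_EXTENT_MAX_ALIGN (1 MiB) is an error, per the 'non-quiet' maximum below.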
+constexpr unsigned CONST_EXTENT_MIN_ALIGN = 256; +// +// this is a (non-quiet) maximum on options.serialize_const_alignment +constexpr unsigned CONST_EXTENT_MAX_ALIGN = 1024 * 1024; + +/// +/// @brief Size of const extent descriptor header +/// +constexpr unsigned CONST_EXTENT_HEADER_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_HEADER_SIZE_BYTES = CONST_EXTENT_HEADER_SIZE_WORDS * 4u; + +/// +/// @brief Size of an extent record +/// @details Const extent descriptor contains a table of such records +/// +constexpr unsigned CONST_EXTENT_RECORD_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_RECORD_SIZE_BYTES = CONST_EXTENT_RECORD_SIZE_WORDS * 4u; + +/// +/// @brief Offset of extent record table relative to const extent descriptor +/// @details Both byte and words offsets are listed +/// +constexpr unsigned CONST_EXTENT_RECORD_TAB_OFFSET_WORDS = 4u; +constexpr unsigned CONST_EXTENT_RECORD_TAB_OFFSET_BYTES = CONST_EXTENT_RECORD_TAB_OFFSET_WORDS * 4u; + +/// +/// @brief Size of mempool record in a const extent descriptor +/// @details Both byte and word sizes are provided +/// +constexpr unsigned CONST_EXTENT_MEMPOOL_RECORD_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_MEMPOOL_RECORD_SIZE_BYTES = CONST_EXTENT_MEMPOOL_RECORD_SIZE_WORDS * 4u; + +// This function is used by deserializer to help it extract the extent-desc table (as a vector) from some +// arbitrary point down the pickle. Parameter is a pointer to the first 4 words; the return value is +// 0 if the first two words do not look like CEDesc header; +// n otherwise (where 'n' is the number of 32-bit words to extract). +// +inline unsigned const_extent_hdr_check(uint32_t const *const hdrp) +{ + if (hdrp[0] != CONST_EXTENT_DESC_MAGIC) return 0; + const unsigned word0 = hdrp[1]; + const unsigned hdr_len16 = word0 >> 24u; // units of 16 bytes + const unsigned desc_len64 = word0 & 0xFFFFFFu; // units of 64 bytes + const unsigned n_extent = hdrp[2] & 0xFFFFFFu; + const unsigned n_mempool = hdrp[3] & 0xFFFFFFu; + // no. of words actually needed + const unsigned desc_words = 4 * (hdr_len16 + n_extent + n_mempool); + + // note, n_extent == n_mempool == 0 is allowed. + if (hdr_len16 == 0 || desc_len64 == 0 || n_extent > n_mempool || desc_words > desc_len64 * 16) { + return -1; + } + return desc_words; +} + +} // namespace hnnx + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h new file mode 100755 index 0000000000000..b30f7b8f5c871 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h @@ -0,0 +1,121 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONSTRAINTS_H +#define CONSTRAINTS_H + +#include "interface_defs.h" +#include "op_def.h" + +#include +#include + +namespace constraint_lib { + +/** \defgroup OptConstraint Constraint Expressions for Optimization Rules + * \ingroup OptimizationFuncs + * + * @{ + */ +//! Find the chunksize of a given tensor type in a given dimension (a constant). +/// For instance, LAYOUT_CHUNKSIZE(QUint8CroutonTensor,3) gives size_t(32) +/// +#define LAYOUT_CHUNKSIZE(TYPENAME, IDX) (TYPENAME::layout.ChunkSizes[(IDX)]) + +// some convenience wrappers... + +//! 
IS_FLOAT16("operand") -> bool (true if operand has Float16 output) +#define IS_FLOAT16(X) EQ(DTYPE_OF(X), DType::Float16) + +//! IS_FLOAT32("operand") -> bool (true if operand has float output) +#define IS_FLOAT32(X) EQ(DTYPE_OF(X), DType::Float32) + +//! IS_FLOAT("operand") -> bool (alias of IS_FLOAT32) +#define IS_FLOAT(X) IS_FLOAT32(X) + +//! IS_QUINT8("operand") -> bool (true if operand has 'QUInt8' output) +#define IS_QUINT8(X) EQ(DTYPE_OF(X), DType::QUInt8) + +//! IS_QINT8("operand") -> bool (true if operand has 'QInt8' output) +#define IS_QINT8(X) EQ(DTYPE_OF(X), DType::QInt8) + +//! IS_QINT16("operand") -> bool (true if operand has 'QInt16' output) +#define IS_QINT16(X) EQ(DTYPE_OF(X), DType::QInt16) + +//! IS_QUINT16("operand") -> bool (true if operand has 'QUInt16' output) +#define IS_QUINT16(X) EQ(DTYPE_OF(X), DType::QUInt16) + +//! IS_QINT32("operand") -> bool (true if operand has 'QInt32' output) +#define IS_QINT32(X) EQ(DTYPE_OF(X), DType::QInt32) +//! IS_INT32("operand") -> bool (true if operand has 'Int32' output) +#define IS_INT32(X) EQ(DTYPE_OF(X), DType::Int32) + +//! IS_INT64("operand") -> bool (true if operand has 'Int64' output) +#define IS_INT64(X) EQ(DTYPE_OF(X), DType::Int64) + +//! IS_QUANT_TYPE("operand") -> bool (true if operand has 'Quantized' output) +#define IS_QUANT_TYPE(X) OR(IS_QUINT8(X), IS_QINT8(X), IS_QINT16(X), IS_QUINT16(X), IS_QINT32(X)) +//! IS_QUANT_SIGNED("operand") -> bool (true if operand has 'Signed Quantized' output) +#define IS_QUANT_SIGNED(X) OR(IS_QINT32(X), IS_QINT16(X), IS_QINT8(X)) +//! IS_SIGNED_SYMM("operand") -> bool (true if operand has 'Signed Quantized' output with offset == 0) +#define IS_SIGNED_SYMM(X) AND(IS_QUANT_SIGNED(X), EQ(ZERO_OFFSET_OF(X), 0)) + +// The problem with IS_SIGNED_SYMM is that it tends to get used as +// AND( IS_QINT8(X), IS_SIGNED_SYMM(X)) +// which expands to X.dtype==qint8 && ( (X.dtype ==qint32 || X.dtype == .. ) && X.zero_offs == 0) +// So, use IS_QINT8_SYMM(X) etc instead. + +//! IS_QINT8_SYMM("operand") -> bool (true if operand has QINT8 output with offset == 0) +#define IS_QINT8_SYMM(X) AND(IS_QINT8(X), EQ(ZERO_OFFSET_OF(X), 0)) +//! IS_QINT16_SYMM("operand") -> bool (true if operand has QINT16 output with offset == 0) +#define IS_QINT16_SYMM(X) AND(IS_QINT16(X), EQ(ZERO_OFFSET_OF(X), 0)) +//! IS_QINT32_SYMM("operand") -> bool (true if operand has QINT32 output with offset == 0) +#define IS_QINT32_SYMM(X) AND(IS_QINT32(X), EQ(ZERO_OFFSET_OF(X), 0)) + +//! IS_FULLY_CONNECT_WEIGHT("operand") -> bool (true if operand is QUInt8 or (QInt8 and symmetrically quantized)) +#define IS_FULLY_CONNECT_WEIGHT(X) OR(IS_QUINT8(X), IS_QINT8_SYMM(X)) + +//! IS_FLOAT16_BOTH("operand", "operand") -> bool (true if both operands are FP16 type) +#define IS_FLOAT16_BOTH(X, Y) AND(IS_FLOAT16(X), IS_FLOAT16(Y)) +//! IS_FLOAT16_ALL("operand", ...) -> bool (true if all operands are FP16 type) +#define IS_FLOAT16_ALL(...) IS_DTYPE_ALL(DType::Float16, __VA_ARGS__) +//! IS_FLOAT32_ALL("operand", ...) -> bool (true if all operands are FP32 type) +#define IS_FLOAT32_ALL(...) IS_DTYPE_ALL(DType::Float32, __VA_ARGS__) + +//! DIM_CHANNEL("operand") -> unsigned (extract depth dimension, #4) +#define DIM_CHANNEL(X) DIM_OF(X, 4) +//! DIM_DEPTH("operand") -> unsigned (extract depth dimension, #3) +#define DIM_DEPTH(X) DIM_OF(X, 3) +//! DIM_WIDTH("operand") -> unsigned (extract width dimension, #2) +#define DIM_WIDTH(X) DIM_OF(X, 2) +//! DIM_HEIGHT("operand") -> unsigned (extract height dimension, #1) +#define DIM_HEIGHT(X) DIM_OF(X, 1) +//! 
DIM_BATCHES("operand") -> unsigned (extract batches dimension, #0) +#define DIM_BATCHES(X) DIM_OF(X, 0) + +//! DIM_NFILTS("operand") -> unsigned (extract 'output depth' dimension from filter weights, #3) +#define DIM_NFILTS(X) DIM_OF(X, 3) +//! DIM_FILTDEPTH("operand") -> unsigned (extract 'input depth' dimension from filter weights, #2) +#define DIM_FILTDEPTH(X) DIM_OF(X, 2) +//! DIM_FILTWIDTH("operand") -> unsigned (extract 'filter width' dimension from filter weights, #1) +#define DIM_FILTWIDTH(X) DIM_OF(X, 1) +//! DIM_FILTHEIGHT("operand") -> unsigned (extract 'filter height' dimension from filter weights, #0) +#define DIM_FILTHEIGHT(X) DIM_OF(X, 0) + +#define MAX_SPARSE_ELEMENTS(X) DIM_OF(X, (MAX_DIMENSIONS - 1)) + +//! IS_EMPTY_DIM("operand", dim) -> bool (true if size of dim is 0) +#define IS_EMPTY_DIM(X, DIM) EQ(DIM_OF(X, DIM), 0) + +//! IS_EMPTY("operand") -> bool (true if size of all dims is 0) +#define IS_EMPTY(X) AND(IS_EMPTY_DIM(X, 0), IS_EMPTY_DIM(X, 1), IS_EMPTY_DIM(X, 2), IS_EMPTY_DIM(X, 3)) + +} // namespace constraint_lib +/** @} */ + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h new file mode 100755 index 0000000000000..4cb348c637953 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h @@ -0,0 +1,609 @@ +//============================================================================== +// +// Copyright (c) 2018 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONVERSIONS_H +#define CONVERSIONS_H + +#include +#include +#include +#include +#include + +#include "builtin_intrinsics.h" + +#ifdef __hexagon__ +#include "hexagon_protos.h" +#endif + +#include "float16.h" + +#if defined(__clang__) +#define ATTR_NO_SANITIZE(CATEGORY) __attribute__((no_sanitize(CATEGORY))) +#else +#define ATTR_NO_SANITIZE(CATEGORY) /*empty */ +#endif + +namespace hnnx { + +namespace scast { + +// for a given floating type F, and a integer type TI, +// intrange_within_float::max() +// generates the largest value representable in type F which will fit into TI without overflow. +// in many cases this is F(std::numeric_limits::max()), +// but there are exceptions when the mantissa of F is narrower than TI; in those cases we +// want the representable value which is smaller than the integer's max value, not the nearest: +// F TI +// Float16 int16 32752.0 (0x7ff0) +// Float15 uint16 65504.0 (0xffe0) +// float int32 2147483520.0 (0x7fffff80) +// float uint32 4294967040.0 (0xFFFFFF00) +// float int64 9.223371487e18 (0x7fff_ff80_0000_0000) +// float uint64 1.844674297e+19 (0xFFFF_FF00__0000_0000) +// double int64 9223372036854774784.0 (0x7FFF_FFFF_FFFF_FC00) +// double uint64 18446744073709549568.0 (0xFFFF_FFFF_FFFF_F800) +// +// All of the 'min' limits are zero or powers of 2, so those can be converted +// directly from std::numeric_limits::min() +// +// +template struct intrange_within_float { +}; + +// LCOV_EXCL_START [SAFTYSWCCB-1736] constexprs resolved during compile time +template struct intrange_within_float { + static_assert(std::numeric_limits::is_integer); + static inline constexpr Float16 max() + { + if constexpr (sizeof(TI) < 2) { + return Float16(std::numeric_limits::max()); + } else if constexpr (sizeof(TI) == 2) { + return std::numeric_limits::is_signed ? 
                                                      Float16(32752.0f) : Float16(65504.0f);
+        } else {
+            return Float16(65504.0f);
+        }
+    }
+    // 'min' value of integer range is always exactly representable
+    static inline constexpr Float16 min() { return Float16(std::numeric_limits<TI>::min()); }
+};
+
+template <typename TI> struct intrange_within_float<float, TI> {
+    static_assert(std::numeric_limits<TI>::is_integer);
+    static inline constexpr float max()
+    {
+        if constexpr (sizeof(TI) < 4) {
+            return float(std::numeric_limits<TI>::max());
+        } else if constexpr (sizeof(TI) == 4) {
+            return std::numeric_limits<TI>::is_signed ? 2147483520.0f : 4294967040.0f;
+        } else {
+            static_assert(sizeof(TI) == 8);
+            return std::numeric_limits<TI>::is_signed ? 9.223371487e18f : 1.844674297e+19f;
+        }
+    }
+    // 'min' value of integer range is always exactly representable
+    static inline constexpr float min() { return float(std::numeric_limits<TI>::min()); }
+};
+
+template <typename TI> struct intrange_within_float<double, TI> {
+    static_assert(std::numeric_limits<TI>::is_integer);
+    static inline constexpr double max()
+    {
+        if constexpr (sizeof(TI) < 8) {
+            return double(std::numeric_limits<TI>::max());
+        } else {
+            static_assert(sizeof(TI) == 8);
+            return std::numeric_limits<TI>::is_signed ? 9223372036854774784.0 : 18446744073709549568.0;
+        }
+    }
+    // 'min' value of integer range is always exactly representable
+    static inline constexpr double min() { return double(std::numeric_limits<TI>::min()); }
+};
+// LCOV_EXCL_STOP
+
+template <typename TOUT, typename TIN> struct satcast_helper {
+    static_assert(std::numeric_limits<TOUT>::is_specialized && std::numeric_limits<TIN>::is_specialized);
+    static inline TOUT constexpr op(TIN val)
+    {
+        if constexpr (!std::numeric_limits<TOUT>::is_integer) { // convert to a float
+            return TOUT(val);
+        } else {
+            constexpr bool OUTS = std::numeric_limits<TOUT>::is_signed;
+            if constexpr (std::numeric_limits<TIN>::is_integer) {
+                // integer to integer.
+                // widening? or same width, same signedness?
+                constexpr bool INS = std::numeric_limits<TIN>::is_signed;
+                if (sizeof(TOUT) > sizeof(TIN) || (sizeof(TOUT) == sizeof(TIN) && OUTS == INS)) {
+                    // if the output is unsigned and the input < 0, return 0
+                    // otherwise it's a normal cast.
+                    return (!OUTS && INS && val < 0) ? TOUT(0) : TOUT(val);
+                } else if (sizeof(TOUT) == sizeof(TIN)) {
+                    if (!OUTS) { // same size, different signs
+                        return (val < 0) ? (TOUT)0 : (TOUT)val; // signed->unsigned
+                    } else {
+                        constexpr TIN lim = std::numeric_limits<TOUT>::max();
+                        return (val > lim) ? (TOUT)lim : (TOUT)val;
+                    }
+                } else {
+                    // narrowing conversion
+                    if (!OUTS) {
+                        constexpr TIN m = std::numeric_limits<TOUT>::max();
+                        return (val < 0) ? TOUT(0) : (val > m) ? TOUT(m) : TOUT(val);
+                    } else {
+                        constexpr TIN mn = INS ? std::numeric_limits<TOUT>::min() : 0;
+                        constexpr TIN mx = std::numeric_limits<TOUT>::max();
+                        return (val < mn) ? TOUT(mn) : (val > mx) ?
                                                                  TOUT(mx) : TOUT(val);
+                    }
+                }
+            } else { // float to integer
+                if constexpr (sizeof(TOUT) <= sizeof(int32_t)) {
+                    if constexpr (OUTS) {
+                        constexpr TIN loval = intrange_within_float<TIN, int32_t>::min();
+                        constexpr TIN hival = intrange_within_float<TIN, int32_t>::max();
+                        int32_t const tmp = (int32_t)std::max(loval, std::min(hival, val));
+                        return satcast_helper<TOUT, int32_t>::op(tmp);
+                    } else {
+                        constexpr TIN loval = 0.0;
+                        constexpr TIN hival = intrange_within_float<TIN, uint32_t>::max();
+                        uint32_t const tmp = (uint32_t)std::max(loval, std::min(hival, val));
+                        return satcast_helper<TOUT, uint32_t>::op(tmp);
+                    }
+                } else { // 64-bit output assumed
+                    constexpr TIN loval = intrange_within_float<TIN, TOUT>::min();
+                    constexpr TIN hival = intrange_within_float<TIN, TOUT>::max();
+                    return (TOUT)std::max(loval, std::min(hival, val));
+                }
+            }
+        }
+    }
+};
+// specialize for conversion to same
+template <typename TT> struct satcast_helper<TT, TT> {
+    static_assert(std::numeric_limits<TT>::is_specialized);
+    static inline TT constexpr op(TT val) { return val; }
+};
+
+#ifdef __hexagon__
+
+// saturate to types <= int.
+template <typename T> struct q6_sat_int {
+};
+template <> struct q6_sat_int<int8_t> {
+    static inline int op(int x) { return Q6_R_satb_R(x); }
+};
+template <> struct q6_sat_int<uint8_t> {
+    static inline int op(int x) { return Q6_R_satub_R(x); }
+};
+template <> struct q6_sat_int<int16_t> {
+    static inline int op(int x) { return Q6_R_sath_R(x); }
+};
+template <> struct q6_sat_int<uint16_t> {
+    static inline int op(int x) { return Q6_R_satuh_R(x); }
+};
+
+// TODO: these should be done again for 'long' if long is also 32 bits.
+#if 0 // NOTE: we can't really do this unless intrinsics are constexpr
+template <> struct satcast_helper<uint8_t, int> {
+    static inline uint8_t /*constexpr*/ op(int val)
+    {
+        return Q6_R_satub_R(val);
+    }
+};
+template <> struct satcast_helper<int8_t, int> {
+    static inline int8_t /*constexpr*/ op(int val) { return Q6_R_satb_R(val); }
+};
+template <> struct satcast_helper<uint16_t, int> {
+    static inline uint16_t /*constexpr*/ op(int val)
+    {
+        return Q6_R_satuh_R(val);
+    }
+};
+template <> struct satcast_helper<int16_t, int> {
+    static inline int16_t /*constexpr*/ op(int val) { return Q6_R_sath_R(val); }
+};
+#endif
+
+#endif
+} // end namespace scast
+
+} // namespace hnnx
+
+/**
+ * @brief saturate_cast<TOUT>( TIN val ) will work on any two numeric types;
+ * if the input is outside the numeric range of the output type, it
+ * will be range-limited.
+ *
+ * it works as follows:
+ *  * if TOUT is a floating type, the operation is the same as the C++ cast.
+ *  * if TOUT is integer and TIN is float, the input is first converted
+ *    to one of int32, uint32, int64, uint64 ensuring that out-of-range values
+ *    are clipped; and then converted to the output type as below (if it is smaller
+ *    than 32 bits) (The 2-step conversion is intended to work well when things
+ *    are specialized to support native hexagon ops).
+ *  * Otherwise they are both integers.
+ *     - If the output width is larger than the input (or if they are the same size
+ *       and of the same signedness):
+ *        * if the output is unsigned, and the input is < 0, the result is zero
+ *        * otherwise the result is the same as a C++ cast (all values representable)
+ *     - Otherwise, it is a saturating cast; values are limited to the range of TOUT.
+ */
+template <typename TOUT, typename TIN> inline constexpr TOUT saturate_cast(TIN val)
+{
+    return hnnx::scast::satcast_helper<TOUT, TIN>::op(val);
+}
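// Example (annotation, not part of the patch): compile-time checks of the
// saturate_cast rules documented above, assuming this header is reachable as
// "conversions.h" and that the constexpr qualifiers hold on the host build.
// The expected values follow directly from the rules in the comment block.
#include <cstdint>
#include "conversions.h"

static_assert(saturate_cast<int8_t>(300) == 127);             // narrowing int saturates high
static_assert(saturate_cast<uint8_t>(-5) == 0);               // unsigned output, negative input -> 0
static_assert(saturate_cast<int16_t>(70000u) == 32767);       // unsigned -> signed, clipped
static_assert(saturate_cast<int32_t>(3.0e12f) == 2147483520); // float clipped to 2147483520.0f first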
+/**
+ * @brief T saturate_round<T>( float val )
+ * round val to nearest int, and saturate to range of T.
+ *
+ * T must be an integer type, at most 64 bits.
+ */
+// For general C platform, we need to clip the range before converting to int;
+// for hexagon the conversions saturate.
+//
+#ifndef __hexagon__
+template <typename TOUT> inline TOUT saturate_round(float val)
+{
+    static_assert(sizeof(TOUT) <= 8 && std::numeric_limits<TOUT>::is_integer);
+    return saturate_cast<TOUT>(std::nearbyintf(val));
+}
+
+#else
+template <typename TOUT> inline TOUT saturate_round(float val)
+{
+    static_assert(sizeof(TOUT) <= 8 && std::numeric_limits<TOUT>::is_integer);
+    if constexpr ((sizeof(TOUT) == 8) && !std::numeric_limits<TOUT>::is_signed) {
+        // convert to unsigned u64, rounding, saturating
+        return Q6_P_convert_sf2ud_R(val);
+    } else if constexpr ((sizeof(TOUT) == 8) && std::numeric_limits<TOUT>::is_signed) {
+        // convert to int64, rounding
+        return Q6_P_convert_sf2d_R(val);
+    } else if constexpr ((sizeof(TOUT) == 4) && !std::numeric_limits<TOUT>::is_signed) {
+        // convert to unsigned u32, rounding, saturating
+        return Q6_R_convert_sf2uw_R(val);
+    } else {
+        // convert to int32, rounding;
+        int const r = Q6_R_convert_sf2w_R(val);
+        if constexpr (sizeof(TOUT) < 4) return static_cast<TOUT>(hnnx::scast::q6_sat_int<TOUT>::op(r));
+        return static_cast<TOUT>(r); // LCOV_EXCL_LINE [SAFTYSWCCB-1736]
+    }
+}
+#endif
+
+namespace hnnx {
+
+/**
+ * @brief 'proper' compare of any two integer types
+ *    proper_gt( a, b) => a > b;
+ * E.g. if a is unsigned and b is signed, the operation checks to see if b is < 0;
+ * if so, the result is true; otherwise an unsigned compare is done: a > (unsigned)b
+ *
+ */
+namespace prpercmp {
+
+/**
+ * @brief if both A and B are either *int*, or smaller than int,
+ * then promote them both to int and compare them.
+ *
+ * otherwise, if TA is wider than TB, (or the same, with TA unsigned):
+ *    promote b to TA, and then compare them.
+ *    Exception: if TA is unsigned and TB is signed and b < 0, then a > b.
+ * otherwise the symmetric rule applies, promoting a to TB.
+ */
+template <typename TA, typename TB> struct proper_cmp_helper {
+    static_assert(std::numeric_limits<TA>::is_integer && std::numeric_limits<TB>::is_integer);
+    static const bool ASIGNED = std::numeric_limits<TA>::is_signed;
+    static const bool BSIGNED = std::numeric_limits<TB>::is_signed;
+
+    // compare by promoting both to int, when...
+    static const bool CMP_AS_INT = (sizeof(TA) < sizeof(int) || (sizeof(TA) == sizeof(int) && ASIGNED)) &&
+                                   (sizeof(TB) < sizeof(int) || (sizeof(TB) == sizeof(int) && BSIGNED));
+    // otherwise, compare by promoting B to A when ...
+    static const bool B_TO_A = sizeof(TA) > sizeof(TB) || (sizeof(TA) == sizeof(TB) && !ASIGNED);
+    // otherwise, compare by promoting A to B
+
+    static inline bool constexpr eq(TA a, TB b)
+    {
+        if (CMP_AS_INT) {
+            return (int)a == (int)b;
+        } else if (B_TO_A) {
+            if (!ASIGNED && BSIGNED && b < 0) return false;
+            return a == (TA)b;
+        } else {
+            if (!BSIGNED && ASIGNED && a < 0) return false;
+            return (TB)a == b;
+        }
+    }
+    static inline bool constexpr lt(TA a, TB b)
+    {
+        if (CMP_AS_INT) {
+            return (int)a < (int)b;
+        } else if (B_TO_A) {
+            if (!ASIGNED && BSIGNED && b < 0) return false; // a < b always false if b<0
+            return a < (TA)b;
+        } else {
+            if (!BSIGNED && ASIGNED && a < 0) return true; // a < b always true if a<0
+            return (TB)a < b;
+        }
+    }
+};
+/**
+ * @brief specialize for comparison to same type
+ */
+template <typename T> struct proper_cmp_helper<T, T> {
+    static_assert(std::numeric_limits<T>::is_integer);
+    static inline bool constexpr eq(T a, T b) { return a == b; }
+    static inline bool constexpr lt(T a, T b) { return a < b; }
+};
+
+} // end namespace prpercmp
+
+} // namespace hnnx
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value.
+ *    proper_eq(a,b) => a == b;
+ *
+ * E.g. if a is signed and <0, and b is unsigned, result will always be false.
+ *
+ */
+template <typename TA, typename TB> inline bool constexpr proper_eq(TA a, TB b)
+{
+    return hnnx::prpercmp::proper_cmp_helper<TA, TB>::eq(a, b);
+}
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value
+ *    proper_ne(a,b) => !proper_eq(a,b);
+ */
+template <typename TA, typename TB> inline bool constexpr proper_ne(TA a, TB b)
+{
+    return !hnnx::prpercmp::proper_cmp_helper<TA, TB>::eq(a, b);
+}
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value
+ *    proper_lt(a,b) => a < b;
+ */
+template <typename TA, typename TB> inline bool constexpr proper_lt(TA a, TB b)
+{
+    return hnnx::prpercmp::proper_cmp_helper<TA, TB>::lt(a, b);
+}
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value
+ *    proper_ge(a,b) => a>=b;
+ */
+template <typename TA, typename TB> inline bool constexpr proper_ge(TA a, TB b)
+{
+    return !hnnx::prpercmp::proper_cmp_helper<TA, TB>::lt(a, b);
+}
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value
+ *    proper_gt(a,b) => a>b;
+ */
+template <typename TA, typename TB> inline bool constexpr proper_gt(TA a, TB b)
+{
+    return hnnx::prpercmp::proper_cmp_helper<TB, TA>::lt(b, a);
+}
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value
+ *    proper_le(a,b) => a<=b;
+ */
+template <typename TA, typename TB> inline bool constexpr proper_le(TA a, TB b)
+{
+    return !hnnx::prpercmp::proper_cmp_helper<TB, TA>::lt(b, a);
+}
+/**
+ * @brief x >= lo && x < limit, using proper compares
+ */
+template <typename TA, typename TB, typename TC> inline bool constexpr proper_inrange(TA x, TB lo, TC limit)
+{
+    return proper_ge(x, lo) && proper_lt(x, limit);
+}
+
+/**
+ * @brief x >= lo && x <= hi, using proper compares
+ */
+template <typename TA, typename TB, typename TC> inline bool constexpr proper_inrange_closed(TA x, TB lo, TC hi)
+{
+    return proper_ge(x, lo) && proper_le(x, hi);
+}
+
+/**
+ * @brief find the 'width' of an unsigned value (# of bits needed to contain it)
+ * this is floor( log2(x))+1
+ * (and 0 when x = 0)
+ *
+ */
+inline int constexpr binary_bitwidth(unsigned x)
+{
+    return (x == 0) ? 0 : (sizeof(unsigned) * 8 - HEX_COUNT_LEADING_ZERO(x));
+}
+/**
+ * @brief find the 'width' of an unsigned long value (# of bits needed to contain it)
+ * this is floor( log2(x))+1
+ * (and 0 when x = 0)
+ *
+ */
+inline int constexpr binary_bitwidth(unsigned long x)
+{
+    return (x == 0) ? 0 : (sizeof(unsigned long) * 8 - HEX_COUNT_LEADING_ZERO_UL(x));
+}
+/**
+ * @brief find the 'width' of an unsigned long long value (# of bits needed to contain it)
+ * this is floor( log2(x))+1
+ * (and 0 when x = 0)
+ *
+ */
+inline int constexpr binary_bitwidth(unsigned long long x)
+{
+    return (x == 0) ? 0 : (sizeof(unsigned long long) * 8 - HEX_COUNT_LEADING_ZERO_ULL(x));
+}
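// Example (annotation, not part of the patch): what the 'proper' compares fix.
// A built-in mixed signed/unsigned compare converts the signed side to
// unsigned and gets the numerically wrong answer; proper_lt/proper_eq respect
// the actual values. Assumes this header is reachable as "conversions.h".
#include "conversions.h"

static_assert((-1 < 1u) == false);                  // built-in: -1 converts to UINT_MAX
static_assert(proper_lt(-1, 1u) == true);           // proper: -1 really is smaller
static_assert(proper_eq(0xFFFFFFFFu, -1) == false); // distinct values, distinct results
static_assert(proper_inrange(-1, 0u, 10u) == false);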
+/**
+ * @brief saturating u32+u32 add
+ */
+inline uint32_t /*constexpr*/ addu32_sat(uint32_t a, uint32_t b)
+{
+    uint64_t const sum = (uint64_t)a + b;
+    return saturate_cast<uint32_t>(sum);
+}
+
+/**
+ * @brief saturating i32+i32 add
+ */
+inline int32_t /*constexpr*/ addi32_sat(int32_t a, int32_t b)
+{
+#ifdef __hexagon__
+    return Q6_R_add_RR_sat(a, b);
+#else
+    int64_t const sum = (int64_t)a + b;
+    return saturate_cast<int32_t>(sum);
+#endif
+}
+
+/**
+ * @brief saturating u32xu32 multiply
+ */
+inline uint32_t constexpr mulu32_sat(uint32_t a, uint32_t b)
+{
+    uint64_t const prod = (uint64_t)a * b;
+    return saturate_cast<uint32_t>(prod);
+}
+
+/**
+ * @brief saturating i32xi32 multiply
+ */
+inline int32_t constexpr muli32_sat(int32_t a, int32_t b)
+{
+    int64_t const prod = (int64_t)a * b;
+    return saturate_cast<int32_t>(prod);
+}
+
+/**
+ * @brief saturating u64xu64 multiply
+ */
+inline uint64_t /*constexpr*/ mulu64_sat(uint64_t a, uint64_t b)
+{
+    uint64_t prod = 0;
+    if (HEX_MUL_OVERFLOW(a, b, &prod)) {
+        prod = std::numeric_limits<uint64_t>::max();
+    }
+    return prod;
+}
+
+/**
+ * @brief saturating i64xi64 multiply
+ */
+inline int64_t /*constexpr*/ muli64_sat(int64_t a, int64_t b)
+{
+    int64_t prod = 0;
+    if (HEX_MUL_OVERFLOW(a, b, &prod)) {
+        prod = (int64_t(uint64_t(a) ^ uint64_t(b)) >= 0) ? std::numeric_limits<int64_t>::max()
+                                                         : std::numeric_limits<int64_t>::min();
+    }
+    return prod;
+}
+/**
+ * @brief add unsigned+unsigned->unsigned, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline unsigned constexpr addu32_modular(unsigned a, unsigned b)
+{
+    return a + b;
+}
+/**
+ * @brief subtract unsigned-unsigned->unsigned, escaping 'unsigned overflow' checks
+ * For '-unsigned_var', use subu32_modular(0,unsigned_var)
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline unsigned constexpr subu32_modular(unsigned a, unsigned b)
+{
+    return a - b;
+}
+/**
+ * @brief multiply unsigned*unsigned->unsigned, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline unsigned constexpr mulu32_modular(unsigned a, unsigned b)
+{
+    return a * b;
+}
+/**
+ * @brief mul-add u32*u32+u32->u32, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline unsigned constexpr muladdu32_modular(unsigned a, unsigned b, unsigned c)
+{
+    return a * b + c;
+}
+
+/**
+ * @brief add u64+u64->u64, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline uint64_t constexpr addu64_modular(uint64_t a, uint64_t b)
+{
+    return a + b;
+}
+
+/**
+ * @brief subtract u64-u64->u64, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline uint64_t constexpr subu64_modular(uint64_t a, uint64_t b)
+{
+    return a - b;
+}
+/**
+ * @brief mul u64*u64->u64, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline uint64_t constexpr mulu64_modular(uint64_t a, uint64_t b)
+{
+    return a * b;
+}
+
+/**
+ * @brief 'image' conversion from TIN to TOUT (which must be the same size)
+ * e.g. image_convert<uint32_t>( 1.25f) -> 0x3fa00000
+ */
+
+template <typename TOUT, typename TIN> inline constexpr TOUT image_convert(TIN x)
+{
+    static_assert(sizeof(TOUT) == sizeof(TIN));
+    static_assert(std::is_trivially_copyable_v<TIN>);
+    static_assert(std::is_trivially_copyable_v<TOUT>);
+    static_assert(std::is_trivially_constructible_v<TOUT>);
+    TOUT out;
+    std::memcpy(&out, &x, sizeof(TOUT));
+    return out;
+}
+
+// round up A to a multiple of B.
+// b is expected to be > 0 even if signed.
+
+template <typename TD> inline constexpr size_t round_up(size_t a, TD b)
+{
+    static_assert(std::is_integral_v<TD>, "round_up can only apply to integer types");
+    // for b being a power of 2, this should compile as (a+(b-1)) &~(b-1)
+    return b * ((a + (b - 1)) / b);
+}
+// for int, b is expected to be > 0;
+// this will work for negative a, e.g. round_up(-53,10) -> -50
+template <typename TD> inline constexpr int round_up(int a, TD b)
+{
+    static_assert(std::is_integral_v<TD>, "round_up can only apply to integer types");
+    int const bi = b;
+    int const tmp = a + ((a > 0) ? (bi - 1) : 0);
+    return bi * (tmp / bi);
+}
+
+#endif /*CONVERSIONS_H*/
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h
new file mode 100755
index 0000000000000..8f0b21ccb86e5
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h
@@ -0,0 +1,38 @@
+//==============================================================================
+//
+// Copyright (c) 2020 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef COST_H
+#define COST_H 1
+
+// NOTE: WHATCOST may be something like SNAIL/128
+#define COST_OF(FUNC, WHATCOST) COST_OF_OP(typename DerivedType<(FUNC)>::type, WHATCOST)
+#define COST_OF_F(FUNC, WHATCOSTFN) COST_OF_OP_F(typename DerivedType<(FUNC)>::type, WHATCOSTFN)
+
+#ifdef PREPARE_DISABLED
+#define COST_OF_OP(OP, WHATCOST)
+#define COST_OF_OP_F(OP, WHATCOSTFN)
+#else
+#define COST_OF_OP(OP, WHATCOST)                                                               \
+    template <> [[maybe_unused]] constexpr hnnx::cost_function_t hnnx::get_costf<OP>()         \
+    {                                                                                          \
+        return hnnx::cost_function_t(float(StandardCosts::WHATCOST));                          \
+    }
+
+#define COST_OF_OP_F(OP, WHATCOSTFN)                                                                               \
+    template <>                                                                                                    \
+    float hnnx::cost_function_t::cfunc<OP>(hnnx::cost_function_t const &, const Graph &graph_in, const Op *op)     \
+    {                                                                                                              \
+        return WHATCOSTFN(graph_in, op);                                                                           \
+    }                                                                                                              \
+    template <> [[maybe_unused]] constexpr hnnx::cost_function_t hnnx::get_costf<OP>()                             \
+    {                                                                                                              \
+        return hnnx::cost_function_t(hnnx::cost_function_t::cfunc<OP>, 1.0);                                       \
+    }
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h
new file mode 100755
index 0000000000000..286945b9b34b8
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h
@@ -0,0 +1,56 @@
+//=============================================================================
+//
+// Copyright (c) 2020 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//============================================================================
+
+#ifndef COST_FUNCS_H
+#define COST_FUNCS_H
+#include <string_view>
+#include <utility>
+#include "weak_linkage.h"
+#include "macros_attribute.h"
+PUSH_VISIBILITY(default)
+
+class Graph;
+class Op;
+
+namespace hnnx {
+
+class API_EXPORT cost_function_t {
+    using inner_func_t = float (*)(cost_function_t const &, const Graph &, Op const *);
+    inner_func_t funcp;
+    float val;
+
+  public:
+    cost_function_t(cost_function_t const &) = default;
+    cost_function_t &operator=(cost_function_t const &) = default;
+    constexpr explicit cost_function_t(float val_in) : funcp(simple_cost_function), val(val_in) {}
+    constexpr cost_function_t(inner_func_t f, float val_in) : funcp(f), val(val_in) {}
+    constexpr cost_function_t() noexcept : funcp(simple_cost_function), val(0.0f) {}
+
+    inline float operator()(const Graph &graph_in, Op const *op) const { return (*funcp)(*this, graph_in, op); }
+    static float simple_cost_function(cost_function_t const &self, const Graph &, Op const *)
+    {
+        return self.val;
+    } // just returns val;
+
+    float get_val() const { return val; }
+
+    // unreliable compare for two cost funcs: returns -1,0,1 if this cost
+    // is <,=,> than rhs cost, with the second result being true; or <0,false>
+    // if it can't tell.
+    std::pair<int, bool> compare(cost_function_t const &rhs) const;
+
+    template <typename OP> static float cfunc(cost_function_t const &, const Graph &, Op const *);
+};
+
+API_EXPORT cost_function_t cost_func_from_str(std::string_view);
+
+} // namespace hnnx
+
+POP_VISIBILITY()
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h
new file mode 100755
index 0000000000000..494f51e40fa0f
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h
@@ -0,0 +1,471 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/*
+ * crate.h
+ *
+ *  Created on: Aug 1, 2019
+ *      Author: smithg
+ */
+
+#ifndef CRATE_H_
+#define CRATE_H_
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <stdexcept>
+#include <vector>
+
+#include "is_detected.h"
+#include "forward_classes.h"
+#include "macros_attribute.h"
+#include "weak_linkage.h"
+#include "size_align_code.h"
+
+PUSH_VISIBILITY(default)
+
+class Graph;
+class Tensor;
+
+/// @brief A 'Crate' allows construction of some number of different data types,
+/// contiguously packed into a few large memory blocks.
+///
+/// Example:
+///
+///    Crate crt;
+///    Thing * tp = crt.emplace<Thing>( ... ctor parms for Thing ... );
+///    AnotherThing * tp2 = crt.emplace<AnotherThing>( ... ctor parms for AnotherThing ... );
+///
+/// When the crate is destroyed, all of the contained objects are destroyed in the reverse
+/// order. You cannot 'remove' a single entry:
+///
+///    crt.erase has been deprecated
+///
+/// (When erase was supported, it did not free any memory; it just called the dtor of the
+/// object, and made sure it didn't get called later, when the Crate was cleared or destroyed.)
+///
+/// You can also emplace variable-sized arrays of trivially-destructible objects.
+///
+/// alloc_array does not initialize:
+///
+///    float * farr = crt.alloc_array<float>(n);
+///
+/// alloc_array_zero does zero-initializing:
+///
+///    int * iarr = crt.alloc_array_zero<int>(n);
+///
+/// If an allocation needs space larger than CHUNKBYTES, it will get its own chunk.
+///
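// Example (annotation, not part of the patch): minimal use of the Crate API
// documented above, assuming "crate.h" is on the include path and the Crate
// implementation is linked in. 'Widget' is a hypothetical payload type.
#include <string>
#include "crate.h"

namespace {
struct Widget {
    std::string name; // non-trivial dtor: Crate registers it and runs it on clear()
    explicit Widget(const char *n) : name(n) {}
};

void crate_usage_sketch()
{
    hnnx::Crate crt;
    Widget *const w = crt.emplace<Widget>("conv0"); // placement-new into a chunk
    float *const buf = crt.alloc_array<float>(16);  // uninitialized POD array
    (void)w;
    (void)buf;
    crt.clear(nullptr); // dtors run in reverse order; ~Crate() does not do this
}
} // namespace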
+// Each record containing an object has a non-null 'dtor' field; if the object is trivially destructible,
+// this will be (dtor_funcp)1, and the object is not on the linked-list.
+//
+// note:
+// A constructor may emplace additional records in the crate recursively. Likewise,
+// it's OK if the dtors call erase() on other objects. If this happens during a 'clear',
+// the erase calls are ignored since the other objects are going to get dtor'd anyhow (if they have not
+// been already).
+// Important: if object A's constructor places B into the crate, then B will very likely get destroyed
+// first when the crate is cleared. Thus, A's destructor can't look at B (it can erase B, which is ignored
+// as described above).
+
+//
+// new 'raw' mode:
+//  - when the crate is in 'raw' mode, no destructors are registered. inserting an object
+//    increases 'alloc_count' in the chunk header, but does not increment 'nrec', nor
+//    does it increase Crate::m_records.
+//  - raw mode is entered by enable_raw_mode(size_needed):
+//    which does this in addition to enabling raw mode:
+//      - if there is no current chunk, or if the current chunk doesn't have room for 'size_needed' bytes,
+//        a new chunk is added which does.
+//      - enable_raw_mode(size_needed) returns a chunk handle.
+//
+// Internally, raw_mode causes add_record_slot() to do the same thing, but it only moves alloc_count, it does
+// not assign a slot index, and 'idx' is -1 in the returned struct.
+// All callers of add_record_slot() *must* check for raw mode (can be done by checking idx < 0), and then avoid
+// adding a dtor or doing '++m_records'.
+//
+// it's also possible to call .enable_raw_mode(), disable_raw_mode()
+// but .enable_raw_mode() does nothing if there isn't at least one chunk allocated.
+//
+
+namespace hnnx {
+
+//
+// This is used to statically determine whether a type T has a clear(Graph&)
+// method. This is used as an additional destructor which takes a Graph
+// reference.
+//
+
+template <typename T> using clear_t = decltype(std::declval<T>().clear(std::declval<Graph *>()));
+
+template <typename T> constexpr bool has_clear = is_detected_v<clear_t, T>;
+
+class Deserz;
+class DCrate;
+
+class Crate {
+    API_EXPORT static constexpr size_t CHUNKBYTES = (1 << 16);
+    static_assert(CHUNKBYTES % 8 == 0 && CHUNKBYTES >= 128);
+    typedef void (*dtor_funcp)(Graph *graph_in, void *);
+    API_EXPORT static dtor_funcp DTOR_TRIVIAL() { return (dtor_funcp)1; }
+    API_EXPORT static dtor_funcp DTOR_IN_PROCESS() { return (dtor_funcp)2; }
+
+    //! A record in the index of a chunk
+    struct index_rec {
+        unsigned loc; ///< offset in bytes to the object
+        dtor_funcp
+                dtor; ///< pointer to dtor function (null if empty record; DTOR_TRIVIAL if the object is trivial dtor)
+    };
+    //! A chunk record in the crate.
+    ///
+    /// Each chunk is created as an array of uint64_t, via make_unique
+    /// The memory in a chunk has a chunkhdr, which is followed by:
+    ///
+    ///    [Objects][Objects][Objects]-->   free space   <--[Index records]
+    ///
+    /// 'alloc_count' is the next offset available to be allocated.
+    /// index records are entered in reverse order from the end. So, the last nrec*sizeof(index_rec)
+    /// bytes of the area, are the index.
+    ///
+    typedef std::unique_ptr<uint64_t[]> uptr_chunk_t;
+    struct chunkhdr;
+    API_EXPORT static chunkhdr *hdr_of(uptr_chunk_t &p) { return reinterpret_cast<chunkhdr *>(p.get()); }
+    API_EXPORT static chunkhdr const *hdr_of(uptr_chunk_t const &p)
+    {
+        return reinterpret_cast<chunkhdr const *>(p.get());
+    }
+    /// The chunkhdr is the first portion of the chunk, and is immediately followed
+    /// by data_len bytes, which is a multiple of 8.
+    struct API_EXPORT alignas(8) chunkhdr {
+        unsigned data_len;    ///< length of the data area following header, bytes (>=CHUNKBYTES).
+        unsigned nrec;        ///< records in use (including deleted ones)
+        unsigned alloc_count; ///< offset of first byte in 'free space'
+        // init to a given length (header not included)
+        void init(unsigned length)
+        {
+            data_len = length;
+            nrec = 0;
+            alloc_count = 0;
+        }
+        // reset (preserve data_len)
+        void init()
+        {
+            nrec = 0;
+            alloc_count = 0;
+        }
+        // pointer to 'offs' within data area
+        inline uint8_t *get_ptr(unsigned offs) { return (uint8_t *)(this + 1) + offs; }
+        // pointer to end of the allocation
+        inline uint8_t *get_end_ptr() { return (uint8_t *)(this + 1) + data_len; }
+        // amount of space remaining
+        inline size_t space_avail() const { return data_len - alloc_count - nrec * sizeof(index_rec); }
+        // get pointer to an index record.
+        // record 0 is the last (oldest) one.
+        index_rec *index_p(int idx) { return (index_rec *)get_end_ptr() - (idx + 1); }
+        static uptr_chunk_t allocate(unsigned len);
+    };
+    std::vector<uptr_chunk_t> m_chunks; ///< chunks with data
+    std::vector<uptr_chunk_t> m_free;   ///< chunks without
+    typedef std::vector<uptr_chunk_t>::iterator chunk_iter;
+
+    bool m_rawmode = false;
+    bool m_clearing = false; ///< set while clearing.
+    size_t m_allrecords = 0; ///< includes removed and 'padding' records
+    size_t m_records = 0;    ///< only actual, non-erased records.
+
+    //! Returned from add_record_slot (which is used to create a new record)
+    struct recposn {
+        chunkhdr *chunkp; ///< the chunk in which it was found
+        void *objp;       ///< pointer to the object
+        int idx;          ///< index within the chunk (= -1 if insert was done in raw mode)
+    };
+    API_EXPORT recposn add_record_slot(size_t bytes, size_t align);
+    API_EXPORT void recover_ctor_throw(recposn const &) noexcept;
+    API_EXPORT void install_dtor(recposn const &, dtor_funcp dtor_func);
+    API_EXPORT void move_to_free(chunk_iter chunk_to_free);
+
+  public:
+    class ChunkHandle {
+        friend class Crate;
+        chunkhdr *chunkp;
+
+      protected:
+        ChunkHandle(chunkhdr *cp) : chunkp(cp){};
+
+      public:
+        ChunkHandle() : chunkp(nullptr) {} // null handle may only be assigned-to
+        ChunkHandle(ChunkHandle const &) = default;
+        ChunkHandle &operator=(ChunkHandle const &) = default;
+        friend inline bool operator==(ChunkHandle const &a, ChunkHandle const &b) { return a.chunkp == b.chunkp; }
+        std::pair<void *, size_t> get_memory_extent() const
+        {
+            size_t const len = chunkp->get_ptr(chunkp->alloc_count) - (uint8_t *)chunkp;
+            return {chunkp, len};
+        }
+    };
+
+    API_EXPORT Crate(); ///< Construct a new Crate
+    Crate(Crate const &) = delete;
+    Crate &operator=(Crate const &) = delete;
+
+    // get the preload handle for the first chunk
+    ChunkHandle first_chunk_handle() const
+    {
+        return ChunkHandle(m_chunks.empty() ? nullptr : hdr_of(const_cast<Crate &>(*this).m_chunks.front()));
+    }
+    // get the preload handle for the most recent chunk
+    ChunkHandle last_chunk_handle() const
+    {
+        return ChunkHandle(m_chunks.empty() ?
                                            nullptr : hdr_of(const_cast<Crate &>(*this).m_chunks.back()));
+    }
+    // 'raw mode'
+    ChunkHandle enable_raw_mode(unsigned bytes_needed);
+    API_EXPORT void enable_raw_mode();
+    void disable_raw_mode() { m_rawmode = false; }
+    bool raw_mode() const { return m_rawmode; }
+
+    // Note that the destructor doesn't do anything. You have to call clear() manually.
+    API_EXPORT ~Crate();
+    //! The number of objects in the crate.
+    size_t size() const { return m_records; }
+    //! The number of chunks in use
+    size_t chunk_count() const { return m_chunks.size(); }
+    //! The amount of space left in the current chunk, approximately.
+    /// DO NOT CALL unless chunk_count() > 0
+    size_t current_chunk_space_remain() const { return hdr_of(this->m_chunks.back())->space_avail(); }
+    //! Delete all objects. Does not necessarily free all storage to the
+    /// system; but all retained storage is available for re-use in the crate.
+    /// Note that this is no longer called by the destructor - it must be called explicitly.
+    API_EXPORT void clear(Graph *graph_in);
+    // Special entry for deserializing in segments.
+    // If it is possible to allocate, in current raw-mode chunk, everything from offset 'start'
+    // up to but not including 'limit', this is done, and the base address of that region is returned.
+    // otherwise does nothing and returns null.
+    API_EXPORT void *allocate_bulk(size_t start, size_t limit);
+
+    //! Construct an object of type T into the crate, using the
+    /// parameters of any constructor of T. It is acceptable for the
+    /// constructor to call the emplace method to add other objects to
+    /// the crate.
+    template <typename T, typename... Args> API_HIDDEN T *emplace(Args &&...args)
+    {
+        recposn const pos = add_record_slot(sizeof(T), alignof(T));
+        // construct the object
+        if constexpr (std::is_nothrow_constructible<T, Args...>::value) {
+            new (pos.objp) T(std::forward<Args>(args)...);
+        } else {
+            try {
+                new (pos.objp) T(std::forward<Args>(args)...);
+            } catch (const std::exception &e) {
+                recover_ctor_throw(pos);
+                throw;
+            }
+        }
+        if (pos.idx >= 0) {
+            // register destructor
+            if constexpr (!std::is_trivially_destructible<T>::value) {
+                // Obtain a callable '~T()' function.
+                // this typically generates a jump, or a small inline; lambda can
+                // be cast to a function pointer since it has no state.
+                auto dtor_func = [](Graph *graph_in, void *obj) {
+                    if constexpr (has_clear<T>) {
+                        static_cast<T *>(obj)->clear(graph_in);
+                    }
+                    static_cast<T *>(obj)->~T();
+                };
+                install_dtor(pos, (dtor_funcp)dtor_func);
+            } else {
+                ++m_records; // note, install_dtor does this too.
+            }
+        }
+        return static_cast<T *>(pos.objp);
+    }
+
+    using deserialize_op_func = void *(*)(void *, Deserz &);
+    using deserialize_dtor_func = void (*)(Graph *, void *);
+
+    // Alternate interface to cut down on template instantiations:
+    // init_func is used to initialize the memory, and dtor_func
+    // is used to register the destructor. It's up to the user
+    // to provide the correct size and alignment.
+
+    API_EXPORT void *emplace_explicit(Deserz &dctx, deserialize_op_func init_func, deserialize_dtor_func dtor_func,
+                                      size_align_code_t size_al);
+
+    //! Allocate 'n' of type T in the crate.
+    /// Will initially be garbage; T must be trivially destructible (unless waived)
+    template <typename T, bool DTOR_OK = false> T *alloc_array(size_t n)
+    {
+        static_assert(DTOR_OK || std::is_trivially_destructible<T>::value);
+        if (n == 0) return nullptr;
+        recposn const pos = add_record_slot(sizeof(T) * n, alignof(T));
+        if (pos.idx >= 0) m_records++;
+        return static_cast<T *>(pos.objp);
+    }
+    //! Allocate 'n' of type T in the crate.
+    /// Will be zero-filled; T must be trivially destructible.
+    template <typename T> T *alloc_array_zero(size_t n)
+    {
+        T *const res = alloc_array<T>(n);
+        if (n != 0) ::memset(res, 0, sizeof(T) * n);
+        return res;
+    }
+    //! Allocate 'n' of type T in the crate.
+    /// Will be "value constructed"; in case of things like int and pointer,
+    /// this means they will be zeroed.
+    ///
+    /// T must be trivially destructible.
+    template <typename T> T *alloc_array_value(size_t n)
+    {
+        T *res = alloc_array<T>(n);
+        if (n != 0) std::uninitialized_value_construct_n(res, n);
+        return res;
+    }
+};
+
+/*
+ * EJP: This seems silly, but I don't know how to get visibility into Graph into a templated Tensor because of include hell.
+ */
+
+API_EXPORT Crate *graph_crate(Graph &graph_in);
+
+//
+// replacement for std::vector<T>, for use in ops;
+//
+// limited options for constructor:
+//  (1) copy, or move, from vector - need Graph *;
+//  (2) create with a given size, null-initialized; - need Graph *;
+//  (3) create empty, and then fill in later
+//      using init( Graph* , std::vector<T> const &)
+//      or init( Graph* , std::vector<T> &&)
+//      or init( Graph *, size )
+//      or init( Graph *, T const *ptr, size );
+//      or init_move( Graph *, T *ptr, size );
+
+// With option 3, it is assumed that the 'init' is done during the constructor of
+// a host object - this is needed during deserialize, for instance.
+// the 'len' is 32 bits so this type occupies 2 pointers, vs. 3 for std::vector.
+//
+// If 'T' has a destructor, the cratevec's destructor will invoke that on
+// each element of the vector, in reverse order.
+// when the 'move-from' mechanisms to init from 'std::vector<T> &&' are used,
+// the supplied vector will not be cleared; but its elements will all be
+// 'moved-from'.
+
+template <typename T> class cratevec {
+    T *m_ptr;
+    unsigned m_len;
+    using vec_t = std::vector<T>;
+    static constexpr bool need_dtor = !std::is_trivially_destructible<T>::value;
+
+  public:
+    using iterator = T *;
+    using const_iterator = T const *;
+    using value_type = T;
+    using size_type = size_t;
+    using difference_type = ptrdiff_t;
+    using reference = T &;
+    using const_reference = T const &;
+
+    cratevec() : m_ptr(nullptr), m_len(0) {}
+    cratevec(Graph *g, vec_t const &v) : cratevec()
+    {
+        if (!v.empty()) init(g, v.data(), v.size());
+    }
+    cratevec(Graph *g, vec_t &&v) : cratevec()
+    {
+        if (!v.empty()) init_move(g, v.data(), v.size());
+    }
+    cratevec(Graph *g, size_t n) : cratevec() { init(g, n); }
+    cratevec(cratevec const &) = delete;
+    cratevec(cratevec &&) = delete;
+    ~cratevec()
+    {
+        if constexpr (need_dtor) {
+            if (m_len > 0) {
+                T *const ptr0 = m_ptr;
+                T *ptr = ptr0 + m_len;
+                do {
+                    ptr--;
+                    ptr->~T();
+                } while (ptr > ptr0);
+            }
+        }
+    }
+
+    cratevec &operator=(cratevec const &) = delete;
+    cratevec &operator=(cratevec &&) = delete;
+
+    void init(Graph *g, T const *data, size_t n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = graph_crate(*g)->alloc_array<T, true>(n);
+            std::uninitialized_copy_n(data, n, m_ptr);
+            m_len = n;
+        }
+    }
+    void init_move(Graph *g, T *data, size_t n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = graph_crate(*g)->alloc_array<T, true>(n);
+            std::uninitialized_move_n(data, n, m_ptr);
+            m_len = n;
+        }
+    }
+    // these methods get used during deserialize, so allow it to pass crate in directly.
+    void init(hnnx::Crate *const crate_p, size_t const n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = crate_p->alloc_array<T, true>(n);
+            std::uninitialized_value_construct_n(m_ptr, n);
+            m_len = n;
+        }
+    }
+    // The DCrate version is defined in dcrate_inlines.h
+    void init(hnnx::DCrate *crate_p, size_t n);
+
+    void init(Graph *const g, size_t const n) { init(graph_crate(*g), n); }
+    void init(Graph *const g, vec_t const &v) { init(g, v.data(), v.size()); }
+    void init(Graph *const g, vec_t &&v) { init_move(g, v.data(), v.size()); }
+
+    iterator begin() noexcept { return m_ptr; }
+    iterator end() noexcept { return m_ptr + m_len; }
+    const_iterator begin() const noexcept { return m_ptr; }
+    const_iterator end() const noexcept { return m_ptr + m_len; }
+    const_iterator cbegin() const noexcept { return m_ptr; }
+    const_iterator cend() const noexcept { return m_ptr + m_len; }
+    size_type size() const noexcept { return m_len; }
+    T *data() noexcept { return m_ptr; }
+    T const *data() const noexcept { return m_ptr; }
+    bool empty() const noexcept { return m_len == 0; }
+    reference operator[](size_type idx) { return m_ptr[idx]; }
+    const_reference operator[](size_type idx) const { return m_ptr[idx]; }
+    reference at(size_type idx)
+    {
+        if (idx >= m_len) throw std::range_error("cratevec");
+        return m_ptr[idx];
+    }
+    const_reference at(size_type idx) const { return const_cast<cratevec &>(*this).at(idx); }
+    reference front() { return m_ptr[0]; }
+    const_reference front() const { return m_ptr[0]; }
+    reference back() { return m_ptr[m_len - 1]; }
+    const_reference back() const { return m_ptr[m_len - 1]; }
+};
+
+} // namespace hnnx
+
+POP_VISIBILITY()
+
+#endif /* CRATE_H_ */
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h
new file mode 100755
index 0000000000000..a48e7bc909904
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h
@@ -0,0 +1,101 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DCRATE_INLINES_H
+#define DCRATE_INLINES_H 1
+
+#include <cassert>
+#include <memory>
+#include <new>
+
+#include "macros_attribute.h"
+#include "deser_concurrent.h"
+#include "crate.h"
+
+namespace hnnx {
+
+// alloc 'amount' bytes with given alignment.
+inline void *DCrate::do_alloc(const size_t align, const size_t amount)
+{
+    size_t basep = size_t(nextp);
+    if (align > 4) {
+        basep = (basep + (align - 1)) & ~(align - 1);
+    }
+    size_t const next_base = basep + amount;
+    if (next_base > (size_t)limitp) return nullptr;
+    nextp = (void *)next_base; // update 'nextp' ...
+    return (void *)basep;
+}
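// Example (annotation, not part of the patch): the align-up arithmetic used in
// do_alloc above, checked on plain integers. For a power-of-two alignment A,
// (p + (A - 1)) & ~(A - 1) rounds p up to the next multiple of A.
static_assert(((0x1003u + 7u) & ~7u) == 0x1008u);
static_assert(((0x1008u + 7u) & ~7u) == 0x1008u); // already aligned: unchanged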
+template <typename T> inline T *DCrate::alloc_array(const size_t n)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T) * n);
+        if (allocp) return (T *)allocp;
+    }
+    return cratep->alloc_array<T, true>(n);
+}
+
+template <typename T, typename... Args> inline T *DCrate::emplace(Args &&...args)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T));
+        if (allocp) {
+            new (allocp) T(std::forward<Args>(args)...);
+            return (T *)allocp;
+        }
+    }
+    return cratep->emplace<T>(std::forward<Args>(args)...);
+}
+
+template <>
+inline void *DCrate::emplace_explicit(Deserz &dctx, deserialize_op_func const init_func,
+                                      deserialize_dtor_func const dtor_func, size_align_code_t const size_al)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(size_al.align(), size_al.size());
+        if (allocp) {
+            init_func(allocp, dctx);
+            return allocp;
+        }
+    }
+    return cratep->emplace_explicit(dctx, init_func, dtor_func, size_al);
+}
+
+// this will be used in place of 'emplace' when the constructor parms
+// are just 'Deserz &'
+template <typename T> inline T *DCrate::emplace0(Deserz &dctx)
+{
+    deserialize_op_func const ctor = [](void *const ptr, Deserz &dctx) -> void * {
+        new (ptr) T(dctx);
+        return ptr;
+    };
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T));
+        if (allocp) {
+            (ctor)(allocp, dctx);
+            return (T *)allocp;
+        }
+    }
+    return (T *)cratep->emplace_explicit(dctx, ctor, nullptr, size_align_code_t::for_type<T>());
+}
+// init method of cratevec using 'DCrate' is defined here to avoid header inclusion madness.
+//
+template <typename T> inline void hnnx::cratevec<T>::init(hnnx::DCrate *crate_p, size_t n)
+{
+    assert(m_len == 0);
+    if (n) {
+        m_ptr = crate_p->alloc_array<T>(n);
+        std::uninitialized_value_construct_n(m_ptr, n);
+        m_len = n;
+    }
+}
+
+} // namespace hnnx
+
+#endif // DCRATE_INLINES_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h
new file mode 100755
index 0000000000000..16db21a082cf1
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h
@@ -0,0 +1,288 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESER_CONCURRENT_H
+#define DESER_CONCURRENT_H 1
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "deser_concurrent_defs.h"
+
+// this is intended to be included only in "deserialize.h"
+
+struct PreloadInfo;
+
+namespace hnnx {
+struct runlist_seg_descriptor;
+class Crate;
+class Deserz;
+class fixup_supplemental_recs;
+class InitTimeSchedule;
+
+// describes a 'span' of the deserialized data
+struct deser_segment_span {
+    void *base;
+    void *limit;
+};
+
+// This describes a partially-decoded segment; includes fixups.
+// This should stay small so we can place it inside Deserz, and std::move it
+// out (to keep the fixup list) when done with the segment.
+struct runlist_fixup_state {
+    unsigned segno = 0;
+    size_t *crate_begin = nullptr;              // where the data starts in the crate
+    runlist_seg_descriptor *seg_desc = nullptr; // Corresponding 'runlist_seg_descriptor' for reference.
+    // The next three are copied from the runlist_auxdata_seg_desc
+    uint32_t base_tensor_index = 0;     // first tensor index defined this segment
+    uint32_t base_blocktable_index = 0; // first blocktable index defined in this segment
+    uint32_t base_sharedobj_index = 0;  // first 'shared_object' index defined in this segment
+    // fixup data
+    size_t *fixup_list_head = nullptr;           // head of the 'fixup list', or null if none.
+    fixup_supplemental_recs *fixup_supplemental; // supplemental fixup list
+
+    runlist_fixup_state() = default;
+    ~runlist_fixup_state() = default;
+    runlist_fixup_state(runlist_fixup_state const &) = default;
+    // *Some* implementations of c++lib require this to have operator= (non-move)
+    // in order for std::vector containing it to be constructed via resize.
+    runlist_fixup_state &operator=(runlist_fixup_state const &) = default;
+    // the move-ctor and move-assign must leave the source with no fixup list,
+    // and segno = 0.
+    runlist_fixup_state(runlist_fixup_state &&from) { do_move_from(std::move(from)); }
+    runlist_fixup_state &operator=(runlist_fixup_state &&from)
+    {
+        do_move_from(std::move(from));
+        return *this;
+    }
+
+  private:
+    // this is used in move-constructor and move-assign; it will always leave 'from'
+    // with certain 'null' values to trap cases where we're using the wrong instance.
+    void do_move_from(runlist_fixup_state &&from)
+    {
+        segno = from.segno;
+        crate_begin = from.crate_begin;
+        seg_desc = from.seg_desc;
+        base_tensor_index = from.base_tensor_index;
+        base_blocktable_index = from.base_blocktable_index;
+        base_sharedobj_index = from.base_sharedobj_index;
+        fixup_list_head = from.fixup_list_head;
+        fixup_supplemental = from.fixup_supplemental;
+        from.segno = 0;
+        from.seg_desc = nullptr;
+        from.fixup_list_head = nullptr;
+    }
+};
+//
+// This contains 'supplemental' fixup records for a segment; there is one instance in each runlist_seg_descriptor,
+// and a pointer to it in the runlist_fixup_state. When the 'runlist_fixup_state' is moved in or out of the Deserz,
+// the pointer to this remains.
+// To avoid the overhead of vec_push_back, this has a static array into which values are recorded;
+// when this is full (or near full), all the records within are appended to the vector in a single operation.
+// At the end of the operation, any remaining records are appended to the vector, but only if the vector
+// is not empty (we can read the records out of the fixed array, if they all fit).
+//
+// The append() is not safe unless 'ensure_room_for' is checked first; you can e.g. do ensure_room_for(3)
+// ahead of doing up to 3 appends.
+// It is best to use a constant as parameter to ensure_room_for, i.e. ahead of code which may append
+// *up to* 4 values, use ensure_room_for(4); this simplifies the inline expansion of 'ensure_room_for',
+// and makes very little difference to performance compared to using the exact value.
+//
+class fixup_supplemental_recs {
+    static constexpr unsigned ARR_SIZE = 64;
+    unsigned num_in_arr = 0;
+    uint32_t fixed_arr[ARR_SIZE];
+    std::vector<uint32_t> var_arr;
+    unsigned n_vec = 0; // = var_arr.size()
+
+  public:
+    void clear();
+    unsigned constexpr size() const { return num_in_arr + n_vec; }
+    void reserve(unsigned const n) { var_arr.reserve(n); }
+    inline void ensure_room_for(unsigned const n)
+    {
+        assert(n <= ARR_SIZE);
+        if (num_in_arr > ARR_SIZE - n) flush_to_vec();
+    }
+    // append allowed only when preceded by 'ensure_room_for'
+    inline void append(uint32_t const val)
+    {
+        assert(num_in_arr < ARR_SIZE);
+        fixed_arr[num_in_arr++] = val;
+    }
+    // use instead of 'ensure_room_for(1); append(val)'
+    inline void push_back(uint32_t const val)
+    {
+        if (num_in_arr > ARR_SIZE - 1) flush_to_vec();
+        fixed_arr[num_in_arr++] = val;
+    }
+    // After all push_back() done, do a 'finish'
+    // and then get_limits() can be used to traverse the data.
+    void finish(); // flushes, but only if the vec is not empty.
+    std::pair<uint32_t const *, uint32_t const *> get_limits() const;
+
+  protected:
+    void flush_to_vec();
+};
+
+// An array of these (size N+1) is used to hold the
+// information used in deserializing each segment.
+// The [N+1] is partially used; some operations may use
+// e.g. arr[i+1].auxinfo.some_field to find out where something
+// ends for the current segment, using the start of the next segment;
+// so entry N-1 needs a next entry.
+
+struct runlist_seg_descriptor {
+    runlist_auxdata_seg_desc auxinfo;   // the data from the 'aux_data' record for this segment
+    runlist_fixup_state segfixup;       // the deserialization state (moved in and out of Deserz as needed)
+    fixup_supplemental_recs fixup_supp; // fixup supplemental recs.
+    deser_segment_span span_to_deser = {};
+    // These are used to configure the last preload in each segment, which preloads a region
+    // which is either partially, or entirely, in the next segment. So, the first two entries
+    // below are actually set at the end of deserialization of the previous segment; the end_preload
+    // is set by the current segment deserialize.
+    // The information stored in [N] is for configuring
+    // the last preload in the last segment, with end_preload set to 'end of crate'; in this case
+    // start_preload could be <= the end of the crate, and then we don't configure it.
+    // likewise the information in [0] is only 'end_preload', which can be used to configure
+    // 'Graph::m_initial_preload' (it should go from start-of-crate to seg[0].end_preload).
+    // In some cases (hopefully, only in testing) we may have segments with no preloads in them,
+    // in which case null pointers will appear in some of these; the ChunkPreload ops need to
+    // be configured by getting info from adjacent segments.
+    PreloadInfo *prev_seg_final_preload{}; // points to the prev segment's final PreloadInfo
+    char *start_preload{};                 // the preload start address for prev seg's final preload
+    char *end_preload{};                   // end address for prev seg's final preload
+};
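// Example (annotation, not part of the patch): two usage sketches for the
// types above, assuming "deser_concurrent.h" is on the include path. The
// begin/end-pointer shape of get_limits() follows the reconstruction above
// and is an assumption, as is the arr[i] / arr[i+1] span convention quoted
// from the comments.
#include <cstdint>
#include "deser_concurrent.h"

// Burst-append into the supplemental fixup records: reserve headroom once
// with a constant, then append; finish() before traversing via get_limits().
inline void fixup_burst_sketch(hnnx::fixup_supplemental_recs &recs, uint32_t a, uint32_t b, uint32_t c)
{
    recs.ensure_room_for(3); // constant headroom for the three appends below
    recs.append(a);
    recs.append(b);
    recs.append(c);
    recs.finish();
}

// Segment i's full crate span, using arr[i] (start) and arr[i+1] (end); the
// usable length may be smaller (auxinfo.crate_seg_len) because of padding.
inline uint32_t seg_crate_span_sketch(const hnnx::runlist_seg_descriptor *arr, unsigned i)
{
    return arr[i + 1].auxinfo.crate_offset - arr[i].auxinfo.crate_offset;
}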
+
+// One instance of this is in Deserializer, called segments.
+// It is created 'empty', and populated when we encounter the valid
+// Aux Data record.
+//
+class DeserSegDescs {
+    unsigned n_segs = 0;
+    // points to an array of n_segs + 1, if n_segs > 0
+    std::unique_ptr<runlist_seg_descriptor[]> seg_arr;
+
+  public:
+    DeserSegDescs() = default;
+    ~DeserSegDescs() = default;
+    DeserSegDescs(DeserSegDescs const &) = delete;
+    DeserSegDescs(DeserSegDescs &&) = default;
+    DeserSegDescs &operator=(DeserSegDescs const &) = delete;
+    DeserSegDescs &operator=(DeserSegDescs &&) = default;
+
+    // these two are used to create the array
+    void set_size(unsigned const n); // used to create sized, empty array
+    runlist_seg_descriptor *data() { return seg_arr.get(); }
+
+    constexpr unsigned num_segs() const { return n_segs; }
+    constexpr bool is_active() const { return n_segs != 0; }
+    // note: 'i' may be 0 .. num_segs(); only can use when 'is_active'.
+    runlist_seg_descriptor &operator[](unsigned const i) { return seg_arr[i]; }
+    runlist_seg_descriptor const &operator[](unsigned const i) const { return seg_arr[i]; }
+
+    // We can add other data in here, to manage the concurrent deserialization.
+    unsigned n_threads = 0;         // set when allocating the 'Deserz' array
+    std::vector<Deserz> deserz_arr; // sized as 'n_threads'.
+
+    // start-of-crate, rounded to a multiple of 32; calculated before any multi-thread
+    // operations. Used to configure Graph::m_initial_preload.
+    void *crate_preload_start_boundary;
+    // end-of-crate, rounded up to multiple of 32. Calculated before any multi-thread
+    // operations. No 'ChunkPreloadOp' will exceed this.
+    void *crate_preload_final_boundary;
+
+    InitTimeSchedule *initSchedule;
+};
+
+// A 'DCrate' is a proxy object stored within Deserz.
+// It has some of the same methods as Crate; but if nextp is not null,
+// it will allocate into the space at 'nextp', limited by 'limitp'.
+// Otherwise it will use the Crate.
+// Most methods are defined as inlines in dcrate_inlines.h
+//
+class DCrate {
+    // these are either both null, or both non-null and 4-aligned.
+    void *nextp = nullptr;
+    void *limitp = nullptr;
+    Crate *cratep = nullptr;
+
+  public:
+    DCrate() {}
+    ~DCrate() {}
+    DCrate(DCrate const &) = default;
+    DCrate(DCrate &&) = default;
+    DCrate &operator=(DCrate const &) = default;
+    DCrate &operator=(DCrate &&) = default;
+    explicit DCrate(Crate &c) : cratep(&c) {}
+    void set_crate(Crate &c) { cratep = &c; }
+    Crate *crate() { return cratep; }
+    bool is_active() const { return nextp != nullptr; }
+
+    constexpr size_t bytes_remaining() const { return (char *)limitp - (char *)nextp; }
+    char *next_loc() { return (char *)nextp; }
+    std::pair<char *, char *> range_remain() { return {(char *)nextp, (char *)limitp}; }
+
+    void set_memory_range(void *base, unsigned len)
+    {
+        nextp = base;
+        limitp = (void *)((char *)base + len);
+    }
+    void remove_memory_range()
+    {
+        nextp = nullptr;
+        limitp = nullptr;
+    }
+
+    // Methods of Crate we want to support (See crate.h for more detail).
+    // Note that the constructors invoked in 'emplace' and 'emplace_explicit'
+    // can and will recursively call 'emplace' to construct their sub-objects.
+    template <typename T, typename... Args> T *emplace(Args &&...args);
+    // variant of 'emplace' which can use the 'emplace_explicit' call to avoid
+    // instantiating the constructor twice
+    template <typename T> T *emplace0(Deserz &dctx);
+    // (this is defined with 'template' args, only so it can be declared here without
+    // forward refs. All are pass-by-value. Only one specialization will be defined).
+    template <typename FI, typename FD, typename SA> void *emplace_explicit(Deserz &dctx, FI, FD, SA);
+    // array allocation, used to make all arrays in crate during deserialize.
+    template <typename T> T *alloc_array(size_t n);
+
+  private:
+    // reserve the specified data in the range, and return pointer to start; or
+    // return null if not possible.
+    void *do_alloc(size_t align, size_t amount);
+};
+
+// defines the encoding in the upper 3 bits of the last word of a 'multi-word' supplemental record
+// all must be 4..7, since a 0 in the msb indicates a 'short' record.
+
+constexpr unsigned SUPPFIXUP_CAT_tensor = 4;
+constexpr unsigned SUPPFIXUP_CAT_sharedobj = 5;
+constexpr unsigned SUPPFIXUP_CAT_blocktable = 6;      // with indices packed in one word
+constexpr unsigned SUPPFIXUP_CAT_blocktable_full = 7; // .. in two words
+constexpr unsigned SUPPFIXUP_CAT_SHIFT = 29u;
+
+bool fixup_encode_for_blocktable(runlist_fixup_state &seginfo, uint32_t idx, uint32_t table_offs, void **ptrloc);
+
+// high-level operations in the 'deserialize by segments' code.
+
+GraphStatus do_multiseg_deser(Deserializer &dctx, size_t ref_deser_pos);
+GraphStatus segmentjob_deserialize_ops(Deserializer &dctx, unsigned segno, unsigned threadno);
+GraphStatus segmentjob_process_fixups(Deserializer &dctx, unsigned segno, unsigned threadno);
+GraphStatus segmentjob_compile_ops(Deserializer &dctx, unsigned segno, unsigned threadno);
+void resolve_chunk_preload_after_multiseg_deser(Deserializer &dctx);
+
+} // namespace hnnx
+
+#endif // DESER_CONCURRENT_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h
new file mode 100755
index 0000000000000..3d72ed7d2de71
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h
@@ -0,0 +1,97 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESER_CONCURRENT_DEFS_H
+#define DESER_CONCURRENT_DEFS_H 1
+
+#include <cstddef>
+#include <cstdint>
+
+namespace hnnx {
+
+// NOTE: this file contains defs for concurrent deserialize which are needed on both decode and prepare
+// side; mostly just the format of the Aux Data records.
+// Defs needed only on decode side are in 'deser_concurrent.h', which #includes this file.
+
+constexpr unsigned DesConcur_MIN_SEGMENTS = 8; // can't have less than this number.
+
+// This is the number of runlist slots in the runlist_auxdata_seg_desc format.
+// It must be >= the actual number. This number is coded into the start of the AuxData
+// payload. If the number gets bigger, the reader of the aux-data
+// record will need to be able to cope with the older, smaller value.
+
+constexpr unsigned DesConcur_MAX_RUNLISTS = 4;
+
+// The 'Aux Data' record describing the runlist partition has a payload formed of
+// a runlist_auxdata_header, followed immediately by N+1 of runlist_auxdata_seg_desc.
+// The number N is in the header; there may be additional words after, which can be
+// ignored
+//
+// Aux Data header record.
+// The 'record_version' is reserved to flag changes in the format, so that
+// if it changes, new skel can understand old records.
+// Currently, it has this format; most changes will expand one of the fields,
+// so following this may be adequate to capture version changes; if it is not,
+// add flags in the upper bits.
+//  bits 31..13 : reserved, 0
+//  bit 12: set if crate sizes are calculated based on 'dynamic tensor' sizes
+//  bits 11..8  length of the header in uint32's
+//  bits 7..3   length of 'segment' record, in uint32's
+//  bits 2..0   .. value of DesConcur_MAX_RUNLISTS
+//
+struct runlist_auxdata_header {
+    unsigned record_version; // see above
+    unsigned numsegs : 16;   // number of segments; >= 8, likely <= 64 but who knows
+    unsigned hdrflags : 16;  // reserved for flags
+    unsigned runlist_offset; // see below
+};
+
+// 'runlist_offset' is the offset, in u32's units, from the 'num_in_tensors' word
+// to the 'n_ops_total' word. This is needed by 'weight share' processing in order to
+// adjust the deser_offset values to accommodate changes in the encoding length of pointers.
+
+// The N segments are described by an array of N+1 of runlist_auxdata_seg_desc;
+// segment i is defined by arr[i] (start) and arr[i+1] (end).
+// An exception is 'crate_seg_len' - this may be less than arr[i+1].crate_offset - arr[i].crate_offset
+// due to padding.
+// In the final record arr[N]:
+//   - crate_seg_len is not used (0)
+//   - The *_list_posn records are the total length of the runlists
+//   - the four 'base_*_index' values are all 1 greater than any index used in the graph
+//
+struct runlist_auxdata_seg_desc {
+    uint32_t deser_offset;  // where the input (pickle) data begins - reference point is the start of 'Runlist' as
+                            // defined in docs/pickle_format.md, i.e. the location of 'n_ops_total' word
+    uint32_t crate_offset;  // offset in crate
+    uint32_t crate_seg_len; // crate length needed (not used in final entry)
+    uint32_t runlist_posn[DesConcur_MAX_RUNLISTS];  // where the segment starts in Op* runlist
+    uint32_t execlist_posn[DesConcur_MAX_RUNLISTS]; // where the segment starts in 'execlist'
+    uint32_t base_opseq_index;      // first 'op_sequence_marker' index used in the segment.
+    uint32_t base_tensor_index;     // first tensor index defined this segment
+    uint32_t base_blocktable_index; // first blocktable index defined in this segment
+    uint32_t base_sharedobj_index;  // first 'shared_object' index defined in this segment
+};
+
+// Bit in the header version indicating crate sizes allow for 'dynamic shapes'.
+// NOTE: if that gets backed out later, leave this here but remove it from DesConcur_AUXDATA_REC_VERSION
+//
+constexpr unsigned DesConcur_AUXDATA_REC_VERSION_DYNSHAPE_SIZES = 4096;
+
+constexpr unsigned DesConcur_AUXDATA_REC_VERSION = // composed of:
+        ((sizeof(runlist_auxdata_header) / sizeof(uint32_t)) * 256   // header size
+         + (sizeof(runlist_auxdata_seg_desc) / sizeof(uint32_t)) * 8 // seg desc len
+         + DesConcur_MAX_RUNLISTS) |
+        DesConcur_AUXDATA_REC_VERSION_DYNSHAPE_SIZES;
+
+// values to be used to 'grow' old crate estimate to compensate for 'dyn shape' mismatch
+constexpr unsigned DesConcur_CrateGrowPerTensor = 2; // number of words per 'tensor'
+constexpr unsigned DesConcur_CrateGrowPerShared = 2; // number of words per 'shared object'
+
+} // namespace hnnx
+
+#endif // DESER_CONCURRENT_DEFS_H
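// Example (annotation, not part of the patch): checking the record_version
// encoding described above against the struct sizes, assuming the header is
// reachable as "deser_concurrent_defs.h" and that 'unsigned' is 32-bit. With
// a 3-word header and a 15-word segment record this is (3*256 + 15*8 + 4) | 4096.
#include <cstdint>
#include "deser_concurrent_defs.h"

static_assert((hnnx::DesConcur_AUXDATA_REC_VERSION & 7u) == hnnx::DesConcur_MAX_RUNLISTS);
static_assert(((hnnx::DesConcur_AUXDATA_REC_VERSION >> 3) & 0x1Fu) ==
              sizeof(hnnx::runlist_auxdata_seg_desc) / sizeof(uint32_t));
static_assert(((hnnx::DesConcur_AUXDATA_REC_VERSION >> 8) & 0xFu) ==
              sizeof(hnnx::runlist_auxdata_header) / sizeof(uint32_t));
static_assert((hnnx::DesConcur_AUXDATA_REC_VERSION & hnnx::DesConcur_AUXDATA_REC_VERSION_DYNSHAPE_SIZES) != 0);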
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h
new file mode 100755
index 0000000000000..43f14039fd1ad
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h
@@ -0,0 +1,68 @@
+//==============================================================================
+//
+// Copyright (c) 2021-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESERIALIZE_TENSORS_H
+#define DESERIALIZE_TENSORS_H 1
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "limits.h"
+#include "log.h"
+
+#include "forward_classes.h"
+#include "serdes_tensors.h"
+
+namespace hnnx {
+
+// see the comment in serdes_tensors.h for an overview of how this works.
+
+class Deserializer;
+
+class DeserTensorConn : public SerTensorConnDefs {
+    typedef unsigned tensor_idx;
+    typedef Tensor const *ptr_type;
+
+    // this collects all of the tensor_defs we have seen. index is seq_index-1.
+    std::vector<ptr_type> defined_tensors;
+
+  public:
+    DeserTensorConn() {}
+    // process a tensor definition
+    void tensor_def(Deserz &, ptr_type);
+    // process n tensor refs.
+    void tensor_refs(Deserz &, ptr_type *ptrs, unsigned num);
+    // process a tensor ref
+    void tensor_ref(Deserz &dctx, ptr_type &ptr) { tensor_refs(dctx, &ptr, 1); }
+
+    // TODO: remove these two; we don't use them, and should not.
+    // read an identity (for use in a subsequent need_fixup)
+    tensor_idx read_identity(Deserz &);
+    // apply the identity to 'fix' a tensor pointer (usually now, sometimes later)
+    void need_fixup(tensor_idx ident, ptr_type *dst);
+
+    // 'reserve' the defined tensors to avoid allocation overhead...
+    inline void reserve_tensors(const size_t n) { defined_tensors.reserve(n); }
+    // resize the 'defined tensors' table to its full capacity (specified).
+    // Used only in multi-thread deserialize, prior to deserializing the runlist.
+    inline void resize_tensordef_table(const size_t n) { defined_tensors.resize(n); }
+
+    // this is for use by the 'reference fixup' code, in concurrent deserialize.
+    std::vector<ptr_type> const &get_defined_tensors() const { return defined_tensors; }
+
+  protected:
+    tensor_idx read_identity_inline(Deserz &);
+    void apply_fixup_inline(tensor_idx idx, ptr_type *dst);
+};
+
+} // namespace hnnx
+
+#endif // DESERIALIZE_TENSORS_H
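The def/ref protocol declared above reduces to a growing pointer table plus 1-based back-references: each `tensor_def` appends, and each ref deserializes an index and looks the pointer up. A toy model of just that bookkeeping (not the SDK implementation; `Tensor` here is a stand-in type):

    #include <cstdint>
    #include <vector>

    struct Tensor {}; // stand-in for illustration

    struct ToyTensorConn {
        std::vector<Tensor const *> defined; // slot i holds seq_index i+1
        void def(Tensor const *t) { defined.push_back(t); }
        Tensor const *ref(uint32_t seq_index) const { // seq_index >= 1
            return defined[seq_index - 1];
        }
    };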
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h
new file mode 100755
index 0000000000000..7312ae8bdd948
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h
@@ -0,0 +1,761 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESERIALIZER_H
+#define DESERIALIZER_H 1
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "limits.h"
+#include "dtype.h"
+#include "log.h"
+#include "allocator.h"
+#include "op_extra_info.h"
+
+#include "serialize_defs.h"
+#include "forward_classes.h"
+#include "deserialize_tensors.h"
+#include "macros_attribute.h"
+#include "const_extent_descriptor.h"
+#include "weak_linkage.h"
+#include "size_align_code.h"
+#include "deser_concurrent.h"
+#include "hexagon_nn_types.h"
+
+namespace hnnx {
+class DMA_Manager;
+class Crate;
+/**
+ * @brief \ref Serializer and \ref Deserializer are modules that provide
+ * a mechanism to flatten (serialize) and reconstruct (deserialize)
+ * primitive and user-defined data types. The initial objective
+ * was to create an in-memory representation of the optimized
+ * \ref Graph on x86 which can then be reconstructed and executed on
+ * a qdsp target; essentially, a means of Graph caching.
+ *
+ */
+using tensor_deserializer_fn = uptr_Tensor (*)(Deserz &);
+
+using deserialize_op_func = void *(*)(void *, Deserz &); // Allocation function
+using deserialize_dtor_func = void (*)(Graph *, void *); // Deallocation function
+class SimpleOpBase;
+using deserialize_make_unique = std::unique_ptr<SimpleOpBase> (*)();
+
+struct op_deserializer_fn {
+    op_deserializer_fn(deserialize_op_func init_func_in, const size_align_code_t sizeal_in)
+        : init_func(init_func_in), size_align_code(sizeal_in)
+    {
+    }
+    op_deserializer_fn(deserialize_op_func init_func_in, deserialize_dtor_func dtor_func_in,
+                       const size_align_code_t sizeal_in)
+        : dtor_func(dtor_func_in), init_func(init_func_in), size_align_code(sizeal_in){};
+    op_deserializer_fn(const op_deserializer_fn &) = default;
+    op_deserializer_fn(op_deserializer_fn &&) = default;
+    op_deserializer_fn &operator=(const op_deserializer_fn &) = delete;
+    deserialize_dtor_func dtor_func = nullptr;
+    deserialize_op_func init_func = nullptr;
+    const size_align_code_t size_align_code{};
+    inline constexpr size_t get_size() const { return size_align_code.size(); }
+    inline constexpr size_t get_align() const { return size_align_code.align(); }
+};
+
+// Here's a quick and dirty way to make these maps go faster: compare string_views starting with the length;
+// if the lengths are the same, then compare the middle character, and if that's also the same,
+// use memcmp. This avoids getting slowed down by a lot of long common prefixes in the type names,
+// and we don't care about the weird ordering it generates.
+//
+struct trick_stringview_lt {
+    bool operator()(std::string_view const &a, std::string_view const &b) const
+    {
+        unsigned const na = a.size();
+        unsigned const nb = b.size();
+        if (na != nb) return na < nb;
+        char const *const pa = a.data();
+        char const *const pb = b.data();
+        if (pa == pb || na == 0) return false; // pa==pb is a common case.
+        unsigned const char_a = pa[na >> 1];
+        unsigned const char_b = pb[na >> 1];
+        if (char_a != char_b) return char_a < char_b;
+        return ::memcmp(pa, pb, na) < 0;
+    }
+};
+
+using op_deserializer_map_t = std::map, trick_stringview_lt>;
+using tensor_deserializer_map_t = std::map;
+using cexdesc_deserializer_map = std::map;
+
+using const_extent_t = std::pair;
+using weight_buf_deserializer_map = std::map;
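+// [editor's note, not in the original header] The comparator above is still a
+// valid strict weak ordering (a consistent length / middle-character / memcmp
+// tie-break chain), which is all std::map requires; it is just not
+// lexicographic. For keys like long templated type names sharing big common
+// prefixes, most comparisons are settled by the length or the middle character
+// and never scan the prefix. Usage is the ordinary ordered-map pattern, e.g.:
+//     std::map<std::string_view, int, trick_stringview_lt> m; // any mapped type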
+/**
+ * @brief Deserializer class to reverse the serialization
+ * process and reconstruct the data for specific types
+ *
+ */
+class Deserz : public DeSerError {
+    friend class Deserializer; // weirdly, sometimes a derived class needs to be a friend.
+    friend class DeserTensorConn;
+
+  protected:
+    Deserz(Deserializer *full_deser, char const *p, size_t n, Graph *g = nullptr);
+
+  public:
+    // I want to make this protected, but can't.
+    // Even code which has access to a protected copy_ctor
+    // of foo can't invoke .resize(n, foo_inst) on a std::vector<foo>. This
+    // seems like a defect in C++. Applies to various 'emplace' methods too;
+    // the 'emplace' can only ever use public ctors.
+    Deserz(Deserz const &) = default;
+
+  public:
+    virtual ~Deserz(); // please keep this as the first virtual method declared.
+
+    // These three are ONLY TO BE USED when setting up a Deserz to start processing a segment.
+    void setup_source_span(deser_segment_span const &);
+    void setup_dcrate_out(void *base, size_t len);
+    void setup_next_tensor_index(unsigned const idx) { next_tensordef_index = idx; }
+
+    typedef uint32_t object_identity_type;
+
+    // Note, various accessor methods are defined as inlines below 'class Deserializer'.
+    // true if this Deserz is really an instance of Deserializer.
+    constexpr bool is_base_deser() const;
+
+    using op_deserialize_fn_list_t = std::vector;
+    using tensor_deserialize_fn_list_t = std::vector;
+
+    op_deserialize_fn_list_t &get_op_deserialize_fn_list();
+    tensor_deserialize_fn_list_t &get_tensor_deserialize_fn_list();
+    std::vector &get_blocktable_link_table();
+    // when deserializing an op:
+    //  - call deserialize_tensor_ref (or _refs) on all the input tensor pointers
+    //  - pass all output tensor addresses to deserialize_tensor_def
+    // The sequence must match serialization; note that the deserialize-ctor of Tensor
+    // calls deserialize_tensor_def on itself, so there is no need to call it elsewhere,
+    // except for specialized types which are constructed otherwise during depickle (e.g.,
+    // types embedded in the Op).
+    //
+    // Some ops have multiple copies of some input tensor pointers; for these, it's possible to
+    // serialize just one reference, and then deserialize it using
+    //     auto id = deserialize_object_identity()   // <- corresponds to serialize_tensor_ref
+    //     need_tensor_fixup( id, &first_tensor_pointer);
+    //     (other deserialize activity can happen here)
+    //     need_tensor_fixup( id, &second_tensor_pointer);
+
+    void deserialize_tensor_def(Tensor const *tensor_ptr);
+    void deserialize_tensor_ref(Tensor const *&where);
+    void deserialize_tensor_refs(Tensor const **ptrs, unsigned n);
+    template <typename T> void deserialize_tensor_ref(T const *&where);
+    template <typename T> void deserialize_tensor_refs(T const **ptrs, unsigned n);
+    object_identity_type deserialize_object_identity();
+    void need_tensor_fixup(object_identity_type oid, Tensor const **where);
+
+    Graph &graph() const { return *graph_ptr; }
+    Crate *crate() { return d_crate.crate(); }
+    DCrate *dcrate() { return &d_crate; }
+    DeserSegDescs const &get_segments() const; // gets a ref to the associated 'segments' object
+    op_deserializer_map_t const &get_op_deser_map() const { return *op_deserializer_map; }
+
+    bool is_aligned_const_format() const;
+    bool has_pending_tensor_updates();
+
+    bool is_shared_dynamic_tensor_shape_format() const;
+
+    fa::RuntimeAllocator *allocator;
+    DCrate d_crate; // contains a crate pointer
+
+  protected:
+    // hoist pointers to these maps into Deserializer to avoid static lock overhead
+    op_deserializer_map_t const *op_deserializer_map;
+    tensor_deserializer_map_t const *tensor_deserializer_map;
+    Graph *graph_ptr{};
+    Deserializer *full_deser;
+
+    char const *bufstart;  // start of current buffer
+    char const *bufend;    // first byte we can't read
+    char const *bufp;      // next to read
+    char const *buf_limit; // <= bufend; where 'fill_buffer' needs to be called.
+    size_t bytes_filled;   // bytes previously filled
+
+    uint32_t op_flags;
+    OpExtraInfo op_extra_info;
+
+    unsigned next_tensordef_index = 1; // belongs to 'tensorconn' but needs to be in Deserz.
+    // 'format version'. Currently the only values used are 0 = classic, 1 = July/2023.
+    // Only access through methods like .classic_format();
+    // This is changed to a non-zero value based on seeing certain Aux Data records
+    // (which must appear before the allocator).
+    int format_version = 0;
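+    // [editor's note, not in the original header] Invariant of the four
+    // buffer pointers above, for orientation:
+    //
+    //   bufstart ........ bufp ........ buf_limit ..... bufend
+    //   |<- consumed ->|  ^next read    ^refill point   ^hard end
+    //
+    // simple_deserialize() (below) reads at bufp and calls fill_buffer()
+    // once bufp reaches buf_limit; keeping buf_limit <= bufend leaves slack
+    // so a short read near the end cannot run past the buffer.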
+    // this is used in multi-thread decoding. It is important that
+    // it remains null-constructed if the object is really a base of Deserializer;
+    // it is only used in 'segment' Deserz instances.
+    runlist_fixup_state seg_fixup_state{};
+
+    /**
+     * @brief throws an error, since the deserializer detected
+     * deserialization on insufficient bytes, i.e. an underflow
+     *
+     */
+    API_EXPORT virtual char const *fill_buffer(); // called for underflow on a short operation
+
+    /**
+     * @brief Deserialize data of the specified length and write it into
+     * a buffer provided by the caller
+     *
+     * @param[out] p buffer to write to
+     * @param[in] len length of the \ref bufp to read from
+     * @param[in] align if true, skip input bytes to a boundary of 4
+     */
+    API_EXPORT virtual void deserialize_fread(void *p, size_t len, bool align);
+
+    /**
+     * @brief Get the current position of the buffer from which the next data will be read
+     *
+     * @return size_t offset from buffer start
+     */
+    size_t buffer_offset() const { return bufp - bufstart; }
+    /**
+     * @brief Available buffer size remaining for deserialization
+     *
+     * @return size_t remaining bytes
+     */
+    size_t buffer_remain() const { return bufend - bufp; }
+
+    /**
+     * @brief deserialize the buffer for type T
+     *
+     * @retval T returns the deserialized value of type T
+     *
+     * Note: This is the templated API called by the deserialize_T() functions
+     *
+     * Note: Cannot be used for more than 4 bytes; there is a specialized version to read u64.
+     */
+    template <typename T> T simple_deserialize()
+    {
+        static_assert(sizeof(T) <= 4, "can only read sizeof(T) <= 4");
+        constexpr size_t W = 4;
+        char const *curr_p = bufp;
+        if (curr_p >= buf_limit) {
+            curr_p = fill_buffer();
+        }
+        T const val = *(T const *)(curr_p);
+        bufp = curr_p + W;
+        return val;
+    }
+    // see the comment above deserialize_shared_obj.
+    API_EXPORT std::pair<void const *, void const **> deserialize_shared_obj_func(void const **ptrloc);
+    API_EXPORT uint64_t deser_u64_slowpath();
+    void initial_l2fetch(); // called only from the ctor
+
+  public:
+    inline constexpr bool classic_format() const { return format_version == 0; }
+    /**
+     * @brief deserialize data of a simple scalar type via simple_deserialize
+     *
+     * Note: the below are the only types supported for deserialize_type
+     */
+    API_EXPORT uint64_t deserialize_uint64(); // inline later
+    inline float deserialize_float() { return simple_deserialize<float>(); }
+    inline uint32_t deserialize_uint32() { return simple_deserialize<uint32_t>(); }
+    inline NN_INT32_T deserialize_int32() { return simple_deserialize<NN_INT32_T>(); }
+    inline int16_t deserialize_int16() { return simple_deserialize<int16_t>(); }
+    inline uint16_t deserialize_uint16() { return simple_deserialize<uint16_t>(); }
+    inline int8_t deserialize_int8() { return simple_deserialize<int8_t>(); }
+    inline uint8_t deserialize_uint8() { return simple_deserialize<uint8_t>(); }
+
+    inline uint64_t deserialize_namesig() { return deserialize_uint64(); }
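+    // [editor's note, not in the original header] All of the scalar wrappers
+    // above funnel into simple_deserialize<T>(), which advances the read
+    // position by a full 4-byte word (W == 4) regardless of sizeof(T), e.g.:
+    //     uint8_t a = dctx.deserialize_uint8(); // consumes 4 bytes
+    //     uint8_t b = dctx.deserialize_uint8(); // consumes 4 more
+    // Two byte-sized reads consume 8 bytes of pickle: the stream is
+    // word-granular by design, which keeps every read 4-byte aligned.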
+    // note, this is defined as an inline in deserializer.cc and is not available elsewhere
+    tensor_deserializer_fn deserialize_tensor_identification(unsigned tensor_class_index);
+
+    // deserialize a string
+    // **NOTE** will throw a runtime error if called on a Deserz which is not really a Deserializer.
+    API_EXPORT std::string_view deserialize_str();
+
+    uint32_t get_op_flags() const { return op_flags; };
+    void clear_op_flags() { op_flags = 0; };
+    void set_op_flags(uint32_t f) { op_flags = f; };
+
+    const OpExtraInfo &get_op_extra_info() const { return op_extra_info; };
+    void clear_extra_info() { op_extra_info.clear(); };
+    void set_op_extra_info(OpExtraInfo in_op_extra_info) { op_extra_info = in_op_extra_info; };
+
+    /**
+     * @brief deserialize a buffer of the specified size
+     *
+     * @param[in] alloc_size number of bytes to read from \ref bufp
+     * @param[out] ptr destination buffer for the read bytes
+     * @return size_t number of bytes actually read
+     */
+    API_EXPORT size_t deserialize_buf(size_t alloc_size, void *ptr);
+    /**
+     * @brief similar to deserialize_buf, but first deserializes a
+     * uint32_t byte count that should match the alloc_size
+     *
+     * @param[in] alloc_size number of bytes to read from \ref bufp
+     * @param[out] ptr destination buffer for the read bytes
+     * @return size_t number of bytes actually read
+     */
+    API_EXPORT size_t deserialize_buf_withlen(size_t alloc_size, void *ptr);
+    // deserialize a pointer as 64 bits
+    inline void *deserialize_ptr() { return (void *)size_t(deserialize_uint64()); }
+
+    template <typename T> T deserialize_type();
+
+    template <typename T, size_t N> std::array<T, N> deserialize_array();
+
+    /**
+     * @brief convenience wrappers for deserialize functions that
+     * return different numbers of values of uint32_t type
+     *
+     * @return std::tuple of the uint32_t data deserialized
+     */
+    // convenience wrappers (to reduce inlined code size w/o much loss of speed)
+    API_EXPORT std::tuple<uint32_t, uint32_t> deserialize_uint32_x2();
+    API_EXPORT std::tuple<uint32_t, uint32_t, uint32_t> deserialize_uint32_x3();
+    API_EXPORT std::tuple<uint32_t, uint32_t, uint32_t, uint32_t> deserialize_uint32_x4();
+
+    API_EXPORT void deserialize_uint32_arr(uint32_t *p, size_t N);
+
+    // to reduce code size in the templates, we can deserialize arrays of
+    // N uint32 to size_t
+    API_EXPORT void deserialize_uint32_arr_sizet(size_t *p, size_t N);
+
+    /**
+     * @brief deserialize an array of uint32_t data, widening to size_t
+     *
+     * @tparam N size of the array
+     * @return std::array containing the deserialized values
+     */
+    template <size_t N> std::array<size_t, N> deserialize_uint32_array_sizet()
+    {
+        std::array<size_t, N> res;
+        deserialize_uint32_arr_sizet(&res[0], N);
+        return res;
+    }
+
+    //
+    // This is used for shared objects like Shape and Interface.
+    // It deserializes the index, and decides if this is the first instance.
+    //  - you must always pass the address which needs to point to it, though it
+    //    will not be set by this function.
+    //  - if retval.second is null, then the object was previously deserialized,
+    //    and retval.first is the pointer to it.
+    //  - otherwise, the caller must deserialize the instance, and store the pointer
+    //    at *retval.second. retval.first will be null in this case.
+    // In scenarios where delayed resolution is used, the return may be {token, null},
+    // where 'token' is actually a delayed resolution token.
+    //
+    template <typename T>
+    std::pair<T const *, T const **> // see above
+    deserialize_shared_obj(T const **const loc)
+    {
+        auto const res = deserialize_shared_obj_func((void const **)loc);
+        return {(T const *)res.first, (T const **)res.second};
+    }
+
+    // Increment the current read position of the internal buffer without reading anything
+    void deserialize_skip_words(size_t nwords);
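+    // [editor's note, not in the original header] Usage sketch for
+    // deserialize_shared_obj, following the contract documented above
+    // ('Shape', 'dctx' and 'deserialize_new_shape' are illustrative names):
+    //     Shape const *shp = nullptr;
+    //     auto const r = dctx.deserialize_shared_obj<Shape>(&shp);
+    //     if (r.second == nullptr) {
+    //         shp = r.first;                     // seen before: reuse it
+    //     } else {
+    //         shp = deserialize_new_shape(dctx); // first instance: build it
+    //         *r.second = shp;                   // record it for later refs
+    //     }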
+    // Apply the 'pointer fixups' contained within seg_info. This can
+    // be called with 'this' being any Deserz or Deserializer associated
+    // with the operation (it is only used to access tables in Deserializer).
+    // This can only be done on a given segment when all previous segments have
+    // been deserialized; so if we have one Deserz per thread, we need
+    // to 'move' the seg_info object out of it after completing the segment,
+    // and use this later to do the fixups.
+    // Returns true if ok, false if failed.
+    // Will leave the fixup list empty on success.
+    bool apply_segment_fixups(runlist_fixup_state &seg_info) const;
+
+    // Methods to move the 'seg_fixup_state' object in or out.
+    void install_seg_fixup_state(runlist_fixup_state &&src) { seg_fixup_state = std::move(src); }
+    runlist_fixup_state extract_seg_fixup_state() { return std::move(seg_fixup_state); }
+    void extract_seg_fixup_state_to(runlist_fixup_state &dest) { dest = std::move(seg_fixup_state); }
+
+    // and a read-only accessor
+    runlist_fixup_state const &fixup_state() const { return seg_fixup_state; }
+
+    // for Tensor::deserialize_blocktable
+    inline bool fixup_encode_for_blocktable(uint32_t const idx, uint32_t const table_offs, void **const ptrloc)
+    {
+        return hnnx::fixup_encode_for_blocktable(seg_fixup_state, idx, table_offs, ptrloc);
+    }
+};
+
+/////////////////
+
+class Deserializer : public Deserz {
+    friend class Deserz;
+
+  public:
+    /**
+     * @brief Construct a new Deserializer object
+     *
+     * @param[in] p buffer that needs to be deserialized
+     * @param[in] n length of the buffer
+     * @param[in] g pointer to the Graph object to deserialize (usually null, since the object
+     *            is being passed to the Graph::Graph ctor to deserialize; that ctor
+     *            must immediately call dctx.set_graph(*this) )
+     */
+    API_EXPORT Deserializer(char const *p, size_t n, Graph *g = nullptr);
+    API_EXPORT virtual ~Deserializer(); // please keep this as the first virtual method declared.
+
+    void set_graph(Graph &g);
+
+    inline void deserialize_tensor_def(Tensor const *tensor_ptr) { tensorconn.tensor_def(*this, tensor_ptr); }
+    inline void deserialize_tensor_ref(Tensor const *&where) { tensorconn.tensor_ref(*this, where); }
+    inline void deserialize_tensor_refs(Tensor const **ptrs, unsigned n) { tensorconn.tensor_refs(*this, ptrs, n); }
+    inline void deserialize_pred_conditions(std::vector &pred_cond_list)
+    {
+        // get the number of items in the vector
+        uint32_t num_of_objects = deserialize_uint32();
+        assert(num_of_objects <= UINT32_MAX);
+        if (num_of_objects > 0) {
+            pred_cond_list.resize(num_of_objects);
+
+            // TODO: remove this once we know how to update it at runtime.
+            // Currently setting it to true
+            pred_cond_list.at(0) = 1;
+        }
+    }
+    template <typename T> inline void deserialize_tensor_ref(T const *&where)
+    {
+        static_assert(std::is_base_of<Tensor, T>::value);
+        tensorconn.tensor_ref(*this, *(Tensor const **)&where);
+    }
+    template <typename T> void deserialize_tensor_refs(T const **ptrs, unsigned n)
+    {
+        static_assert(std::is_base_of<Tensor, T>::value);
+        tensorconn.tensor_refs(*this, (Tensor const **)ptrs, n);
+    }
+    inline object_identity_type deserialize_object_identity() { return tensorconn.read_identity(*this); }
+
+    inline void need_tensor_fixup(object_identity_type oid, Tensor const **where) { tensorconn.need_fixup(oid, where); }
+    inline void resolve_fixups()
+    {
+        [[maybe_unused]] const object_identity_type newval = tensorconn.read_identity(*this);
+        assert(newval == 0);
+    }
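+    // [editor's note, not in the original header] Construction sketch per the
+    // ctor contract above; 'blob' and 'blob_len' are illustrative names, and
+    // the Graph entry point is hypothetical:
+    //     hnnx::Deserializer dctx(blob, blob_len); // g == nullptr here
+    //     // the Graph deserialize-ctor that receives dctx must immediately
+    //     // call dctx.set_graph(*this) before reading anything else.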
+    constexpr bool is_aligned_const_format() const { return aligned_const_format_flag; }
+    void set_aligned_const_format(const bool v = true) { aligned_const_format_flag = v; }
+
+    constexpr bool is_shared_dynamic_tensor_shape_format() const { return shared_dynamic_tensor_shape; }
+    void set_shared_dynamic_tensor_shape_format(const bool v = true) { shared_dynamic_tensor_shape = v; }
+
+    PUSH_WARNING()
+    DISABLE_WARNING("-Wcast-qual", MSVC_NO_EQUIV)
+    // valid when the entire pickle, in const_extent format, is loaded as a single, persistent dma buffer
+    inline unsigned char *get_weight_pointer() { return ((unsigned char *)bufstart) + (4 * pickle_len_words); };
+    POP_WARNING()
+    inline size_t get_weight_size() { return (bufend - bufstart) - (4 * pickle_len_words); };
+
+    inline op_deserialize_fn_list_t &get_op_deserialize_fn_list() { return op_deserialize_fn_list; }
+    inline tensor_deserialize_fn_list_t &get_tensor_deserialize_fn_list() { return tensor_deserialize_fn_list; }
+
+    // The next 4 methods are used to support 'deserialize_by_segments'.
+    // 'get_forward_span' returns a 'deser_segment_span' (pair of pointers) for a region of deserialized data
+    // from 'ref + start' up to 'ref + end', where start and end (0 <= start < end) are byte offsets
+    // relative to some position 'ref' in the deserialized data, and 'ref' is the value which bytes_consumed()
+    // returned at that reference point. All should be multiples of 4.
+    deser_segment_span get_forward_span(size_t ref, size_t start, size_t end);
+    // used to get a reference point for bytes_consumed
+    size_t bytes_consumed() const { return bufp - bufstart; }
+    // used to skip past the last 'get_forward_span' we did
+    void skip_to_after_span(deser_segment_span const &);
+    // resize tables: tensor, shared_obj, linktable, according to the info in final_segdesc
+    void resize_object_tables(runlist_auxdata_seg_desc const &final_desc);
+
+    uint32_t crate_size_according_to_segments() const;
+
+  protected:
+    std::vector objindex; // index of pointers to shape, etc.
+    // the state of the 'tensor connectivity' deserialize engine.
+    DeserTensorConn tensorconn;
+    bool aligned_const_format_flag = false;
+    bool shared_dynamic_tensor_shape = false;
+
+    // this is used in 'deserialize_str', so it ideally should be in Deserz; but
+    // it's pretty large, so we put it here and forbid calling deserialize_str
+    // on a Deserz which is not really a Deserializer. We only use it to decode
+    // 'classic' pickles, so this is ok.
+    char name_buf[4096]; // used for string view
+
+    // do the reference fixups on a segment. Return true if OK.
+    // See Deserz::apply_segment_fixups for the public API.
+    static bool do_segment_fixups(runlist_fixup_state &seginfo, Deserz const &dctx0);
+
+  public:
+    inline constexpr bool classic_format() const { return format_version == 0; }
+    inline void set_format_2307() { format_version = 1; }
+
+    // This is called when a 'class index' Aux Data is encountered.
+    // It must deserialize exactly the indicated number of payload words.
+    // is_tensor = false for "Co" (op class index), and true for "Ct" (tensor class index)
+    API_EXPORT void auxdata_class_index(unsigned payload_words, bool is_tensor);
+    //
+    // called when an 'Nt' Aux Data is encountered, which provides some array sizes for the
+    // deserialization.
+    // It must deserialize exactly the indicated number of payload words.
+    API_EXPORT void auxdata_temparr_sizes(unsigned payload_words);
+    // Called when an 'AuxTag_deserializeSegments' is encountered. If it likes
+    // the record, it will set up the 'segments' object.
+    API_EXPORT void auxdata_deserialize_segments(unsigned payload_words);
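+    // [editor's note, not in the original header] Sketch of how the segment
+    // machinery above fits together ('seg' and 'worker' are illustrative);
+    // offsets are byte offsets, multiples of 4, from one reference point:
+    //     size_t const ref = dctx.bytes_consumed(); // once, at the runlist
+    //     auto const span = dctx.get_forward_span(ref, seg.start, seg.end);
+    //     worker.setup_source_span(span);           // per-thread Deserz
+    //     dctx.skip_to_after_span(span);            // main cursor moves on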
+    // called when a 'KS' Aux Data is encountered, which provides a const_extent_descriptor.
+    // It must deserialize exactly the indicated number of payload words.
+    API_EXPORT int auxdata_read_const_extent_descriptor(const unsigned payload_words);
+    // helper for the above. payload_words is the length WITH PADDING
+    API_EXPORT int extract_const_extent_name(const unsigned payload_words, std::string &retVal);
+
+    // Extract a std::vector containing the 'const extent descriptor' table,
+    // from a given offset (in units of 32-bit words) relative to the start of the pickle,
+    // or from a separate pointer (if a separate buffer for the weights was passed in).
+    // This does not affect the current position.
+    // If there is a problem, it returns an empty vector; the caller *must* check and report.
+    // This uses hnnx::const_extent_hdr_check to understand how much it should read,
+    // and to do a basic check.
+    API_EXPORT std::vector extract_const_extent_table(size_t posn_in_words);
+    std::vector extract_const_extent_table(hexagon_nn_wide_address_const_t weight_data,
+                                           const size_t weight_size);
+    // given a destination char pointer, pre-filled with NULs, fills it in with the name of the const_extent.
+    // caller must provide a destination of sufficient length
+    std::string name_from_weight_data(hexagon_nn_wide_address_const_t weight_data, const uint32_t weight_length);
+    // helper func for the above. returns -1 if the name is not present.
+    std::string get_name(hexagon_nn_wide_address_const_t weight_data, const uint32_t weight_length);
+    // given a vector of weight_data buffers, stores them all in the appropriate map
+    void store_named_weight_bufs(const hexagon_nn_wide_address_const_t *const buffers, const uint64_t *const lengths,
+                                 const unsigned num_buffers);
+    //
+    // copy 'len' bytes of data at offset offs_bytes in the pickle into location dstp.
+    // returns true if it's possible. You can maybe pass a DMA_Manager to have it queued...
+    // offs_bytes is defined as uint64_t to support possible 'far' data on hexagon.
+    API_EXPORT bool extract_const_extent_data(uint64_t offs_bytes, size_t len, void *dstp, DMA_Manager *dma = nullptr);
+    // same, using an external const_extent
+    bool extract_const_extent_data(uint64_t offs_bytes, size_t len, void *dstp,
+                                   hexagon_nn_wide_address_const_t weight_data, const size_t weight_length);
+
+    // This extracts the 'objindex', when it is needed e.g. to 'patch' interfaces.
+    // Must be done only after deserializing, and can only be done once.
+    std::vector extract_objindex() { return std::move(objindex); }
+
+    DeserSegDescs segments; // array of runlist_seg_descriptor, empty if not doing multiseg.
+
+    // this is used to pass the offset of the const-extent-descriptor (recorded as pickle_len)
+    // to the alloc->deserialize.
+    size_t pickle_len_words;
+
+    // OPTIONAL maps from weight buffer names to the descriptors and the buffers, respectively
+    cexdesc_deserializer_map named_cexdescs;
+    weight_buf_deserializer_map named_weight_bufs;
+
+    void *uncached_ptr;
+    uint32_t uncached_len;
+
+    std::vector op_deserialize_fn_list;
+    std::vector tensor_deserialize_fn_list;
+
+    // used to 'link' shared blocktables during deser.
+    std::vector blocktable_link_table;
+};
+
+/////////////////
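+// [editor's note, not in the original header] Sketch of copying weight bytes
+// out of a const-extent pickle with the methods above ('offs' and 'len' are
+// illustrative; error handling elided):
+//     std::vector<unsigned char> dst(len);
+//     bool const ok = dctx.extract_const_extent_data(offs, len, dst.data());
+//     // pass a DMA_Manager as the 4th argument to have the copy queued
+//     // rather than performed inline.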
+// true if this Deserz is really an instance of Deserializer.
+inline constexpr bool Deserz::is_base_deser() const
+{
+    return static_cast<Deserz const *>(full_deser) == this;
+}
+
+inline bool Deserz::is_aligned_const_format() const
+{
+    return full_deser->aligned_const_format_flag;
+}
+inline bool Deserz::is_shared_dynamic_tensor_shape_format() const
+{
+    return full_deser->shared_dynamic_tensor_shape;
+}
+inline Deserz::op_deserialize_fn_list_t &Deserz::get_op_deserialize_fn_list()
+{
+    return full_deser->op_deserialize_fn_list;
+}
+inline Deserz::tensor_deserialize_fn_list_t &Deserz::get_tensor_deserialize_fn_list()
+{
+    return full_deser->tensor_deserialize_fn_list;
+}
+inline std::vector &Deserz::get_blocktable_link_table()
+{
+    return full_deser->blocktable_link_table;
+}
+// For these in Deserz, we must call the corresponding methods on the
+// tensorconn in 'full_deser', but must pass 'this' as the first parameter.
+inline void Deserz::deserialize_tensor_def(Tensor const *const tensor_ptr)
+{
+    full_deser->tensorconn.tensor_def(*this, tensor_ptr);
+}
+inline void Deserz::deserialize_tensor_ref(Tensor const *&where)
+{
+    full_deser->tensorconn.tensor_ref(*this, where);
+}
+inline void Deserz::deserialize_tensor_refs(Tensor const **const ptrs, const unsigned n)
+{
+    full_deser->tensorconn.tensor_refs(*this, ptrs, n);
+}
+inline DeserSegDescs const &Deserz::get_segments() const
+{
+    return full_deser->segments;
+}
+
+// unaligned read of 64 bits (two 32-bit aligned reads)
+template <> inline uint64_t Deserz::simple_deserialize<uint64_t>()
+{
+    char const *const curr_p = bufp;
+    if (curr_p + 8u > buf_limit) {
+        return deser_u64_slowpath();
+    }
+    uint32_t const *const p = (uint32_t const *)(curr_p);
+    bufp = curr_p + 8u;
+    return p[0] + ((uint64_t)p[1] << 32);
+}
+inline uint64_t Deserz::deserialize_uint64()
+{
+    return simple_deserialize<uint64_t>();
+}
+
+template <> inline uint64_t Deserz::deserialize_type<uint64_t>()
+{
+    return deserialize_uint64();
+}
+template <> inline float Deserz::deserialize_type<float>()
+{
+    return deserialize_float();
+}
+// sometimes uint32_t is unsigned long, sometimes it's unsigned;
+// sometimes unsigned long is uint64. Hopefully this covers it all.
+#if ULONG_MAX == UINT_MAX
+template <> inline unsigned long Deserz::deserialize_type<unsigned long>()
+{
+    return deserialize_uint32();
+}
+template <> inline long Deserz::deserialize_type<long>()
+{
+    return deserialize_int32();
+}
+#endif
+template <> inline unsigned Deserz::deserialize_type<unsigned>()
+{
+    return deserialize_uint32();
+}
+template <> inline int Deserz::deserialize_type<int>()
+{
+    return deserialize_int32();
+}
+template <> inline int16_t Deserz::deserialize_type<int16_t>()
+{
+    return deserialize_int16();
+}
+template <> inline uint16_t Deserz::deserialize_type<uint16_t>()
+{
+    return deserialize_uint16();
+}
+template <> inline int8_t Deserz::deserialize_type<int8_t>()
+{
+    return deserialize_int8();
+}
+template <> inline uint8_t Deserz::deserialize_type<uint8_t>()
+{
+    return deserialize_uint8();
+}
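+// [editor's note, not in the original header] The uint64_t specialization
+// above pins down the wire format: a u64 is stored as two 32-bit words, low
+// word first, e.g. 0x0000000100000002 is stored as {0x00000002, 0x00000001}
+// and reassembled as p[0] + ((uint64_t)p[1] << 32). Reads stay 4-byte
+// aligned even though the value spans 8 bytes (hence "unaligned read").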
+// assert( dctx.deserialize_uint32() == SOME_CONST );
+// is not safe, since if you turn off asserts, it will no longer read the 4 bytes.
+// This macro allows that pattern to work:
+#define DESERIALIZE_ASSERT_UINT32(DCTX, VAL)                                                                           \
+    do {                                                                                                               \
+        uint32_t const tmp [[gnu::unused]] = (DCTX).deserialize_uint32();                                              \
+        assert(tmp == (VAL));                                                                                          \
+    } while (0)
+
+#include "weak_linkage.h"
+PUSH_VISIBILITY(default)
+
+/**
+ * @brief register the deserialization function for each \ref Op.
+ * TypicalOp- and VariadicOp-derived classes are instantiated via
+ * templates, hence the need to create a map of deserialize functions
+ * for each Op when they are generated at library initialization
+ *
+ * @param[in] tinf Op type_info that is used to key the map
+ * @param[in] fn Deserialize function
+ */
+API_EXPORT void deserialize_op_register(std::type_info const *tinf, const std::string_view type_tag,
+                                        const op_deserializer_fn &fn, bool is_external = false);
+/**
+ * @brief register the deserialization function for each \ref Tensor.
+ * Since \ref Tensor derived classes are instantiated via templates, there
+ * is a need to create a map of deserialize functions for each Tensor at runtime
+ *
+ * @param[in] type_tag Tensor type tag that is used to key the map
+ * @param[in] fn Deserialize function
+ */
+API_FUNC_EXPORT void deserialize_tensor_register(std::type_info const &tinf, const char *type_tag,
+                                                 tensor_deserializer_fn fn);
+
+POP_VISIBILITY()
+
+// this is fully defined in serialize_register.h
+template struct deserialize_tensor_using_constructor;
+
+// this is fully defined in serialize_register.h
+template struct alloc_func_for_op;
+template struct dealloc_func_for_op;
+
+//////////////////////
+// Forward decls of things defined in template_help.h
+//
+// contains_type< std::tuple<a, b, c, ...>, x >::value: true if x is in a,b,c ...
+// no 'remove ref' etc. is done.
+template struct contains_type;
+template struct not_contains_type;
+template