diff --git a/.gitignore b/.gitignore index d571806dbfd95..83da95a2c0953 100644 --- a/.gitignore +++ b/.gitignore @@ -145,4 +145,13 @@ poetry.toml # Local scripts /run-vim.sh /run-chat.sh + +HEXAGON_Tools/ +prebuilts/QNN_SDK/qairt/2.35.0.250530/ +prebuilts/QNN_SDK/qairt/2.36.0.250627/ +prebuilts/QNN_SDK/v2.35.0.250530.zip +prebuilts/QNN_SDK/v2.36.0.250627.zip +prebuilts/Hexagon_SDK/minimal-hexagon-sdk-6.2.0.1.xz +prebuilts/OpenCL_SDK/ +prebuilts/Vulkan_SDK/ HEXAGON_Tools/
diff --git a/ggml/include/ggml-hexagon.h b/ggml/include/ggml-hexagon.h index 0d41a955f6715..fe9d4d8e588ba 100644 --- a/ggml/include/ggml-hexagon.h +++ b/ggml/include/ggml-hexagon.h @@ -21,15 +21,30 @@ enum HEXAGONBackend { HEXAGON_BACKEND_GGML = 4, //"fake" HEXAGON backend for comparing performance between the HEXAGON backend and the ggml backend }; -GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(size_t dev_num, const char * qnn_lib_path); +//0: general approach through QNN: offload ggmlop to QNN (QNNCPU, QNNGPU, QNNNPU) +//1: special approach through QNN-SINGLEGRAPH: mapping entire ggml cgraph to a single QNN graph +//2: general approach through Hexagon cDSP: offload ggmlop to Hexagon cDSP directly +enum hwaccel_approach_type { + HWACCEL_QNN = 0, + HWACCEL_QNN_SINGLEGRAPH = 1, + HWACCEL_CDSP = 2, +}; + +GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(size_t dev_num, const char * qnn_lib_path); -GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend); -GGML_BACKEND_API int ggml_backend_hexagon_get_device_count(void); +GGML_BACKEND_API int ggml_backend_hexagon_get_device_count(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void); -const char * ggml_backend_hexagon_get_devname(size_t dev_num); +GGML_BACKEND_API const char * ggml_backend_hexagon_get_devname(size_t dev_num); + +GGML_BACKEND_API void ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach); + +GGML_BACKEND_API int ggml_backend_hexagon_get_mulmat_algotype(void); + +GGML_BACKEND_API void ggml_backend_hexagon_set_mulmat_algotype(int new_mulmat_algotype); #ifdef __cplusplus }
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index ffd83931add5c..c25485536b5a5 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -196,7 +196,7 @@ struct ggml_backend_registry { #ifdef GGML_USE_HEXAGON register_backend(ggml_backend_hexagon_reg()); #endif - + #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); #endif
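[Reviewer note] The new public API above (enum hwaccel_approach_type plus the ggml_backend_hexagon_set_cfg / get_mulmat_algotype / set_mulmat_algotype entry points) is easiest to understand from the caller side. A minimal sketch, assuming HEXAGON_BACKEND_CDSP is one of the HEXAGONBackend enum values (it is referenced throughout ggml-hexagon.cpp but not visible in this hunk); the wrapper function itself is hypothetical:

```cpp
#include "ggml-hexagon.h"

// hypothetical helper: persist the desired device/approach pair into
// ggml-hexagon.cfg, then create the backend for that device
static ggml_backend_t init_hexagon_cdsp(const char * qnn_lib_path) {
    ggml_backend_hexagon_set_cfg(HEXAGON_BACKEND_CDSP, HWACCEL_CDSP); // rewrites ggml-hexagon.cfg
    int algo = ggml_backend_hexagon_get_mulmat_algotype();            // read back current mulmat algo
    (void) algo;
    return ggml_backend_hexagon_init(HEXAGON_BACKEND_CDSP, qnn_lib_path);
}
```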
diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index 0dcd7e5e2a168..3515106cc23da 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -39,13 +39,19 @@ endif() #check whether user's specified htp arch is valid set(CHECK_HTP_ARCH "WRONG") -foreach (feat v68 v69 v73 v75 v79) +#ref: https://github.com/quic/ai-hub-apps/tree/main/tutorials/llm_on_genie +#foreach (feat v68 v69 v73 v75 v79) +#foreach (feat v73 v75 v79) +#to simplify the workflow, only v75 and v79 are supported, i.e. only 8 Gen 3 and 8 Elite +foreach (feat v75 v79) if (${feat} STREQUAL ${HTP_ARCH_VERSION}) set(CHECK_HTP_ARCH "GOOD") endif() endforeach() if (${CHECK_HTP_ARCH} STREQUAL "WRONG") - message(FATAL_ERROR "ggml-hexagon backend only support htp arch v68,v69,v73,v75,v79") + #message(FATAL_ERROR "ggml-hexagon backend only support htp arch v68,v69,v73,v75,v79") + #to simplify the workflow, only v75 and v79 are supported, i.e. only 8 Gen 3 and 8 Elite + message(FATAL_ERROR "ggml-hexagon backend only supports htp arch v75,v79") endif() #check optimization flags @@ -92,10 +98,10 @@ else() message(FATAL_ERROR "ggml-hexagon now only available on Android and Windows(Windows on ARM)") endif() -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${DEBUG_FLAG} ${OPT_FLAG}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DEBUG_FLAG} ${OPT_FLAG}") -set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} ${DEBUG_FLAG} ${OPT_FLAG}") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/stub.c") ggml_add_backend_library(ggml-hexagon ${HEXAGON_SOURCES}) @@ -118,7 +124,7 @@ function(ggml_hexagon_build_kernel KNAME) COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ clean COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ HEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} HTP_ARCH_VERSION=${HTP_ARCH_VERSION} DEBUG_FLAG=${DEBUG_FLAG} COMMAND echo "current working path:`pwd`\n" - COMMAND ls -l ../../../bin/libggmlop-skel.so + COMMAND ls -l ../../../bin/libggmldsp-skel.so COMMENT "build hexagon-kernel" ) endfunction()
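[Reviewer note] The v75/v79 allow-list above mirrors the runtime check in ggml-hexagon.cpp, where the cDSP reports its arch as a hex value (0x75, 0x79) that is then converted to decimal. A small self-contained sketch of that mapping (helper names are illustrative; the real code uses ggmlhexagon_htparch_hex_to_decimal):

```cpp
#include <cstddef>

// 0x75 -> 75, 0x79 -> 79: the arch byte is BCD-like, so convert per nibble
static size_t htparch_hex_to_decimal(int dsp_version) {
    return static_cast<size_t>((dsp_version >> 4) * 10 + (dsp_version & 0xF));
}

// matches the CMake allow-list: v75 (Snapdragon 8 Gen 3) and v79 (8 Elite)
static bool is_supported_htp_arch(size_t arch) {
    return arch == 75 || arch == 79;
}
```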
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 251dcc586c51e..74f3a2461a10b 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1,4 +1,5 @@ /* + * Copyright (c) zhouwg(https://github.com/zhouwg) * Copyright (c) 2024-2025 The ggml authors * * Qualcomm QNN SDK and reference tech guides could be found at: @@ -59,6 +60,7 @@ #include #include #include +#include #if defined(__ANDROID__) || defined(__linux__) #include @@ -154,10 +156,9 @@ struct ggml_backend_hexagon_context; #if !defined (_WINDOWS) #pragma weak remote_system_request +#pragma weak remote_session_control #endif -#define MAX_DOMAIN_NAMELEN 12 - #define CHECK_QNN_API(error, result) \ do { \ error = (result); \ @@ -179,6 +180,9 @@ struct ggml_backend_hexagon_context; } \ } while (0) \ +#ifndef ggmlop_URI +#define ggmlop_URI "file:///libggmldsp-skel.so?ggmldsp_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" +#endif // ================================================================================================= // section-1: data type, data structure, global vars // ================================================================================================= using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); using pfn_rpc_mem_free = void (*)(void *); using pfn_rpc_mem_to_fd = int (*)(void *); +using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); using pfn_rpc_remote_handle_control = int (*)(uint32_t, void*, uint32_t); using pfn_rpc_remote_register_buf = int (*)(void*, int, int); using pfn_rpc_remote_session_control = int (*)(uint32_t, void *, uint32_t); @@ -195,9 +202,6 @@ using pfn_rpc_remote_handle64_close = int (*)(remote_handle64); using pfn_rpc_remote_handle64_invoke = int (*)(remote_handle64, uint32_t, remote_arg *); using pfn_rpc_remote_handle64_control = int (*)(remote_handle64, uint32_t, void*, uint32_t); -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); //QNN resource management for the general approach through QNN using qnn_tensors_t = std::vector< Qnn_Tensor_t >; @@ -219,15 +223,6 @@ enum qnn_profile_level { PROFILE_DETAIL = 2, }; -//0: general approach through QNN:offload ggmlop to QNN -//1: special approach through QNN-SINGLEGRAPH:mapping entire ggml cgraph to a single QNN graph -//2: general approach through Hexagon cDSP:offload ggmlop to Hexagon cDSP directly -enum hwaccel_approach_type { - HWACCEL_QNN = 0, - HWACCEL_QNN_SINGLEGRAPH = 1, - HWACCEL_CDSP = 2, -}; - enum hexagon_dsp_type { HEXAGON_ADSP = 0, HEXAGON_MDSP = 1, @@ -253,7 +248,7 @@ enum qcom_chipset_soc_model { SM8475 = 42, // v69, SD 8+ Gen 1 SM8550 = 43, // v73, SD 8 Gen 2 SM8650 = 57, // v75, SD 8 Gen 3 - SM8750 = 69, // v79, SD 8 Elite(aka 8 Gen 4) + SM8750 = 69, // v79, SD 8 Elite #if !defined(__ANDROID__) && !defined(__linux__) SC7280X = 44, SC8280X = 37, @@ -355,6 +350,7 @@ struct hexagon_appcfg_t { int profiler_duration; // threshold of duration in profiler, per seconds int profiler_counts; // threshold of counts in profiler int thread_counts; // thread_counts on cDSP side + int mulmat_algotype; // algorithm type of mulmat on cDSP side const char * cfgfilename; const char * runtime_libpath; char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN]; @@ -377,12 +373,23 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .hexagon_backend = HEXAGON_BACKEND_CDSP, .enable_rpc_ion_mempool = 0, .enable_all_q_mulmat = 0, - .profiler_duration = 5, + .profiler_duration = 5, //seconds .profiler_counts = 100, .thread_counts = 4, + .mulmat_algotype = 0, .cfgfilename = "ggml-hexagon.cfg", +#if defined(__ANDROID__) + #if defined(STANDARD_ANDROID_APP) + .runtime_libpath = "/data/data/com.kantvai.kantvplayer/", + #else .runtime_libpath = "/data/data/com.layla/files/app-data/qnn-inference/", - .ggml_hexagon_version = {"1.08"}, + #endif +#elif defined(__linux__) + .runtime_libpath = "/tmp/", +#elif defined(_WIN32) + .runtime_libpath = "C:\\", +#endif + .ggml_hexagon_version = {"1.13"}, .ggml_dsp_version = {"0.63"}, }; @@ -435,7 +442,7 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { .soc_model = SM8750, .htp_arch = V79, .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Elite(aka 8 Gen 4)"}, + .soc_desc = "Qualcomm SnapDragon 8 Elite"}, #if !defined(__ANDROID__) && !defined(__linux__) /* Qualcomm SnapDragon 7c Gen 2 */ @@ -627,6 +634,7 @@ static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {false, GGML_OP_TRANSPOSE, 0, nullptr}, {false, GGML_OP_GET_ROWS, 0, nullptr}, {false, GGML_OP_GET_ROWS_BACK, 0, nullptr}, + {false, GGML_OP_SET_ROWS, 0, nullptr}, {false, GGML_OP_DIAG, 0, nullptr}, {false, GGML_OP_DIAG_MASK_INF, 0, nullptr}, {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr}, @@ -638,6 +646,7 @@ static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr}, {false, GGML_OP_IM2COL, 0, nullptr}, {false, GGML_OP_IM2COL_BACK, 0, nullptr}, + {false, GGML_OP_CONV_2D, 0, nullptr}, {false, GGML_OP_CONV_2D_DW, 0, nullptr}, {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr}, {false, GGML_OP_POOL_1D, 0, nullptr},
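[Reviewer note] The rows added in these hunks (GGML_OP_SET_ROWS, GGML_OP_CONV_2D, GGML_OP_ROLL) keep the capability table aligned with the GGML_OP enum: each op's entry sits at the index of its enum value, which the static_asserts after the tables enforce. A distilled sketch of that lookup invariant (types and helper are illustrative, not the patch's exact declarations):

```cpp
#include <cstddef>

struct op_caps_row { bool supported; int op; };

// the table is keyed by the GGML op enum itself: O(1) direct indexing,
// valid only while the row count tracks GGML_OP_COUNT exactly
static bool op_supported(const op_caps_row * table, size_t count, int op) {
    return static_cast<size_t>(op) < count && table[op].supported;
}
```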
@@ -646,6 +655,7 @@ static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {false, GGML_OP_UPSCALE, 0, nullptr}, {false, GGML_OP_PAD, 0, nullptr}, {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr}, + {false, GGML_OP_ROLL, 0, nullptr}, {false, GGML_OP_ARANGE, 0, nullptr}, {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr}, {false, GGML_OP_ARGSORT, 0, nullptr}, @@ -669,28 +679,14 @@ static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr}, {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr}, {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_ABS), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SGN), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_NEG), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_STEP), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_TANH), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_ELU), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_RELU), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SIGMOID), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU_ERF), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU_QUICK), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SILU), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_HARDSWISH), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_EXP), 0, nullptr} + {false, GGML_OP_GLU, 0, nullptr}, }; static_assert(ggmlqnn_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); static_assert(ggmlqnn_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL].supported, "GGML_OP_MUL is not true"); static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); -static_assert(std::size(ggmlqnn_k_op_caps) == (static_cast<size_t>(GGML_OP_COUNT) + static_cast<size_t>(GGML_UNARY_OP_COUNT)), +static_assert(std::size(ggmlqnn_k_op_caps) == (static_cast<size_t>(GGML_OP_COUNT)), "pls check ggmlqnn_k_op_caps and ensure it corresponds to latest ggml.h"); //supported ggml op by HWACCEL_CDSP @@ -700,12 +696,12 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {true, GGML_OP_ADD, 2, "ggmlop_dsp_add", ggmlop_dsp_add}, {false, GGML_OP_ADD1, 0, nullptr, nullptr}, {false, GGML_OP_ACC, 0, nullptr, nullptr}, - {false, GGML_OP_SUB, 2, nullptr, nullptr}, - {false, GGML_OP_MUL, 2, nullptr, nullptr}, - {false, GGML_OP_DIV, 2, nullptr, nullptr}, + {false, GGML_OP_SUB, 2, nullptr, nullptr}, + {false, GGML_OP_MUL, 2, nullptr, nullptr}, + {false, GGML_OP_DIV, 2, nullptr, nullptr}, {false, GGML_OP_SQR, 0, nullptr, nullptr}, - {false, GGML_OP_SQRT, 0, nullptr, nullptr}, - {false, GGML_OP_LOG, 0, nullptr, nullptr}, + {false, GGML_OP_SQRT, 0, nullptr, nullptr}, + {false, GGML_OP_LOG, 0, nullptr, nullptr}, {false, GGML_OP_SIN, 0, nullptr, nullptr}, {false, GGML_OP_COS, 0, nullptr, nullptr}, {false, GGML_OP_SUM, 0, nullptr, nullptr}, @@ -718,7 +714,7 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_CONCAT, 0, nullptr, nullptr}, {false, GGML_OP_SILU_BACK, 0, nullptr, nullptr}, {false, GGML_OP_NORM, 0, nullptr, nullptr}, - {true, GGML_OP_RMS_NORM, 1, "ggmlop_dsp_rmsnorm", ggmlop_dsp_rmsnorm}, + {true, GGML_OP_RMS_NORM, 1, "ggmlop_dsp_rmsnorm", ggmlop_dsp_rmsnorm}, {false, GGML_OP_RMS_NORM_BACK, 0, nullptr, nullptr}, {false, GGML_OP_GROUP_NORM, 0, nullptr, nullptr}, {false, GGML_OP_L2_NORM, 0, nullptr, nullptr}, @@ -735,10 +731,11 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_TRANSPOSE, 0, nullptr, nullptr}, {false, GGML_OP_GET_ROWS, 0, nullptr, nullptr}, {false, GGML_OP_GET_ROWS_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_SET_ROWS, 0, nullptr, nullptr}, {false, GGML_OP_DIAG, 0, nullptr, nullptr}, {false, GGML_OP_DIAG_MASK_INF, 0, nullptr, nullptr}, {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr, nullptr}, - {true, GGML_OP_SOFT_MAX, 1, "ggmlop_dsp_softmax", ggmlop_dsp_softmax}, + {true, GGML_OP_SOFT_MAX, 1, "ggmlop_dsp_softmax", ggmlop_dsp_softmax}, {false, GGML_OP_SOFT_MAX_BACK, 0, nullptr, nullptr}, {false, GGML_OP_ROPE, 0, nullptr, nullptr}, {false, GGML_OP_ROPE_BACK, 0, nullptr, nullptr}, @@ -746,14 +743,16 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr, nullptr}, {false, GGML_OP_IM2COL, 0, nullptr, nullptr}, {false, GGML_OP_IM2COL_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_2D, 0, nullptr, nullptr}, {false, GGML_OP_CONV_2D_DW, 0, nullptr, nullptr}, {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr, nullptr}, {false, GGML_OP_POOL_1D, 0, nullptr, nullptr}, - {true, GGML_OP_POOL_2D, 1, "ggmlop_dsp_pool2d", ggmlop_dsp_pool2d}, + {true, GGML_OP_POOL_2D, 1, "ggmlop_dsp_pool2d", ggmlop_dsp_pool2d}, {false, GGML_OP_POOL_2D_BACK, 0, nullptr, nullptr}, {false, GGML_OP_UPSCALE, 0, nullptr, nullptr}, {false, GGML_OP_PAD, 0, nullptr, nullptr}, {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr, nullptr}, + {false, GGML_OP_ROLL, 0, nullptr, nullptr}, {false, GGML_OP_ARANGE, 0, nullptr, nullptr}, {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr, nullptr}, {false, GGML_OP_ARGSORT, 0, nullptr, nullptr}, @@ -777,28 +776,14 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr, nullptr}, {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr, nullptr}, {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_ABS), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SGN), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_NEG), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_STEP), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_TANH), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_ELU), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_RELU), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SIGMOID), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU_ERF), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_GELU_QUICK), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_SILU), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_HARDSWISH), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr, nullptr}, - {false, static_cast<size_t>(GGML_UNARY_OP_EXP), 0, nullptr, nullptr} + {false, GGML_OP_GLU, 0, nullptr, nullptr}, }; static_assert(ggmlhexagon_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); static_assert(ggmlhexagon_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); static_assert(ggmlhexagon_k_op_caps[GGML_OP_SOFT_MAX].supported, "GGML_OP_SOFT_MAX is not true"); -static_assert(std::size(ggmlhexagon_k_op_caps) == (static_cast<size_t>(GGML_OP_COUNT) + static_cast<size_t>(GGML_UNARY_OP_COUNT)), +static_assert(std::size(ggmlhexagon_k_op_caps) == (static_cast<size_t>(GGML_OP_COUNT)), "pls check ggmlhexagon_k_op_caps and ensure it corresponds to
latest ggml.h"); static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor name is unique @@ -887,6 +872,41 @@ static void ggmlhexagon_log_internal(ggml_log_level level, const char * file, co } } +static void ggmlhexagon_get_processname(char * p_name) { + if (nullptr == p_name) + return; + + char tmpbuf[GGMLHEXAGON_TMPBUF_LEN]; + memset(tmpbuf, 0, GGMLHEXAGON_TMPBUF_LEN); +#if defined(__ANDROID__) || defined(__linux__) + int result = readlink("/proc/self/exe", tmpbuf, GGMLHEXAGON_TMPBUF_LEN - 1); + if (result < 0) { + GGMLHEXAGON_LOG_WARN("failed to get process name, reason:%s", strerror(errno)); + return; + } + GGMLHEXAGON_LOG_DEBUG("process name %s", tmpbuf); + const char * realname = strrchr(tmpbuf, '/') + 1; + GGMLHEXAGON_LOG_DEBUG("process name %s", realname); + snprintf(p_name, GGMLHEXAGON_TMPBUF_LEN, "%s", realname); +#endif +} + +static bool ggmlhexagon_is_llamabench_running() { + char processname[GGMLHEXAGON_TMPBUF_LEN]; + memset(processname, 0, GGMLHEXAGON_TMPBUF_LEN); + + ggmlhexagon_get_processname(processname); + if (0 != processname[0] && 0 != processname[1] && 0 != processname[10]) { + if (0 == memcmp(processname, "llama-bench", strlen("llama-bench"))) { + return true; + } + if (0 == memcmp(processname, "test-thread-safety", strlen("test-thread-safety"))) { + return true; + } + } + return false; +} + static void ggmlhexagon_print_tensors_info(const char * func_name, const ggml_backend_hexagon_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) { //skip sanity check of params because of performance concern @@ -1124,7 +1144,7 @@ class hexagon_profiler { ); } - //print/compare NPU's I/O performance between 8Gen3 and 8Elite(aka 8Gen4) , removed in the future + //print/compare NPU's I/O performance between 8Gen3 and 8Elite , removed in the future char bps_string[GGMLHEXAGON_TMPBUF_LEN]; memset(bps_string, 0, GGMLHEXAGON_TMPBUF_LEN); profiler_get_bpsstring(_total_inputsize + _total_outputsize, elapse_time, bps_string); @@ -1292,6 +1312,7 @@ class hexagon_perf { _begin_time = ggml_time_us(); } + //use explicit function calls rather than scoped feature void info() { if (0 == g_hexagon_appcfg.enable_perf) { return; @@ -1304,8 +1325,13 @@ class hexagon_perf { // it's not mandatory // had to expose two public function in hexagon_profiler class if (g_hexagon_profiler.profiler_get_frame_index() <= g_hexagon_profiler.profiler_get_threshold_count()) { + const char * devname = ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend); + //the logic here is make sense because already checked in ggml_backend_hexagon_device_init_backend + if (g_hexagon_appcfg.hexagon_backend != HEXAGON_BACKEND_GGML) { + devname += 16; + } GGMLHEXAGON_LOG_VERBOSE("inference duration of %s through %s: %lld microseconds", - _perf_name.c_str(), ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach), _duration); + _perf_name.c_str(), devname, _duration); } //update profiler data @@ -1328,7 +1354,7 @@ class hexagon_perf { int _output_size = 0; }; -//a simple class to load configurations from ggml-hexagon.cfg +//a simple class to load/set running configurations in ggml-hexagon.cfg class hexagon_appcfg { public: hexagon_appcfg() {} @@ -1394,6 +1420,103 @@ class hexagon_appcfg { value = atol(_hexagon_appcfg[section][key].c_str()); } + bool modify_hexagon_config(std::string & cfg_filename, int new_hexagon_backend, int new_hwaccel_approach, int new_mulmat_algotype) { + std::ifstream inputfile(cfg_filename); + if (!inputfile.is_open()) { + 
GGMLHEXAGON_LOG_WARN("can't open file %s", cfg_filename.c_str()); + return false; + } + + std::string filedata = ""; + + std::string line; + std::string backupline; + bool is_rewrite = false; + bool is_founded = false; + bool is_key = true; + std::string key; + std::string value; + std::string newvalue; + while (std::getline(inputfile, line)) { + is_founded = false; + backupline = line; + trim(line); + if (0 == line.rfind("#", 0)) { + filedata += backupline; + filedata += "\n"; + continue; + } + + newvalue = ""; + if (line.rfind("hexagon_backend", 0) != std::string::npos) { + if (new_hexagon_backend >= 0) { + is_founded = true; + is_rewrite = true; + newvalue = std::to_string(new_hexagon_backend); + } + } + + if (line.rfind("hwaccel_approach", 0) != std::string::npos) { + //compatiable with previous logic + if (new_hwaccel_approach >= 0) { + is_founded = true; + is_rewrite = true; + newvalue = std::to_string(new_hwaccel_approach); + } + } + + if (line.rfind("mulmat_algotype", 0) != std::string::npos) { + //compatiable with previous logic + if (new_mulmat_algotype >= 0) { + is_founded = true; + is_rewrite = true; + newvalue = std::to_string(new_mulmat_algotype); + } + } + + + if (is_founded) { + is_key = true; + key = ""; + value = ""; + + for (size_t i = 0; i < line.size(); ++i) { + if (line[i] == '=') { + is_key = false; + continue; + } + if (is_key) { + key += line[i]; + } else { + value += line[i]; + } + } + trim(key); + trim(value); + GGMLHEXAGON_LOG_VERBOSE("key %s value %s\n", key.c_str(), value.c_str()); + GGMLHEXAGON_LOG_VERBOSE("key %s new value %s\n", key.c_str(), newvalue.c_str()); + backupline = key + " = " + newvalue; + } + filedata += backupline; + filedata += "\n"; + } + inputfile.close(); + + if (is_rewrite) { + std::ofstream outputfile; + outputfile.open(cfg_filename); + outputfile.flush(); + outputfile << filedata; + outputfile.close(); + } + return true; + } + + //compatiable with previous codes + bool modify_hexagon_config(std::string & cfg_filename, int new_hexagon_backend, int new_hwaccel_approach) { + return modify_hexagon_config(cfg_filename, new_hexagon_backend, new_hwaccel_approach, -1); + } + private: void ltrim(std::string & str) { if (str.empty()) return; @@ -1734,10 +1857,6 @@ static void ggmlhexagon_append_tensor_dimensions(const ggml_tensor * tensor, std } static size_t ggmlhexagon_get_op_index(const ggml_tensor * tensor) { - if (tensor->op == GGML_OP_UNARY) { - return static_cast(GGML_OP_COUNT) + static_cast(ggml_get_unary_op(tensor)); - } - return tensor->op; } @@ -1879,13 +1998,12 @@ static void ggmlhexagon_load_cfg() { ggmlhexagon_get_timestring(time_string); GGMLHEXAGON_LOG_DEBUG("program running start time:%s", time_string); std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); - GGMLHEXAGON_LOG_INFO("load hexagon appcfg from %s", cfg_filename.c_str()); hexagon_appcfg hexagoncfg_instance; hexagoncfg_instance.load(cfg_filename); hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { std::ostringstream tmposs; tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]"; - GGMLHEXAGON_LOG_INFO("%s", tmposs.str().c_str()); + GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str()); }); std::string precision_mode; std::string version; //version of ggml-hexagon.cpp @@ -1912,19 +2030,22 @@ static void ggmlhexagon_load_cfg() { hexagoncfg_instance.get_intvalue("cdsp", 
"enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 0); hexagoncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0); hexagoncfg_instance.get_intvalue("cdsp", "thread_counts", g_hexagon_appcfg.thread_counts, 4); + hexagoncfg_instance.get_intvalue("cdsp", "mulmat_algotype", g_hexagon_appcfg.mulmat_algotype, 0); - GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version); - GGMLHEXAGON_LOG_INFO("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version); - GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", version.c_str()); - GGMLHEXAGON_LOG_INFO("external ggml_dsp_version=%s", ggmldsp_version.c_str()); memcpy(g_hexagon_appcfg.ggml_dsp_version, ggmldsp_version.c_str(), strlen(ggmldsp_version.c_str())); - GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, + + GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str()); + GGMLHEXAGON_LOG_VERBOSE("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version); + GGMLHEXAGON_LOG_VERBOSE("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version); + GGMLHEXAGON_LOG_VERBOSE("external ggml_hexagon_version=%s", version.c_str()); + GGMLHEXAGON_LOG_VERBOSE("external ggml_dsp_version=%s", ggmldsp_version.c_str()); + GGMLHEXAGON_LOG_VERBOSE("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); - GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, + GGMLHEXAGON_LOG_VERBOSE("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); - GGMLHEXAGON_LOG_INFO("runtime libpath=%s", g_hexagon_appcfg.runtime_libpath); - GGMLHEXAGON_LOG_INFO("enable_perf=%d", g_hexagon_appcfg.enable_perf); - GGMLHEXAGON_LOG_INFO("enable_profiler=%d", g_hexagon_appcfg.enable_profiler); + GGMLHEXAGON_LOG_VERBOSE("runtime libpath=%s", g_hexagon_appcfg.runtime_libpath); + GGMLHEXAGON_LOG_VERBOSE("enable_perf=%d", g_hexagon_appcfg.enable_perf); + GGMLHEXAGON_LOG_VERBOSE("enable_profiler=%d", g_hexagon_appcfg.enable_profiler); if (precision_mode.find("fp16") != std::string::npos) { g_hexagon_appcfg.precision_mode = 1; @@ -1942,6 +2063,58 @@ static void ggmlhexagon_load_cfg() { initialized = true; } +void ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach) { + if (new_hexagon_backend < 0 || new_hexagon_backend > HEXAGON_BACKEND_GGML) { + GGMLHEXAGON_LOG_WARN("invalid new_hexagon_backend"); + return; + } + if (new_hwaccel_approach < 0 || new_hwaccel_approach > HWACCEL_CDSP) { + GGMLHEXAGON_LOG_WARN("invalid new_hwaccel_approach"); + return; + } + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str()); + hexagon_appcfg hexagoncfg_instance; + GGMLHEXAGON_LOG_VERBOSE("set_hexagon_cfg with new_hexagon_backend %d, new_hwaccel_approach %d", new_hexagon_backend, new_hwaccel_approach); + hexagoncfg_instance.modify_hexagon_config(cfg_filename, new_hexagon_backend, new_hwaccel_approach); + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] 
= [" << value << "]"; + GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str()); + }); +} + +int ggml_backend_hexagon_get_mulmat_algotype() { + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + hexagon_appcfg hexagoncfg_instance; + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.get_intvalue("cdsp", "mulmat_algotype", g_hexagon_appcfg.mulmat_algotype, 0); + return g_hexagon_appcfg.mulmat_algotype; +} + +/** + * troubleshooting peformance of mulmat on cDSP during development stage + */ +void ggml_backend_hexagon_set_mulmat_algotype(int new_mulmat_algotype) { + //the logic here is different with logic in the ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach) + if (new_mulmat_algotype < 0) { + GGMLHEXAGON_LOG_WARN("invalid new_mulmat_algotype"); + return; + } + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str()); + hexagon_appcfg hexagoncfg_instance; + GGMLHEXAGON_LOG_VERBOSE("set_hexagon_cfg with new_mulmat_algotype %d", new_mulmat_algotype); + hexagoncfg_instance.modify_hexagon_config(cfg_filename, -1, -1, new_mulmat_algotype); + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]"; + GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str()); + }); +} + static bool ggmlhexagon_check_valid_appcfg() { bool is_valid_appcfg = true; @@ -1949,38 +2122,38 @@ static bool ggmlhexagon_check_valid_appcfg() { ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); GGMLHEXAGON_LOG_DEBUG("user's specified hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend); if (g_hexagon_appcfg.hexagon_backend >= GGML_HEXAGON_MAX_DEVICES) { - GGMLHEXAGON_LOG_INFO("using default ggml backend"); + GGMLHEXAGON_LOG_VERBOSE("using default ggml backend"); is_valid_appcfg = false; } if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { - GGMLHEXAGON_LOG_INFO("HWACCEL_QNN_SINGLEGRAPH not supported"); + GGMLHEXAGON_LOG_VERBOSE("HWACCEL_QNN_SINGLEGRAPH not supported"); is_valid_appcfg = false; } if (HWACCEL_QNN == g_hexagon_appcfg.hwaccel_approach) { if (HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend) { - GGMLHEXAGON_LOG_INFO("hexagon_backend HEXAGON_BACKEND_CDSP must match with hwaccel_approach HWACCEL_CDSP"); + GGMLHEXAGON_LOG_VERBOSE("hexagon_backend HEXAGON_BACKEND_CDSP must match with hwaccel_approach HWACCEL_CDSP"); is_valid_appcfg = false; } } if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { if ((HEXAGON_BACKEND_CDSP != g_hexagon_appcfg.hexagon_backend) && (HEXAGON_BACKEND_GGML != g_hexagon_appcfg.hexagon_backend)) { - GGMLHEXAGON_LOG_INFO("hwaccel_approach HWACCEL_CDSP must match with hexagon_backend HEXAGON_BACKEND_CDSP"); + GGMLHEXAGON_LOG_VERBOSE("hwaccel_approach HWACCEL_CDSP must match with hexagon_backend HEXAGON_BACKEND_CDSP"); is_valid_appcfg = false; } if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { if (0 == g_hexagon_appcfg.enable_q_mulmat) { - GGMLHEXAGON_LOG_INFO("ensure set enable_q_mulmat to 1 firstly when set enable_all_q_mulmat to 1"); - is_valid_appcfg = false; + GGMLHEXAGON_LOG_DEBUG("ensure set enable_q_mulmat to 1 firstly when set 
enable_all_q_mulmat to 1 if you are not currently comparing the performance of GGML_OP_ADD between QNNCPU, QNNGPU, QNNNPU, cDSP, ggml"); + //is_valid_appcfg = false; } } } if (!is_valid_appcfg) { - GGMLHEXAGON_LOG_INFO("it seems there is wrong configuration in ggml-hexagon.cfg, will using the default ggml backend accordingly"); + GGMLHEXAGON_LOG_VERBOSE("it seems there is non-default configuration in ggml-hexagon.cfg, will using the default ggml backend accordingly"); } return is_valid_appcfg; } @@ -1990,6 +2163,11 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c char timestamp[GGMLHEXAGON_TMPBUF_LEN]; memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN); + if (ggmlhexagon_is_llamabench_running()) { + //make llama-bench happy + return; + } + GGMLHEXAGON_LOG_INFO("ggml_hexagon_version: %s", g_hexagon_appcfg.ggml_hexagon_version); GGMLHEXAGON_LOG_INFO("ggml_dsp_version: %s", g_hexagon_appcfg.ggml_dsp_version); GGMLHEXAGON_LOG_INFO("hwaccel approach: %d(%s)", g_hexagon_appcfg.hwaccel_approach, @@ -2001,10 +2179,11 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO"); - GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_CDSP: %d", g_hexagon_appcfg.thread_counts); + GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_CDSP: %d", g_hexagon_appcfg.thread_counts); + GGMLHEXAGON_LOG_INFO("mulmat algo type on cDSP: %d", g_hexagon_appcfg.mulmat_algotype); ggmlhexagon_probe_dspinfo(ctx); } else { - GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_QNN: %d", g_hexagon_appcfg.hvx_threads); + GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_QNN: %d", g_hexagon_appcfg.hvx_threads); GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? 
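[Reviewer note] The backend/approach pairing rules enforced by ggmlhexagon_check_valid_appcfg() above distill to a few lines; note that after this patch an enable_all_q_mulmat/enable_q_mulmat mismatch only logs a debug message instead of invalidating the config. A condensed sketch, assuming the enum values from ggml-hexagon.h (HEXAGON_BACKEND_CDSP is not shown in this excerpt; the helper is illustrative):

```cpp
static bool cfg_pair_is_valid(int backend, int approach) {
    if (approach == HWACCEL_QNN_SINGLEGRAPH) return false;           // not supported
    if (approach == HWACCEL_QNN && backend == HEXAGON_BACKEND_CDSP) return false;
    if (approach == HWACCEL_CDSP && backend != HEXAGON_BACKEND_CDSP
                                 && backend != HEXAGON_BACKEND_GGML) return false;
    return true;
}
```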
"YES" : "NO"); } GGMLHEXAGON_LOG_INFO("running timestamp:%s", timestamp); @@ -2756,6 +2935,14 @@ class qnn_instance { std::unordered_map _qnn_rpc_buffer_to_handles; std::atomic_bool _rpcmem_initialized{false}; + + // this is moved to static declarations in this file + // pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + // pfn_rpc_mem_free _pfn_rpc_mem_free; + // pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + // pfn_rpc_mem_init _pfn_rpc_mem_init; + // pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; std::unordered_map _rpcmem_usage_map; size_t _rpcmem_usage = 0; // mempool usage in bytes @@ -2763,6 +2950,10 @@ class qnn_instance { std::string _graph_name; HEXAGONBackend _device_id; + + // this is moved to static declarations in this file + //void * _rpc_lib_handle = nullptr; + bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature qnn_instance(const qnn_instance &) = delete; @@ -3002,7 +3193,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * return 1; } - auto get_providers = ggmlqnn_load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( + auto get_providers = ggmlqnn_load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { @@ -3042,7 +3233,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * GGMLHEXAGON_LOG_WARN("unable to find a valid qnn interface\n"); return 6; } else { - GGMLHEXAGON_LOG_INFO("find a valid qnn interface\n"); + GGMLHEXAGON_LOG_VERBOSE("find a valid qnn interface\n"); } set_qnn_raw_interface(qnn_interface); @@ -3052,7 +3243,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * _backend_id = backend_id; auto saver_initialize = - ggmlqnn_load_qnn_functionpointers<_pfn_QnnSaver_initialize *>(_loaded_lib_handle, "QnnSaver_initialize"); + ggmlqnn_load_qnn_functionpointers(_loaded_lib_handle, "QnnSaver_initialize"); if (nullptr != saver_initialize) { error = saver_initialize(saver_config); if (error != QNN_SUCCESS) { @@ -3103,7 +3294,7 @@ int qnn_instance::load_system() { } } - auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + auto * get_providers = reinterpret_cast(dlsym( _system_lib_handle, "QnnSystemInterface_getProviders")); if (nullptr == get_providers) { GGMLHEXAGON_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); @@ -3144,7 +3335,7 @@ int qnn_instance::load_system() { GGMLHEXAGON_LOG_WARN("unable to find a valid qnn system interface\n"); return 6; } else { - GGMLHEXAGON_LOG_INFO("find a valid qnn system interface\n"); + GGMLHEXAGON_LOG_VERBOSE("find a valid qnn system interface\n"); } set_qnn_raw_system_interface(qnn_system_interface); @@ -3154,7 +3345,7 @@ int qnn_instance::load_system() { if (nullptr == _qnn_system_handle) { GGMLHEXAGON_LOG_WARN("can not create QNN system contenxt\n"); } else { - GGMLHEXAGON_LOG_INFO("initialize qnn system successfully\n"); + GGMLHEXAGON_LOG_VERBOSE("initialize qnn system successfully\n"); } return 0; @@ -3306,16 +3497,16 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { qcom_socinfo soc_info = {}; qnnstatus = _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); if (QNN_SUCCESS == qnnstatus) { - GGMLHEXAGON_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); + GGMLHEXAGON_LOG_VERBOSE("device counts %d\n", p_info->v1.numHwDevices); QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; 
for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { - GGMLHEXAGON_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, + GGMLHEXAGON_LOG_VERBOSE("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; chipinfo = devinfo->onChipDevice; size_t htp_arch = (size_t) chipinfo.arch; - GGMLHEXAGON_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, + GGMLHEXAGON_LOG_VERBOSE("htp_type:%d(%s)\n", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {} }; } @@ -3349,7 +3540,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (QNN_SUCCESS != qnnstatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnstatus) { GGMLHEXAGON_LOG_WARN("failed to create QNN device\n"); } else { - GGMLHEXAGON_LOG_INFO("create device successfully\n"); + GGMLHEXAGON_LOG_VERBOSE("create device successfully\n"); } if (PROFILE_OFF != _profile_level) { @@ -3432,9 +3623,9 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { htp_set_memory_grow_size(); if (enable_qnn_rpc()) { - GGMLHEXAGON_LOG_INFO("NPU RPC feature enabled with QNN-NPU backend"); + GGMLHEXAGON_LOG_VERBOSE("NPU RPC feature enabled with QNN-NPU backend"); } else { - GGMLHEXAGON_LOG_INFO("NPU RPC feature disabled with QNN-NPU backend"); + GGMLHEXAGON_LOG_VERBOSE("NPU RPC feature disabled with QNN-NPU backend"); } } @@ -3449,7 +3640,7 @@ int qnn_instance::qnn_finalize() { int ret_status = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; - GGMLHEXAGON_LOG_INFO("enter %s\n", __func__); + GGMLHEXAGON_LOG_VERBOSE("enter %s\n", __func__); ggmlqnn_reset_idx(); free_rpcmem(); @@ -3516,7 +3707,7 @@ int qnn_instance::qnn_finalize() { unload_backend(); unload_system(); - GGMLHEXAGON_LOG_INFO("leave %s\n", __func__); + GGMLHEXAGON_LOG_VERBOSE("leave %s\n", __func__); return ret_status; } @@ -3690,7 +3881,7 @@ void qnn_instance::htp_probe_rpc_meminfo() { free_rpcmem(); _rpcmem_usage = 0; - GGMLHEXAGON_LOG_INFO("capacity of rpc ion memory %d MiB\n", _rpcmem_capacity / SIZE_IN_MB); + GGMLHEXAGON_LOG_VERBOSE("capacity of rpc ion memory %d MiB\n", _rpcmem_capacity / SIZE_IN_MB); } void qnn_instance::htp_print_info() { @@ -3735,10 +3926,10 @@ void qnn_instance::print_backend_info() { status = "No"; } - GGMLHEXAGON_LOG_INFO("%s: %s", name, status); + GGMLHEXAGON_LOG_VERBOSE("%s: %s", name, status); }; - GGMLHEXAGON_LOG_INFO("QNN backend properties:"); + GGMLHEXAGON_LOG_VERBOSE("QNN backend properties:"); print_property("Create context from binary list", QNN_PROPERTY_CONTEXT_SUPPORT_CREATE_FROM_BINARY_LIST_ASYNC); print_property("Dynamic batch", QNN_PROPERTY_GRAPH_SUPPORT_BATCH_MULTIPLE); print_property("Early termination", QNN_PROPERTY_GRAPH_SUPPORT_EARLY_TERMINATION); @@ -3767,7 +3958,7 @@ void qnn_instance::htp_set_memory_grow_size(size_t size) { if (QNN_SUCCESS != result) { GGMLHEXAGON_LOG_WARN("failed to set HTP memory config"); } else { - GGMLHEXAGON_LOG_INFO("succeed to set HTP memory config"); + GGMLHEXAGON_LOG_VERBOSE("succeed to set HTP memory config"); } } @@ -3854,7 +4045,7 @@ void qnn_instance::htp_enter_performance_mode() { if (ret != QNN_SUCCESS) { GGMLHEXAGON_LOG_WARN("failed to set HTP power config"); } else { - GGMLHEXAGON_LOG_INFO("succeed to set HTP power config"); + GGMLHEXAGON_LOG_VERBOSE("succeed to set HTP power config"); } } @@ -4066,7 +4257,7 @@ static void 
ggmlqnn_compute_elementwise(ggml_backend_hexagon_context * ctx, ggml } } else { GGML_ASSERT(instance->get_device_id() == ctx->device); - GGMLHEXAGON_LOG_INFO("graph name %s", graph_name.c_str()); + GGMLHEXAGON_LOG_VERBOSE("graph name %s", graph_name.c_str()); //create QNN graph error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), g_hexagon_appcfg.vtcm_size_in_mb, @@ -4391,7 +4582,7 @@ static void ggmlqnn_compute_mul_mat_4d(ggml_backend_hexagon_context * ctx, ggml_ in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose operation when offloading mulmat to QNN backend. this implementation will handle transpose - in func ggmlqnn_compute_create_general_tensor() + in func ggmlqnn_create_general_tensor() * @param ctx the context of backend * @param op the destination tensor where the result of the matrix multiplication will be stored. @@ -4459,7 +4650,7 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_hexagon_context * ctx, ggml_ten p_tensor2_transpose = tensors[4]; } else { //create QNN graph - GGMLHEXAGON_LOG_INFO("graph name %s", graph_name.c_str()); + GGMLHEXAGON_LOG_VERBOSE("graph name %s", graph_name.c_str()); error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), g_hexagon_appcfg.vtcm_size_in_mb, g_hexagon_appcfg.hvx_threads); @@ -4766,7 +4957,20 @@ static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_doma int hexagon_err = AEE_SUCCESS; int ss_info = 0; void * buffer = nullptr; - ss_info = strcmp(domain_type, "NSP")? HPASS: NSP; +#if 0 +typedef enum { + /** Flag to be used to query list of all available domains */ + ALL_DOMAINS, + NSP, + LPASS, + SDSP, + MODEM, + HPASS, +} fastrpc_domain_type; +#endif + //ss_info = strcmp(domain_type, "NSP") ? HPASS: NSP; + //forward compatible with new SDK + ss_info = (0 == memcmp(domain_type, "NSP", 3)) ? 
1 : 5; system_req_payload req; memset(&req, 0, sizeof(system_req_payload)); req.id = FASTRPC_GET_DOMAINS; @@ -4993,7 +5197,11 @@ static void ggmlhexagon_set_rpc_latency(remote_handle64 handle, int qos, int lat GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); goto bail; } else { - GGMLHEXAGON_LOG_INFO("set rpc qos %d, latency %d\n", qos, latency); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("set rpc qos %d, latency %d\n", qos, latency); + } else { + GGMLHEXAGON_LOG_INFO("set rpc qos %d, latency %d\n", qos, latency); + } } } else { hexagon_error = AEE_EUNSUPPORTEDAPI; @@ -5004,6 +5212,41 @@ static void ggmlhexagon_set_rpc_latency(remote_handle64 handle, int qos, int lat return; } +/** + * set FastRPC thread priority (default unchanged at 192) + * priority values range from 1 to 255, with smaller values representing higher priorities + * Unprivileged clients: 64 through 254 (cDSP only) + * Privileged clients: 1 through 254 + * + * ref:file:///opt/qcom/Hexagon_SDK/6.2.0.1/docs/software/system_integration.html#priority-levels + */ +static int ggmlhexagon_set_priority(int domain, int priority) { + int err = 0; + + if (priority < 1) { + priority = 1; + } + if (priority > 255) { + priority = 255; + } + + if (remote_session_control) { + struct remote_rpc_thread_params data; + data.domain = domain; + data.prio = priority; + data.stack_size = -1; + err = remote_session_control(FASTRPC_THREAD_PARAMS, (void *)&data, sizeof(data)); + if (err != AEE_SUCCESS) { + GGMLHEXAGON_LOG_WARN("remote_session_control failed with 0x%x when setting thread priority\n", err); + } else { + GGMLHEXAGON_LOG_VERBOSE("thread priority set to %d\n", priority); + } + } else { + GGMLHEXAGON_LOG_WARN("cannot set thread priority\n"); + } + return err; +} + static bool ggmlhexagon_is_status_notification_supported(int domain) { int hexagon_error = AEE_SUCCESS; @@ -5219,7 +5462,11 @@ static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { ctx->rpc_mempool_capacity = candidate_size * SIZE_IN_MB; GGMLHEXAGON_LOG_DEBUG("rpc memory capacity %ld(%d MiB) for device %d", ctx->rpc_mempool_capacity, ctx->rpc_mempool_capacity / SIZE_IN_MB, ctx->device); - GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB); + } else { + GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB); + } if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { GGML_ASSERT(ctx->rpc_mempool_capacity > (8 * SIZE_IN_MB)); @@ -5265,14 +5512,22 @@ static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx) { ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version); if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) { - GGMLHEXAGON_LOG_INFO("dsp arch version 0x%x", dsp_version); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("dsp arch version 0x%x", dsp_version); + } else { + GGMLHEXAGON_LOG_INFO("dsp arch version 0x%x", dsp_version); + } //0x68 -> 68, 0x69 -> 69, 0x73 -> 73, 0x75 -> 75, 0x79 -> 79 size_t htp_arch = ggmlhexagon_htparch_hex_to_decimal(dsp_version); GGMLHEXAGON_LOG_DEBUG("dsp arch version %d", htp_arch); struct qcom_socinfo * socinfo = ggmlhexagon_get_socinfo_from_socmodel(htp_arch); if (nullptr != socinfo) { //got fully 
description of SoC when hwaccel approach is HWACCEL_CDSP - GGMLHEXAGON_LOG_INFO("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch)); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch)); + } else { + GGMLHEXAGON_LOG_INFO("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch)); + } } } else { GGMLHEXAGON_LOG_WARN("error: dsp arch version 0x%x is not supported", dsp_version); @@ -5282,27 +5537,42 @@ static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx) { uint32_t vtcm_page = 0; ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_COUNT, &vtcm_count); ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_PAGE, &vtcm_page); - GGMLHEXAGON_LOG_INFO("vtcm_count %d", vtcm_count); - GGMLHEXAGON_LOG_INFO("vtcm_page %d", vtcm_page); uint32_t hmx_depth = 0; uint32_t hmx_spatial = 0; ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_DEPTH, &hmx_depth); ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_SPATIAL, &hmx_spatial); - GGMLHEXAGON_LOG_INFO("hmx_depth %d", hmx_depth); - GGMLHEXAGON_LOG_INFO("hmx_spatial %d", hmx_spatial); uint32_t hvx_support_128b = 0; ggmlhexagon_get_hvx_support_info(ctx->domain_id, HVX_SUPPORT_128B, &hvx_support_128b); - GGMLHEXAGON_LOG_INFO("hvx_support_128b %d", hvx_support_128b); - GGMLHEXAGON_LOG_INFO("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); - GGMLHEXAGON_LOG_INFO("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); + if (ggmlhexagon_is_llamabench_running()) { + //make llama-bench happy + GGMLHEXAGON_LOG_VERBOSE("vtcm_count %d", vtcm_count); + GGMLHEXAGON_LOG_VERBOSE("vtcm_page %d", vtcm_page); + GGMLHEXAGON_LOG_VERBOSE("hmx_depth %d", hmx_depth); + GGMLHEXAGON_LOG_VERBOSE("hmx_spatial %d", hmx_spatial); + GGMLHEXAGON_LOG_VERBOSE("hvx_support_128b %d", hvx_support_128b); + GGMLHEXAGON_LOG_VERBOSE("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); + GGMLHEXAGON_LOG_VERBOSE("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); + } else { + GGMLHEXAGON_LOG_INFO("vtcm_count %d", vtcm_count); + GGMLHEXAGON_LOG_INFO("vtcm_page %d", vtcm_page); + GGMLHEXAGON_LOG_INFO("hmx_depth %d", hmx_depth); + GGMLHEXAGON_LOG_INFO("hmx_spatial %d", hmx_spatial); + GGMLHEXAGON_LOG_INFO("hvx_support_128b %d", hvx_support_128b); + GGMLHEXAGON_LOG_INFO("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); + GGMLHEXAGON_LOG_INFO("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); + } } static void ggmlhexagon_deinit_cdsp(ggml_backend_hexagon_context * ctx) { int hexagon_error = AEE_SUCCESS; - GGMLHEXAGON_LOG_INFO("enter %s", __func__); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("enter %s", __func__); + } else { + GGMLHEXAGON_LOG_INFO("enter %s", __func__); + } if (0 != ctx->ggmlop_handle) { hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); if (AEE_SUCCESS != hexagon_error) { @@ -5314,13 +5584,18 @@ static void ggmlhexagon_deinit_cdsp(ggml_backend_hexagon_context * ctx) { ggmlhexagon_deinit_rpcmempool(ctx); ctx->domain_id = -1; - GGMLHEXAGON_LOG_INFO("leave %s", __func__); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("leave %s", __func__); + } else { + GGMLHEXAGON_LOG_INFO("leave %s", __func__); + } } static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { static std::mutex mutex; 
std::lock_guard<std::mutex> lock(mutex); + // load dynamic functions from Qualcomm's rpcmem library (moved into this DSP init function) #if defined(__ANDROID__) || defined(__linux__) std::filesystem::path full_path(std::string(g_hexagon_appcfg.runtime_libpath) + "libcdsprpc.so"); //full_path /= std::filesystem::path("libcdsprpc.so").filename(); @@ -5455,8 +5730,13 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { } ctx->domain_id = domain_id; - GGMLHEXAGON_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); - GGMLHEXAGON_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_VERBOSE("unsignedpd_enabled %d", is_unsignedpd_enabled); + } else { + GGMLHEXAGON_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); + } if (is_unsignedpd_enabled) { if (remote_session_control) { struct remote_rpc_control_unsigned_module data; @@ -5465,7 +5745,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { hexagon_error = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *)&data, sizeof(data)); GGMLHEXAGON_LOG_DEBUG("remote_session_control returned %d for configuring unsigned PD success", hexagon_error); if (AEE_SUCCESS != hexagon_error) { - GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control failed", hexagon_error); + GGMLHEXAGON_LOG_WARN("error 0x%x: remote_session_control failed", hexagon_error); } } else { GGMLHEXAGON_LOG_DEBUG("unsigned PD not supported on this device"); @@ -5482,6 +5762,13 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_WARN("error 0x%x: failed to compute on domain %d", hexagon_error, domain_id); goto bail; } + ggmlhexagon_set_priority(domain_id, 160); + + ggmlop_domain_uri_len = strlen(ggmlop_URI) + MAX_DOMAIN_NAMELEN; + ggmlop_domain_uri = (char *)malloc(ggmlop_domain_uri_len); + if (NULL == ggmlop_domain_uri) { + goto bail; + } // we copy the appropriate ggmlop-skel into our runtime libpath { @@ -5491,8 +5778,8 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) { - // delete the file $(g_hexagon_appcfg.runtime_libpath)/libggmlop-skel.so if it exists - std::string filepath = std::string(g_hexagon_appcfg.runtime_libpath) + "/libggmlop-skel.so"; + // delete the file $(g_hexagon_appcfg.runtime_libpath)/libggmldsp-skel.so if it exists + std::string filepath = std::string(g_hexagon_appcfg.runtime_libpath) + "/libggmldsp-skel.so"; if (std::filesystem::exists(filepath)) { std::filesystem::remove(filepath); } @@ -5500,13 +5787,13 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { // detect the htp arch number size_t htp_arch = ggmlhexagon_htparch_hex_to_decimal(dsp_version); - // find the file $(g_hexagon_appcfg.runtime_libpath)/libggmldsp-skelV$(htp_arch).so if it exists - // copy and rename it to libggmldsp-skel.so in the same folder // Construct file paths - std::string source_filename = std::string("libggmlop-skelV") + std::to_string(htp_arch) + ".so"; -
std::string source_path = std::string(g_hexagon_appcfg.runtime_libpath) + "/" + source_filename; - std::string dest_path = std::string(g_hexagon_appcfg.runtime_libpath) + "/libggmlop-skel.so"; + std::string source_filename = std::string("libggmldsp-skelV") + std::to_string(htp_arch) + ".so"; + std::string source_path = std::string(g_hexagon_appcfg.runtime_libpath) + source_filename; + std::string dest_path = std::string(g_hexagon_appcfg.runtime_libpath) + "libggmldsp-skel.so"; // Check if source file exists if (std::filesystem::exists(source_path)) { @@ -5538,12 +5825,18 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_DEBUG("ggmlop domain uri:%s", ggmlop_domain_uri); hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); if (AEE_SUCCESS == hexagon_error) { - GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); - //FIXME: only support offload fp32 GGML_OP_MUL_MAT to cDSP - GGMLHEXAGON_LOG_INFO("only support offload fp32 GGML_OP_ADD and fp32 GGML_OP_MUL_MAT to cDSP currently"); + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_VERBOSE("currently only GGML_OP_ADD and GGML_OP_MUL_MAT can be offloaded to cDSP"); + } else { + GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_INFO("currently only GGML_OP_ADD and GGML_OP_MUL_MAT can be offloaded to cDSP"); + } ggmlhexagon_probe_dspinfo(ctx); //FIXME: re-use this function to pass thread_counts info to code on cDSP side before fully understand qidl mechanism - ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1, g_hexagon_appcfg.thread_counts); + //ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1, g_hexagon_appcfg.thread_counts); + //backward compatible with previous code on the cDSP side + ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, g_hexagon_appcfg.mulmat_algotype, g_hexagon_appcfg.thread_counts); ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 100); int result = ggmlhexagon_init_rpcmempool(ctx); if (0 != result) { @@ -5559,6 +5852,10 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { //make sure test-backend-ops get the correct backend name when hwaccel approach is 2(HWACCEL_CDSP) memcpy(g_hexagon_mgr[ctx->device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP")); + if (NULL != ggmlop_domain_uri) { + free(ggmlop_domain_uri); + ggmlop_domain_uri = NULL; + } return 0; bail: @@ -5705,8 +6002,6 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const case GGML_OP_MUL_MAT: { ggmlhexagon_dump_op_info(op_tensor); - //FIXME:keep same filter logic with QNN solution to compare NPU performance between cDSP approach - // and QNN-NPU approach, remove these filters in the future if (src0_rank != src1_rank) return false; if (src0_rank != 2) @@ -5714,7 +6009,7 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const return false; if (1 == g_hexagon_appcfg.enable_q_mulmat) { if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { - return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32); + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32); } return (src0->type == GGML_TYPE_F32
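[Reviewer note] The hunk below reworks the mulmat dtype gating for the QNN paths; the QNN-NPU branch reduces to the following predicate (sketch only; it assumes the usual ggml type enums and mirrors the added lines in the hunk):

```cpp
#include "ggml.h"

static bool qnn_npu_mulmat_supported(const ggml_tensor * src0, const ggml_tensor * src1,
                                     const ggml_tensor * dst, bool enable_q_mulmat) {
    if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false;
    if (!enable_q_mulmat) return src0->type == GGML_TYPE_F32;
    return src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q4_0 ||
           src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q6_K ||
           src0->type == GGML_TYPE_Q8_K;
}
```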
static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const
             if (src0_rank != 2) {
                 // FIXME: there are some limitations for mulmat in QNN SDK: rank >= 2.
-                //        keep same filter logic with QNN solution to compare NPU performance between
-                //        cDSP approach and QNN-NPU approach, remove these filters in the future
                 return false;
             }
             if (ctx->device == HEXAGON_BACKEND_QNNNPU) {
-                if (1 == g_hexagon_appcfg.enable_q_mulmat)
+                if (1 == g_hexagon_appcfg.enable_q_mulmat) {
                     return (src0->type == GGML_TYPE_F32
                             || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0
                             || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K
                            ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
-                else
+                } else {
                     return (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && op_tensor->type == GGML_TYPE_F32);
+                }
             } else {
-                return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type))
+                if (1 == g_hexagon_appcfg.enable_q_mulmat) {
+                    return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type))
                         && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
+                } else {
+                    return (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && op_tensor->type == GGML_TYPE_F32);
+                }
             }
         }
         case GGML_OP_LOG:
@@ -6076,10 +6374,13 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
     if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
         GGMLHEXAGON_LOG_DEBUG("device %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device));
         GGML_ASSERT(nullptr != ctx->rpc_mempool);
-        GGMLHEXAGON_LOG_DEBUG("size %ld(%d MiB), rpc_mempool_usage %ld(%d MiB), rpc_mempool_len %ld(%d MiB)",
+        GGMLHEXAGON_LOG_VERBOSE("size %ld(%d MiB), rpc_mempool_usage %ld(%d MiB), rpc_mempool_len %ld(%d MiB)",
                        size, size / SIZE_IN_MB,
                        ctx->rpc_mempool_usage, ctx->rpc_mempool_usage / SIZE_IN_MB,
                        ctx->rpc_mempool_len, ctx->rpc_mempool_len / SIZE_IN_MB);
-        GGML_ASSERT(size + ctx->rpc_mempool_usage <= ctx->rpc_mempool_len);
+        if (size + ctx->rpc_mempool_usage >= ctx->rpc_mempool_len) {
+            GGMLHEXAGON_LOG_WARN("device memory allocation of size %ld failed", size);
+            return nullptr;
+        }
         buffer_ctx->buffer = (static_cast<char *>(ctx->rpc_mempool)) + ctx->rpc_mempool_usage;
         GGMLHEXAGON_LOG_DEBUG("buffer_ctx->buffer %p", buffer_ctx->buffer);
         GGML_ASSERT(nullptr != buffer_ctx->buffer);
@@ -6303,7 +6604,7 @@ static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev,
 static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_t dev, const char * params) {
     GGML_UNUSED(dev);
     GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__);
-    size_t dev_index = 0;
+    int dev_index = 0;

     //case-1: test-backend-ops or other similar scenario: calling ggml_backend_dev_init(dev, reinterpret_cast<const char *>(i)) directly in user's code
     ggmlhexagon_load_cfg();
@@ -6319,11 +6620,19 @@ static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_
             return nullptr;
         }
     } else {
-        GGMLHEXAGON_LOG_INFO("program specified param is not nullptr");
+        GGMLHEXAGON_LOG_VERBOSE("program specified param is not nullptr");
         //user's program calling ggml_backend_hexagon_device_init_backend directly
         dev_index = (int)(intptr_t)params;
+        if (dev_index < 0) {
+            GGMLHEXAGON_LOG_VERBOSE("it shouldn't happen\n");
+            //test-thread-safety might be running at the moment or an invalid value passed from user's program
+            dev_index = HEXAGON_BACKEND_QNNCPU; //0
+        }
+        if (dev_index > GGML_HEXAGON_MAX_DEVICES) {
+            dev_index = HEXAGON_BACKEND_GGML; //4
+
} g_hexagon_appcfg.hexagon_backend = dev_index; - GGMLHEXAGON_LOG_INFO("program specified dev_index %d\n", dev_index); + GGMLHEXAGON_LOG_VERBOSE("program specified dev_index %d\n", dev_index); } GGMLHEXAGON_LOG_DEBUG("hexagon_backend=%d", dev_index); ggml_backend_t hexagon_backend = ggml_backend_hexagon_init(dev_index, g_hexagon_appcfg.runtime_libpath); @@ -6720,7 +7029,7 @@ const char * ggml_backend_hexagon_get_devname(size_t dev_num) { static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_lib_path) { int result = 0; - GGMLHEXAGON_LOG_INFO("device=%d, hwaccel approach=%d(%s)", device, g_hexagon_appcfg.hwaccel_approach, + GGMLHEXAGON_LOG_VERBOSE("device=%d, hwaccel approach=%d(%s)", device, g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); qnn_instance * instance = nullptr; @@ -6740,7 +7049,7 @@ static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_ } std::string device_name = ggml_backend_hexagon_get_devname(device); - GGMLHEXAGON_LOG_INFO("qnn device name %s", device_name.c_str()); + GGMLHEXAGON_LOG_VERBOSE("qnn device name %s", device_name.c_str()); g_hexagon_mgr[device].instance = instance; g_hexagon_mgr[device].raw_interface = instance->get_qnn_raw_interface(); g_hexagon_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); @@ -6777,9 +7086,6 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * runtime_lib ggmlhexagon_set_runtime_path(device, runtime_libpath); } - // the condition above never be true because our hardcoded runtime_libpath is always the same as the config, so we manually set the library paths here - ggmlhexagon_set_runtime_path(g_hexagon_appcfg.hexagon_backend, g_hexagon_appcfg.runtime_libpath); - if (nullptr != g_hexagon_mgr[device].backend) { GGMLHEXAGON_LOG_DEBUG("backend %d(%s) already loaded", device, ggml_backend_hexagon_get_devname(device)); @@ -6811,7 +7117,7 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * runtime_lib } } else { //get fully description of SoC when hwaccel approach is HWACCEL_QNN and backend is HEXAGON_BACKEND_QNNNPU - GGMLHEXAGON_LOG_INFO("device name %s", ggml_backend_hexagon_device_get_description(hexagon_backend->device)); + GGMLHEXAGON_LOG_VERBOSE("device name %s", ggml_backend_hexagon_device_get_description(hexagon_backend->device)); } GGMLHEXAGON_LOG_DEBUG("leave %s", __func__); diff --git a/ggml/src/ggml-hexagon/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile index 0e6b3fa2e4df6..b3e7f038cb866 100755 --- a/ggml/src/ggml-hexagon/kernels/Makefile +++ b/ggml/src/ggml-hexagon/kernels/Makefile @@ -7,7 +7,7 @@ HEXAGON_COMPUTE=compute${HTP_ARCH_VERSION} HEXAGON_CC=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang HEXAGON_CXX=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang -TARGET=libggmlop-skel.so +TARGET=libggmldsp-skel.so $(info HEXAGON_SDK_PATH:${HEXAGON_SDK_PATH}) $(info HTP_ARCH_VERSION:${HTP_ARCH_VERSION}) @@ -23,12 +23,14 @@ LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB #SRCS = $(wildcard *.c) SRCS = ggml-dsp.c skel.c entry.c add.c mulmat.c OBJS = $(patsubst %.c, %.o, $(SRCS)) +OBJS += dot.o +OBJS += worker_pool.o ALL:$(OBJS) ${HEXAGON_CC} ${LDFLAGS} -o ${TARGET} -Wl,--start-group ${OBJS} -Wl,--end-group @ls -l ${TARGET} - /bin/cp -fv ${TARGET} ../../../../out/android/bin/ - /bin/cp -fv ${TARGET} ../../../../out/android/bin/libggmlop-skel${HTP_ARCH_VERSION}.so + /bin/cp -fv 
${TARGET} ../../../../out/ggmlhexagon-android/bin/ + /bin/cp -fv ${TARGET} ../../../../out/ggmlhexagon-android/bin/libggmldsp-skel${HTP_ARCH_VERSION}.so /bin/rm -f *.so %.o:%.c @@ -36,5 +38,16 @@ ALL:$(OBJS) ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< @echo "\n" +%.o:%.S + @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" + ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + +%.o:%.cpp + @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" + ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + clean: rm -f *.o + /bin/rm -f *.so diff --git a/ggml/src/ggml-hexagon/kernels/dot.S b/ggml/src/ggml-hexagon/kernels/dot.S new file mode 100755 index 0000000000000..2031a6001519b --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/dot.S @@ -0,0 +1,136 @@ +/**============================================================================= +@file + qhblas_f_vector_dot_af.S + +@brief + Calculates dot product of two input float vectors. + + Function prototype + + int32_t qhblas_f_vector_dot_af(float_a8_t *input_1, float_a8_t *input_2, float *output, uint32_t size); + + Reference C code + + int32_t qhblas_f_vector_dot_af(float_a8_t *input_1, float_a8_t *input_2, float *output, uint32_t size) + { + if ((input_1 == NULL) || (input_2 == NULL) || (output == NULL) || (size == 0)) + { + return -1; + } + + float dot = 0; + for (uint32_t i = 0; i < size; ++i) + { + dot += input_1[i] * input_2[i]; + } + + *output = dot; + return 0; + } + +Copyright (c) 2019 Qualcomm Technologies Incorporated. +All Rights Reserved. Qualcomm Proprietary and Confidential. +=============================================================================**/ + +/*============================================================================*/ + + .p2align 2 + .p2align 4,,15 + .global qhblas_f_vector_dot_af + .type qhblas_f_vector_dot_af, @function + +/*============================================================================*/ + +#define DC_PREFETCH_AHEAD 64 // number of bytes for DCFETCH +#define L2_PREFETCH_AHEAD 256 // number of bytes for L2FETCH +#define L2FETCH_CONFIG 0x0100FF00+(L2_PREFETCH_AHEAD/256) // [stride = 256 : width = 255 : height = bytes/256] +#define L2_PREFETCH_ELEMS L2_PREFETCH_AHEAD/8 // number of elements to prefetch with L2FETCH + +/*============================================================================*/ + +qhblas_f_vector_dot_af: +{ + p0 = !cmp.eq(r0,#0) // input_1 != NULL + p0 = !cmp.eq(r1,#0) // input_2 != NULL + p0 = !cmp.eq(r2,#0) // output != NULL + p0 = cmp.gtu(r3,#0) // size > 0 + if (!p0.new) jump:nt .L_ret +} +{ + r10 = #0 + r3 = lsr(r3,#1) // size / 2 + p1 = tstbit(r3,#0) // check for odd size + if(cmp.eq(r3.new,#0)) jump:nt .L_do_one +} +{ + r7:6 = #0 + r9:8 = #0 + r5 = add(r3,#7) // (size / 2) + 7 + p2 = cmp.gtu(r3,#L2_PREFETCH_ELEMS) // check whether we can do l2fetch +} +{ + r5 = lsr(r5,#3) // ceil(size / 2) + r14 = mux(p2,r3,#0) // set l2fetch counter +} +{ + r13:12 = combine(##L2FETCH_CONFIG,#8) // set l2fetch config and max number of iterations for .L_loop_do_two + loop1(.L_prefetch_loop_do_two,r5) +} + .falign +.L_prefetch_loop_do_two: +{ + dcfetch(r0+#DC_PREFETCH_AHEAD) // prefetch ahead for input_1 + r5 = min(r12,r3) // min(8, size / 2) +} +{ + dcfetch(r1+#DC_PREFETCH_AHEAD) // prefetch ahead for input_2 + loop0(.L_loop_do_two,r5) + p2 = cmp.eq(r3,r14) // check whether to do l2fetch + if (!p2.new) jump:t .L_loop_do_two +} +{ + r5 = 
add(r3,#-L2_PREFETCH_ELEMS) // number of elements left to prefetch ahead + r15 = add(r0,#L2_PREFETCH_AHEAD) // input_1 addr for l2fetch +} +{ + p2 = cmp.gtu(r5,#L2_PREFETCH_ELEMS) // check whether we can continue to do l2fetch + r15 = add(r1,#L2_PREFETCH_AHEAD) // input_2 addr for l2fetch + l2fetch(r15,r13) +} +{ + if (p2) r14 = add(r14,#-L2_PREFETCH_ELEMS) // adjust l2fetch counter + if (!p2) r14 = #0 // there are no more bytes left to prefetch ahead + l2fetch(r15,r13) +} + .falign +.L_loop_do_two: +{ + r7:6 = memd(r0++#8) + r9:8 = memd(r1++#8) + r10 += sfmpy(r7,r9) +} +{ + r10 += sfmpy(r6,r8) + r3 = add(r3,#-1) // adjust (size / 2) +}:endloop0:endloop1 +{ + r10 += sfmpy(r7,r9) + if (!p1) jump:nt .L_ret +} + .falign +.L_do_one: +{ + r4 = memw(r0) + r5 = memw(r1) +} +{ + r10 += sfmpy(r4,r5) +} + .falign +.L_ret: +{ + if (p0) memw(r2) = r10 + r0 = mux(p0,#0,#-1) + jumpr r31 +} + .size qhblas_f_vector_dot_af, .-qhblas_f_vector_dot_af diff --git a/ggml/src/ggml-hexagon/kernels/entry.c b/ggml/src/ggml-hexagon/kernels/entry.c index ea38beea673c0..8af93ea1d3082 100644 --- a/ggml/src/ggml-hexagon/kernels/entry.c +++ b/ggml/src/ggml-hexagon/kernels/entry.c @@ -34,7 +34,7 @@ int ggmlop_dsp_close(remote_handle64 handle) { return 0; } -AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) { +AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 mulmat_algo, int32 thread_counts) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); HAP_power_request_t request; memset(&request, 0, sizeof(HAP_power_request_t)); @@ -60,7 +60,7 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 request.type = HAP_power_set_DCVS_v2; request.dcvs_v2.dcvs_enable = TRUE; request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level; - if (dcvs_enabled) { + if (mulmat_algo) { request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE; request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE; } else { diff --git a/ggml/src/ggml-hexagon/kernels/mulmat.c b/ggml/src/ggml-hexagon/kernels/mulmat.c index f7494c8eaacf4..f34b6f8b09b4e 100644 --- a/ggml/src/ggml-hexagon/kernels/mulmat.c +++ b/ggml/src/ggml-hexagon/kernels/mulmat.c @@ -145,7 +145,6 @@ static void ggml_compute_forward_mul_mat_one_chunk(const ggml_tensor *src0, cons } } -//TODO: only support fp32 mulmat on cDSP static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); ggmlhexagon_dump_tensor(src0, 0); @@ -274,7 +273,6 @@ static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * return 0; } -//TODO:multithreading mulmat static int ggmlop_dsp_mulmat_multithread(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); diff --git a/ggml/src/ggml-hexagon/kernels/skel.c b/ggml/src/ggml-hexagon/kernels/skel.c index 26da58273f013..b216d66a654ab 100644 --- a/ggml/src/ggml-hexagon/kernels/skel.c +++ b/ggml/src/ggml-hexagon/kernels/skel.c @@ -289,8 +289,8 @@ extern int adsp_mmap_fd_getinfo(int, uint32_t *); #ifdef __cplusplus extern "C" { #endif -_ATTRIBUTE_VISIBILITY uint32_t ggmlop_skel_handle_invoke_qaic_version = 10048; -_ATTRIBUTE_VISIBILITY char 
ggmlop_skel_handle_invoke_uri[77+1]="file:///libggmlop-skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; +_ATTRIBUTE_VISIBILITY uint32_t ggmldsp_skel_handle_invoke_qaic_version = 10048; +_ATTRIBUTE_VISIBILITY char ggmldsp_skel_handle_invoke_uri[79+1]="file:///libggmldsp-skel.so?ggmldsp_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { int _nErr = 0; remote_arg* _praROutPostStart = _praROutPost; @@ -598,7 +598,7 @@ static __inline int _skel_method_3(int (*_pfn)(const char*, remote_handle64*), u _QAIC_CATCH(_nErr) {} return _nErr; } -__QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmlop_skel_handle_invoke)(remote_handle64 _h, uint32_t _sc, remote_arg* _pra) __QAIC_SKEL_ATTRIBUTE { +__QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmldsp_skel_handle_invoke)(remote_handle64 _h, uint32_t _sc, remote_arg* _pra) __QAIC_SKEL_ATTRIBUTE { switch(REMOTE_SCALARS_METHOD(_sc)){ case 0: return _skel_method_3(__QAIC_IMPL(ggmlop_dsp_open), _sc, _pra); diff --git a/ggml/src/ggml-hexagon/kernels/skel.h b/ggml/src/ggml-hexagon/kernels/skel.h index 194c71e6ecb2a..4850265ee504f 100644 --- a/ggml/src/ggml-hexagon/kernels/skel.h +++ b/ggml/src/ggml-hexagon/kernels/skel.h @@ -272,15 +272,12 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_open)(const char* uri, remote_ * @retval, 0 on success, should always succeed */ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 dcvs_enable, int32 threads) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 mulmat_algotype, int32 thread_counts) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_softmax)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_rmsnorm)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_pool2d)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; -#ifndef ggmlop_URI -#define ggmlop_URI "file:///libggmlop-skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" -#endif /*ggmlop_URI*/ #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-hexagon/kernels/stub.c b/ggml/src/ggml-hexagon/kernels/stub.c index 6074d243610df..7936c43cd6d77 100644 --- a/ggml/src/ggml-hexagon/kernels/stub.c +++ b/ggml/src/ggml-hexagon/kernels/stub.c @@ -312,9 +312,9 @@ static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_ 
}
 return _nErr;
}
-__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 dcvs_enable, int32 threads) __QAIC_STUB_ATTRIBUTE {
+__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 mulmat_algotype, int32 threads) __QAIC_STUB_ATTRIBUTE {
   uint32_t _mid = 2;
-  return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&dcvs_enable, (uint32_t*)&threads);
+  return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&mulmat_algotype, (uint32_t*)&threads);
 }
 static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) {
   int _nErr = 0;
diff --git a/ggml/src/ggml-hexagon/kernels/worker_pool.cpp b/ggml/src/ggml-hexagon/kernels/worker_pool.cpp
new file mode 100755
index 0000000000000..8186edcf18a95
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/worker_pool.cpp
@@ -0,0 +1,475 @@
+/**=============================================================================
+
+@file
+   worker_pool.cpp
+
+@brief
+   Utility providing a multi-priority thread worker pool for
+   multi-threaded computer vision (or other compute) applications.
+
+Copyright (c) 2019-2020 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+
+Export of this technology or software is regulated by the U.S.
+Government. Diversion contrary to U.S. law prohibited.
+
+All ideas, data and information contained in or disclosed by
+this document are confidential and proprietary information of
+Qualcomm Technologies Incorporated and all rights therein are expressly reserved.
+By accepting this material the recipient agrees that this material
+and the information contained therein are held in confidence and in
+trust and will not be used, copied, reproduced in whole or in part,
+nor its contents revealed in any manner to others without the express
+written permission of Qualcomm Technologies Incorporated.
+
+=============================================================================**/
+
+/*===========================================================================
+    INCLUDE FILE
+===========================================================================*/
+#include <stdio.h>   // snprintf
+#include <stdlib.h>  // malloc/free
+#include <string.h>  // strcat
+#include "worker_pool.h"
+
+#ifndef _DEBUG
+#define _DEBUG
+#endif
+#include "HAP_farf.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "qurt.h"
+#include "hexagon_protos.h"
+
+void worker_pool_constructor(void) __attribute__((constructor));
+void worker_pool_destructor(void) __attribute__((destructor));
+
+#ifdef __cplusplus
+}
+#endif
+
+/*===========================================================================
+    DEFINE
+===========================================================================*/
+#define WORKER_THREAD_STACK_SZ 2 *16384
+#define WORKER_KILL_SIGNAL 31 // signal to kill the worker threads
+#define NUM_JOB_SLOTS (MAX_NUM_WORKERS + 1) // max queued jobs, slightly more than number of workers.
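+// Design note: the job queue is built from two QuRT "anysignal" words. Each of
+// the NUM_JOB_SLOTS slots owns one bit: a set bit in empty_jobs means the slot
+// is free, a set bit in queued_jobs means the slot holds a pending job. Bit 31
+// is reserved for WORKER_KILL_SIGNAL below, which is why NUM_JOB_SLOTS must
+// stay below 31.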
+#define LOWEST_USABLE_QURT_PRIO 254 + +/*=========================================================================== + TYPEDEF +===========================================================================*/ +// internal structure kept in thread-local storage per instance of worker pool +typedef struct +{ + qurt_anysignal_t empty_jobs; // available job nodes + qurt_anysignal_t queued_jobs; // jobs that are waiting for a worker + qurt_mutex_t empty_jobs_mutex; // mutex for multiple threads trying to send a job + qurt_mutex_t queued_jobs_mutex; // mutex for multiple threads trying to acquire a job + unsigned int job_queue_mask; // mask for job queue nodes + unsigned int num_workers; // number of workers in this pool + worker_pool_job_t job[NUM_JOB_SLOTS]; // list of job descriptors + qurt_thread_t thread[MAX_NUM_WORKERS]; // thread ID's of the workers + void * stack[MAX_NUM_WORKERS]; // thread stack pointers +} worker_pool_t; + +// internal structure containing OS primitives to sync caller with all its spawned jobs. +typedef union +{ + worker_synctoken_t raw; + struct + { + unsigned int atomic_countdown; + unsigned int reserved; // reserved to align next element to 8 bytes + qurt_sem_t sem; + } sync; +} internal_synctoken_t; + +/*=========================================================================== + GLOBAL VARIABLES (per PD) +===========================================================================*/ +// initialized in constructor +unsigned int num_workers = 1; +unsigned int num_hvx128_contexts = 0; + +/*=========================================================================== + STATIC VARIABLES +===========================================================================*/ + +static worker_pool_context_t static_context = NULL; + +/*=========================================================================== + LOCAL FUNCTION +===========================================================================*/ +// the main workloop for each of the worker threads. +static void worker_pool_main(void* context) +{ + // local pointer to owning pool's context + worker_pool_t *me = (worker_pool_t *) context; + + // some local vars to reduce dereferencing inside loop + qurt_anysignal_t *signal = &me->queued_jobs; + unsigned int mask = me->job_queue_mask; + qurt_mutex_t *mutex = &me->queued_jobs_mutex; + + while(1) + { + qurt_mutex_lock(mutex); // mutex only allows 1 thread to wait on signal at a time. QuRT restriction. 
+        (void) qurt_anysignal_wait(signal, mask);                             // wait for a job
+        unsigned int sig_rx = Q6_R_ct0_R(mask & qurt_anysignal_get(signal));  // count trailing 0's to choose flagged job
+        if (sig_rx < NUM_JOB_SLOTS)                                           // if real job
+        {
+            worker_pool_job_t job = me->job[sig_rx];                          // local copy of job descriptor
+            (void) qurt_anysignal_clear(signal, (1 << sig_rx));               // clear the queued job signal
+            (void) qurt_anysignal_set(&me->empty_jobs, (1 << sig_rx));        // send node back to empty list
+            qurt_mutex_unlock(mutex);                                         // unlock the mutex
+            job.fptr(job.dptr);                                               // issue the callback
+        }
+        else if (WORKER_KILL_SIGNAL == sig_rx)
+        {
+            // don't clear the kill signal, leave it for all the workers to see, and exit
+            qurt_mutex_unlock(mutex);
+            qurt_thread_exit(0);
+        }
+        else{
+            FARF(HIGH,"Worker pool received invalid job %d", sig_rx );
+            qurt_mutex_unlock(mutex);
+        }
+        // else ignore
+    }
+}
+
+void worker_pool_constructor()
+{
+    FARF(HIGH, "In worker_pool constructor");
+    qurt_sysenv_max_hthreads_t num_threads;
+    if (QURT_EOK != qurt_sysenv_get_max_hw_threads(&num_threads))
+    {
+        num_workers = MAX_NUM_WORKERS; // Couldn't get number of threads from QuRT, default to MAX_NUM_WORKERS.
+        FARF(HIGH, "Failed to get number of threads. Defaulting to %u", num_workers);
+    }
+    else
+    {
+        num_workers = num_threads.max_hthreads;
+    }
+
+    /* Verify that number of hw threads isn't greater than max supported number of hw threads.
+       Max threads is used as a constant value for array size. */
+    if (num_workers > MAX_NUM_WORKERS)
+    {
+        num_workers = MAX_NUM_WORKERS;
+        FARF(HIGH, "Limiting number of threads to maximum supported value %u", num_workers);
+    }
+
+    num_hvx128_contexts = (qurt_hvx_get_units() >> 8) & 0xFF;
+
+    /* initialize static worker_pool for clients who pass NULL as context.*/
+    if (worker_pool_init(&static_context) != AEE_SUCCESS)
+    {
+        FARF(ERROR, "Could not initialize default worker pool");
+    }
+}
+
+AEEResult worker_pool_init_with_stack_size(worker_pool_context_t *context, int stack_size)
+{
+    int nErr = 0;
+
+    if(stack_size <= 0)
+    {
+        FARF(ERROR, "Stack size must be positive");
+        return AEE_EBADPARM;
+    }
+
+    if (NULL == context)
+    {
+        FARF(ERROR, "NULL context passed to worker_pool_init().");
+        return AEE_EBADPARM;
+    }
+
+    // Allocations
+    int size = (stack_size * num_workers) + (sizeof(worker_pool_t));
+    unsigned char *mem_blob = (unsigned char*)malloc(size);
+    if (!mem_blob)
+    {
+        FARF(ERROR,"Could not allocate memory for worker pool!!");
+        return AEE_ENOMEMORY;
+    }
+
+    worker_pool_t *me = (worker_pool_t *)(mem_blob + stack_size * num_workers);
+
+    // name for the first worker, useful in debugging threads
+    char name[19];
+    snprintf(name, 12, "0x%8x:", (int)me);
+    strcat(name, "worker0");
+    me->num_workers = num_workers;
+    // initializations
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        me->stack[i] = NULL;
+        me->thread[i] = 0;
+    }
+
+    // initialize job queue
+    qurt_anysignal_init(&(me->queued_jobs));
+    qurt_anysignal_init(&(me->empty_jobs));
+    qurt_mutex_init(&(me->empty_jobs_mutex));
+    qurt_mutex_init(&(me->queued_jobs_mutex));
+    me->job_queue_mask = (1 << NUM_JOB_SLOTS) - 1;                     // set a bit for each job node, number of job nodes = num_workers + 1
+    (void) qurt_anysignal_set(&(me->empty_jobs), me->job_queue_mask);  // fill the empty pool.
+    me->job_queue_mask |= (1 << WORKER_KILL_SIGNAL);                   // add the kill signal to the mask.
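+    // Note: the worker stacks and the pool descriptor share the one malloc'd
+    // blob above (stacks first, descriptor last), so worker_pool_deinit()
+    // releases the whole allocation by freeing stack[0].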
+
+    // launch the workers
+    qurt_thread_attr_t attr;
+    qurt_thread_attr_init (&attr);
+
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        // set up stack
+        me->stack[i] = mem_blob;
+        mem_blob += stack_size;
+        qurt_thread_attr_set_stack_addr(&attr, me->stack[i]);
+        qurt_thread_attr_set_stack_size(&attr, stack_size);
+
+        // set up name
+        qurt_thread_attr_set_name(&attr, name);
+        name[17] = (name[17] + 1);
+        // name threads context:worker0, context:worker1, .. (recycle at 9, but num threads should be less than that anyway)
+        if (name[17] > '9') name[17] = '0';
+        // set up priority - by default, match the creating thread's prio
+        int prio = qurt_thread_get_priority(qurt_thread_get_id());
+
+        // If loading thread has priority less than 64, load static worker pool with 64 priority.
+        if(context == &static_context && prio < 64) prio = 64;
+
+        if (prio < 1) prio = 1;
+        if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO;
+
+        qurt_thread_attr_set_priority(&attr, prio);
+
+        // launch
+        nErr = qurt_thread_create(&(me->thread[i]), &attr, worker_pool_main, (void *)me);
+        if (nErr)
+        {
+            FARF(ERROR, "Could not launch worker threads!");
+            worker_pool_deinit((worker_pool_context_t*)&me);
+            return AEE_EQURTTHREADCREATE;
+        }
+    }
+    *context = (worker_pool_context_t*)me;
+    return AEE_SUCCESS;
+}
+
+AEEResult worker_pool_init(worker_pool_context_t *context)
+{
+    return worker_pool_init_with_stack_size(context, WORKER_THREAD_STACK_SZ);
+}
+
+
+// clean up worker pool
+void worker_pool_deinit(worker_pool_context_t *context)
+{
+    worker_pool_t *me = (worker_pool_t*)*context;
+
+    // if no worker pool exists, there is nothing to clean up.
+    if (NULL == me)
+    {
+        return;
+    }
+
+    // de-initializations
+    (void) qurt_anysignal_set(&(me->empty_jobs), (1 << WORKER_KILL_SIGNAL));   // notify to stop new jobs.
+    (void) qurt_anysignal_set(&(me->queued_jobs), (1 << WORKER_KILL_SIGNAL));  // kill worker pool.
+    for (unsigned int i = 0; i < me->num_workers; i++)                         // wait for workers to die
+    {
+        if (me->thread[i])
+        {
+            int status;
+            (void) qurt_thread_join(me->thread[i], &status);
+        }
+    }
+
+    // release resources
+    qurt_mutex_destroy(&(me->empty_jobs_mutex));
+    qurt_mutex_destroy(&(me->queued_jobs_mutex));
+    qurt_anysignal_destroy(&(me->queued_jobs));
+    qurt_anysignal_destroy(&(me->empty_jobs));
+    // free allocated memory (they were allocated as a single buffer starting at stack[0])
+    if (me->stack[0]) free (me->stack[0]);
+    // Assign NULL to freed context so that further reference to it fails.
+    *context = NULL;
+}
+
+// submit a job to the pool.
+AEEResult worker_pool_submit(worker_pool_context_t context, worker_pool_job_t job)
+{
+    worker_pool_t *me = (worker_pool_t*)context;
+
+    // if NULL is passed as worker_pool_context, try to use default static worker_pool
+    if (NULL == me)
+    {
+        if (static_context == NULL)
+        {
+            FARF(HIGH, "No default static worker pool found");
+            return AEE_ERESOURCENOTFOUND;
+        }
+        FARF(MEDIUM, "Using default static worker pool");
+        me = (worker_pool_t*)static_context;
+    }
+
+    // if a worker thread tries to submit a job, call it in-context to avoid recursion deadlock.
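+    // (a worker blocking here on a full queue could deadlock the pool, since
+    // job slots are only recycled by these same worker threads; running the
+    // job directly in the caller's context avoids that)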
+ unsigned int i; + qurt_thread_t id = qurt_thread_get_id(); + for (i = 0; i < me->num_workers; i++) + { + if (id == me->thread[i]) + { + job.fptr(job.dptr); // issue the callback in caller's context + return AEE_SUCCESS; + } + } + + // local vars to reduce dereferencing + qurt_mutex_t *mutex = &me->empty_jobs_mutex; + qurt_anysignal_t *signal = &me->empty_jobs; + unsigned int mask = me->job_queue_mask; + + qurt_mutex_lock(mutex); // lock empty queue + (void) qurt_anysignal_wait(signal, mask); // wait for an empty job node + unsigned int bitfield = qurt_anysignal_get(signal); + + // check if pool is being killed and return early + if (bitfield & (1 << WORKER_KILL_SIGNAL)) + { + qurt_mutex_unlock(mutex); + return AEE_ENOMORE; + } + + // send the job to the queue. + unsigned int sig_rx = Q6_R_ct0_R(mask & bitfield); // count trailing 0's to find first avail node + me->job[sig_rx] = job; // copy job descriptor + (void) qurt_anysignal_clear(signal, (1 << sig_rx)); // clear the empty job node flag + (void) qurt_anysignal_set(&me->queued_jobs, (1 << sig_rx)); // notify of pending job + qurt_mutex_unlock(mutex); // unlock the mutex + + return 0; +} + +void worker_pool_destructor() +{ + FARF(HIGH, "In worker_pool destructor"); + + worker_pool_deinit(&static_context); +} + +/*=========================================================================== + GLOBAL FUNCTION +===========================================================================*/ +// initialize a synctoken - caller will wait on the synctoken and each job will release it. +// caller wakes when all jobs have released. +void worker_pool_synctoken_init(worker_synctoken_t *token, unsigned int njobs) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // initialize atomic counter and semaphore + internal_token->sync.atomic_countdown = njobs; + qurt_sem_init_val(&internal_token->sync.sem, 0); +} + +// worker job responsible for calling this function to count down completed jobs. +void worker_pool_synctoken_jobdone(worker_synctoken_t *token) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // count down atomically, and raise semaphore if last job. + if (0 == worker_pool_atomic_dec_return(&internal_token->sync.atomic_countdown)) + { + (void) qurt_sem_up(&internal_token->sync.sem); + } +} + +// job submitter waits on this function for all jobs to complete. +void worker_pool_synctoken_wait(worker_synctoken_t *token) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // Wait for all jobs to finish and raise the semaphore + (void) qurt_sem_down(&internal_token->sync.sem); + + // clean up the semaphore + (void) qurt_sem_destroy(&internal_token->sync.sem); +} + +AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio) +{ + worker_pool_t *me = (worker_pool_t*)context; + + // if no worker pool exists, return error. 
+    if (NULL == me)
+    {
+        return AEE_ENOMORE;
+    }
+
+    int result = AEE_SUCCESS;
+    if (prio < 1) prio = 1;
+    if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO;
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        int res = qurt_thread_set_priority(me->thread[i], (unsigned short)prio);
+        if (0 != res)
+        {
+            result = AEE_EBADPARM;
+            FARF(ERROR, "QURT failed to set priority of thread %d, ERROR = %d", me->thread[i], res);
+        }
+    }
+    return result;
+}
+
+AEEResult worker_pool_retrieve_threadID(worker_pool_context_t context, unsigned int* threadIDs) {
+
+    worker_pool_t *me = (worker_pool_t*)context;
+    if(me == NULL)
+    {
+        FARF(ERROR, "Context NULL in RetrieveThreadID");
+        return AEE_EBADPARM;
+    }
+
+    for(int i = 0; i < me->num_workers; i++)
+    {
+        threadIDs[i] = me->thread[i];
+        FARF(MEDIUM, "Inside RetrieveThreadID threadIDs[%d] is %d",i,threadIDs[i]);
+    }
+    return AEE_SUCCESS;
+}
+
+
+AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int *prio)
+{
+    worker_pool_t *me = (worker_pool_t*)context;
+
+    // if NULL is passed as context, share static_context's priority.
+    if (NULL == me)
+    {
+        if (static_context == NULL)
+            return AEE_ENOMORE;
+        FARF(HIGH, "Using default static worker pool");
+        me = (worker_pool_t*)static_context;
+    }
+
+    int priority = qurt_thread_get_priority(me->thread[0]);
+    if (priority > 0)
+    {
+        *prio = priority;
+        return 0;
+    }
+    else
+    {
+        *prio = 0;
+        return AEE_EBADSTATE;
+    }
+}
diff --git a/ggml/src/ggml-hexagon/kernels/worker_pool.h b/ggml/src/ggml-hexagon/kernels/worker_pool.h
new file mode 100755
index 0000000000000..701cbf6215f43
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/worker_pool.h
@@ -0,0 +1,329 @@
+#ifndef WORKER_H
+#define WORKER_H
+
+/**=============================================================================
+
+@file
+   worker_pool.h
+
+@brief
+   Utility providing a thread worker pool for multi-threaded computer vision
+   (or other compute) applications.
+
+Copyright (c) 2019-2020 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+
+Export of this technology or software is regulated by the U.S.
+Government. Diversion contrary to U.S. law prohibited.
+
+All ideas, data and information contained in or disclosed by
+this document are confidential and proprietary information of
+Qualcomm Technologies Incorporated and all rights therein are expressly reserved.
+By accepting this material the recipient agrees that this material
+and the information contained therein are held in confidence and in
+trust and will not be used, copied, reproduced in whole or in part,
+nor its contents revealed in any manner to others without the express
+written permission of Qualcomm Technologies Incorporated.
+
+=============================================================================**/
+//==============================================================================
+// Defines
+//==============================================================================
+/// MACRO enables function to be visible in shared-library case.
+#define WORKERPOOL_API __attribute__ ((visibility ("default")))
+
+//==============================================================================
+// Include Files
+//==============================================================================
+
+#include <AEEStdDef.h>  // AEEResult
+#include <AEEStdErr.h>  // AEE_* error codes
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*===========================================================================
+    TYPEDEF
+===========================================================================*/
+/// signature of callbacks to be invoked by worker threads
+typedef void ( *worker_callback_t )( void* );
+
+/// Typedef of worker_pool context
+typedef void* worker_pool_context_t;
+
+/// descriptor for requested callback
+typedef struct
+{
+    /// function pointer
+    worker_callback_t fptr;
+    /// data pointer
+    void* dptr;
+} worker_pool_job_t;
+
+/// opaque client view of synchronization token for job submitter and workers. Internals hidden in implementation.
+typedef struct
+{
+    /// opaque array to store synchronization token for job
+    unsigned int dummy[8]; // large enough to hold a counter and a semaphore
+} worker_synctoken_t __attribute__((aligned(8)));
+
+/*===========================================================================
+    CONSTANTS
+===========================================================================*/
+/// Maximum supported number of worker threads.
+
+#define MAX_NUM_WORKERS 8
+/// Number of workers
+WORKERPOOL_API extern unsigned int num_workers;
+/// Maximum number of hvx 128 bytes units available
+WORKERPOOL_API extern unsigned int num_hvx128_contexts;
+
+//==============================================================================
+// Declarations
+//==============================================================================
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Initialize a worker pool. Should be called by each control thread that
+///   requires its own worker pool.
+///
+///
+/// @param *context
+///   pointer to worker_pool_context_t variable.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_init(worker_pool_context_t *context);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Initialize a worker pool with custom stack size of worker threads.
+///   Should be called by each control thread that requires its own worker pool.
+///
+///
+/// @param *context
+///   pointer to worker_pool_context_t variable.
+/// @param stack_size
+///   stack size of each worker thread.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_init_with_stack_size(worker_pool_context_t *context, int stack_size);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Kill worker threads and release worker pool resources. Must be called
+///   when pool owner no longer requires the pool.
+///
+///
+/// @param *context
+///   worker_pool_context_t.
+///
+//---------------------------------------------------------------------------
+WORKERPOOL_API void
+worker_pool_deinit(worker_pool_context_t *context);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Function to determine if there is an established worker pool available to
+///   the calling thread.
This is an optional call - if no pool is available +/// but attempted to be used, everything works seamlessly, in the client's +/// context (instead of worker context). +/// +/// +/// @param context +/// worker_pool_context_t. +/// +/// @return +/// 0 - no worker pool available. +/// any other value - worker pool available. +//--------------------------------------------------------------------------- +WORKERPOOL_API AEEResult +worker_pool_available(worker_pool_context_t context); + +//--------------------------------------------------------------------------- +/// @brief +/// Submit a job to the worker pool. +/// +/// +/// @param context +/// worker pool context where job is to be submitted. +/// +/// @param job +/// callback function pointer and data. +/// +/// @return +/// 0 - success. +/// any other value - failure. +//--------------------------------------------------------------------------- +WORKERPOOL_API AEEResult +worker_pool_submit(worker_pool_context_t context, worker_pool_job_t job); + +//--------------------------------------------------------------------------- +/// @brief +/// Initialize a synchronization token for job submitter and workers to use. +/// Each worker callback must be given access to the token to release it, and +/// job submitter will wait for all jobs to release the token. Internals are +/// hidden from client. +/// +/// +/// @param token +/// pointer to the synctoken structure. +/// +/// @param njobs +/// number of jobs that will be releasing the token +//--------------------------------------------------------------------------- +WORKERPOOL_API void +worker_pool_synctoken_init(worker_synctoken_t *token, unsigned int njobs); + +//--------------------------------------------------------------------------- +/// @brief +/// Needs to be called by the worker in the callback before exiting. The +/// token must be available to the callback via the data pointer given +/// to the callback during job submission. +/// +/// +/// @param token +/// pointer to the synctoken structure held by the job submitter +//--------------------------------------------------------------------------- +WORKERPOOL_API void +worker_pool_synctoken_jobdone(worker_synctoken_t *token); + +//--------------------------------------------------------------------------- +/// @brief +/// Job submitter calls this function after submitting all jobs to await +/// their completion. +/// +/// +/// @param token +/// pointer to the synctoken structure +//--------------------------------------------------------------------------- +WORKERPOOL_API void +worker_pool_synctoken_wait(worker_synctoken_t *token); + +//--------------------------------------------------------------------------- +/// @brief +/// Set the thread priority of the worker threads. Specified priority will +/// be applied to all threads in the default worker pool. The threads +/// that service boosted and background job requests will also be adjusted to be relative +/// to the new default thread priority. +/// +/// +/// @param context +/// worker pool context whose workers' priorities are to be changed. +/// +/// @param prio +/// desired priority. 1 is the highest priority allowed. 255 is the lowest priority allowed. +/// +/// @return +/// 0 - success. +/// any other value - failure. 
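+///
+/// Typical call pattern for this pool, as an illustrative sketch (my_cb and
+/// my_data are placeholder names; my_cb is expected to call
+/// worker_pool_synctoken_jobdone() on the token reachable through its data
+/// pointer, per the contract documented above):
+///
+///   worker_pool_context_t pool = NULL;
+///   worker_synctoken_t token;
+///   if (AEE_SUCCESS == worker_pool_init(&pool)) {
+///       worker_pool_synctoken_init(&token, num_workers);
+///       for (unsigned int i = 0; i < num_workers; i++) {
+///           worker_pool_job_t job = { my_cb, &my_data[i] };
+///           worker_pool_submit(pool, job);
+///       }
+///       worker_pool_synctoken_wait(&token);
+///       worker_pool_deinit(&pool);
+///   }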
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Query the thread priority of the default worker threads. This will return
+///   the current priority for one of the workers, which are all created
+///   with the same priority. If a user callback has changed one or more worker threads independently,
+///   there is no guarantee on which worker's priority is returned by this function.
+///
+///
+/// @param context
+///   worker pool context whose workers' priorities are asked.
+///
+/// @param prio
+///   the current priority is returned here. 1 is the highest priority allowed. 255 is the lowest priority allowed.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int *prio);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Utility inline to atomically increment a variable. Useful in
+///   synchronizing jobs among worker threads, in cases where all
+///   job-related info can be determined by the job number.
+///
+///
+/// @param target
+///   pointer to the variable being incremented
+///
+/// @return
+///   the value after incrementing
+//---------------------------------------------------------------------------
+static inline unsigned int
+worker_pool_atomic_inc_return(unsigned int *target)
+{
+    unsigned int result;
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, #1)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target)
+        : "p0");
+    return result;
+}
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Utility inline to atomically decrement a variable.
+///
+///
+/// @param target
+///   pointer to the variable being decremented
+///
+/// @return
+///   the value after decrementing
+//---------------------------------------------------------------------------
+static inline unsigned int
+worker_pool_atomic_dec_return(unsigned int *target)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, #-1)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target)
+        : "p0");
+    return result;
+}
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Queries and returns the thread IDs of all the active threads in the worker pool.
+///
+///
+/// @param context
+///   worker pool context whose workers' IDs are asked.
+///
+/// @param threadIDs
+///   pointer to the array created by the user where thread IDs will be written to.
+///
+/// @return
+///   0 - success.
+/// 0E - Invalid parameter +//--------------------------------------------------------------------------- +WORKERPOOL_API AEEResult +worker_pool_retrieve_threadID(worker_pool_context_t context, unsigned int* threadIDs); +#ifdef __cplusplus +} +#endif + +#endif // #ifndef WORKER_H diff --git a/ggml/src/ggml-kompute/kompute b/ggml/src/ggml-kompute/kompute new file mode 160000 index 0000000000000..4565194ed7c32 --- /dev/null +++ b/ggml/src/ggml-kompute/kompute @@ -0,0 +1 @@ +Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306 diff --git a/models/t5-very-small-random-F32.gguf b/models/t5-very-small-random-F32.gguf new file mode 100644 index 0000000000000..fd386d88562d2 Binary files /dev/null and b/models/t5-very-small-random-F32.gguf differ diff --git a/prebuilts/Hexagon_SDK/.lock b/prebuilts/Hexagon_SDK/.lock new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h new file mode 100755 index 0000000000000..fdbfc1136d556 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h @@ -0,0 +1,50 @@ +//============================================================================= +// +// Copyright (c) 2020-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN CPU Common components + * + * This file defines versioning and other identification details + * and supplements QnnCommon.h for CPU backend + */ + +#ifndef QNN_CPU_COMMON_H +#define QNN_CPU_COMMON_H + +#include "QnnCommon.h" + +/// CPU Backend identifier +#define QNN_BACKEND_ID_CPU 3 + +/// CPU interface provider +#define QNN_CPU_INTERFACE_PROVIDER_NAME "CPU_QTI_AISW" + +// CPU API Version values +#define QNN_CPU_API_VERSION_MAJOR 1 +#define QNN_CPU_API_VERSION_MINOR 1 +#define QNN_CPU_API_VERSION_PATCH 0 + +// clang-format off +/// Macro to set Qnn_ApiVersion_t for CPU backend +#define QNN_CPU_API_VERSION_INIT \ + { \ + { \ + QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_MINOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_PATCH /*coreApiVersion.major*/ \ + }, \ + { \ + QNN_CPU_API_VERSION_MAJOR, /*backendApiVersion.major*/ \ + QNN_CPU_API_VERSION_MINOR, /*backendApiVersion.minor*/ \ + QNN_CPU_API_VERSION_PATCH /*backendApiVersion.patch*/ \ + } \ + } + +// clang-format on + +#endif // QNN_CPU_COMMON_H \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h new file mode 100755 index 0000000000000..750cfd0b501f1 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h @@ -0,0 +1,117 @@ +//============================================================================= +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN CPU component Graph API. 
+ * + * The interfaces in this file work with the top level QNN + * API and supplements QnnGraph.h for CPU backend + */ + +#ifndef QNN_CPU_GRAPH_H +#define QNN_CPU_GRAPH_H + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief This enum provides different CPU graph configuration + * options associated with QnnGraph + */ +typedef enum { + QNN_CPU_GRAPH_CONFIG_OPTION_OP_DEBUG_CALLBACK = 1, + QNN_CPU_GRAPH_CONFIG_OPTION_UNDEFINED = 0x7fffffff +} QnnCpuGraph_ConfigOption_t; + +/* @brief CallBack function pointer to be filled by user. + * This callback will be called after each op execution. + * Only outputTensor id and data buffer is valid, consumable. + * Memory is owned by BE which is valid throughout the callback. + * Client should not update any parameter and argument of opConfig. + * NULL tensor/buffer indicate invalid data buffer. + */ +typedef Qnn_ErrorHandle_t (*QnnCpuGraph_OpDebugCallback_t)(Qnn_OpConfig_t* opConfig, + void* callBackParam); + +/* @brief Structure to be filled by user. + * This structure will have callback function and callback reference data. + * Memory is owned by BE which is valid throughout the callback. + * Client should not update any parameter and argument of opConfig. + * NULL callback function indicate no debug option. + */ +typedef struct { + void* callBackParam; + QnnCpuGraph_OpDebugCallback_t cpuGraphOpDebugCallback; +} QnnCpuGraph_OpDebug_t; + +// clang-format off +/// QnnCpuGraph_OpDebug_t initializer macro +#define QNN_CPU_GRAPH_OP_DEBUG_INIT \ + { \ + NULL, /*callBackParam*/ \ + NULL /*cpuGraphOpDebugCallback*/ \ + } +// clang-format on + +//============================================================================= +// Public Functions +//============================================================================= + +//------------------------------------------------------------------------------ +// Implementation Definition +//------------------------------------------------------------------------------ + +/** + * @brief Structure describing the set of configurations supported by graph. + * Objects of this type are to be referenced through QnnGraph_CustomConfig_t. + * + * The struct has two fields - option and a union of corresponding config values + * Based on the option corresponding item in the union can be used to specify + * config. 
+ * Below is the map between QnnCpuGraph_ConfigOption_t and config value + * + * \verbatim embed:rst:leading-asterisk + * +----+------------------------------------------+------------------------------------+ + * | # | Config Option | Configuration Struct/value | + * +====+==========================================+====================================+ + * | 1 | QNN_CPU_GRAPH_CONFIG_DEBUG_CALLBACK | QnnCpuGraph_OpDebug_t | + * +----+------------------------------------------+------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnCpuGraph_ConfigOption_t option; + union UNNAMED { + QnnCpuGraph_OpDebug_t cpuGraphOpDebug; + }; +} QnnCpuGraph_CustomConfig_t; + +/// QnnCpuGraph_CustomConfig_t initializer macro +#define QNN_CPU_GRAPH_CUSTOM_CONFIG_INIT \ + { \ + QNN_CPU_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/ \ + { \ + QNN_CPU_GRAPH_OP_DEBUG_INIT /*cpuGraphOpDebugCallback*/ \ + } \ + } + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h new file mode 100755 index 0000000000000..97bdab8dfd3f9 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h @@ -0,0 +1,224 @@ +//============================================================================== +// +// Copyright (c) 2020-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** @file + * @brief CPU Operation Package component API + * + * Provides interface to interact with OpPackage libraries registered + * with the CPU backend. + */ + +#ifndef QNN_CPU_OP_PACKAGE_H +#define QNN_CPU_OP_PACKAGE_H + +#include "CPU/QnnCpuCommon.h" +#include "QnnGraph.h" +#include "QnnOpPackage.h" +#include "QnnTypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define QNN_CPUOPPACKAGE_TENSOR_DATA_FORMAT_FLAT_BUFFER 0 + +/** + * @brief A value representing a tensor data format. + */ +typedef uint32_t QnnCpuOpPackage_TensorDataFormat_t; + +/** + * @brief A value representing a profile data in ms. + */ +typedef double QnnCpuOpPackage_ProfileData_t; + +/** + * @brief An enum to specify a param type. + */ +typedef enum { + QNN_CPU_PARAMTYPE_SCALAR = 0, + QNN_CPU_PARAMTYPE_TENSOR = 1, + QNN_CPU_PARAMTYPE_STRING = 2, + // Unused, present to ensure 32 bits. + QNN_CPU_PARAMTYPE_UNDEFINED = 0xFFFFFFFF +} QnnCpuOpPackage_ParamType_t; + +/** + * @brief An enum to specify tensor data type. + */ +typedef enum { + QNN_CPU_DATATYPE_BOOL_8 = 0x0508, + QNN_CPU_DATATYPE_INT_8 = 0x0008, + QNN_CPU_DATATYPE_INT_32 = 0x0032, + QNN_CPU_DATATYPE_UINT_8 = 0x0108, + QNN_CPU_DATATYPE_UINT_32 = 0x0132, + QNN_CPU_DATATYPE_FLOAT_32 = 0x0232, + // Unused, present to ensure 32 bits. + QNN_CPU_DATATYPE_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_DataType_t; + +/** + * @brief An enum to specify logging level. + */ +typedef enum { + QNN_CPU_MSG_ERROR = 1, + QNN_CPU_MSG_DEBUG = 2, + QNN_CPU_MSG_LOW = 3, + QNN_CPU_MSG_MED = 4, + QNN_CPU_MSG_HIGH = 5, + // Unused, present to ensure 32 bits + QNN_CPU_MSG_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_MsgType_t; + +/** + * @brief An enum to specify the profiling type. 
+ */ +typedef enum { + QNN_CPU_PROFILE_BASIC = 1, + QNN_CPU_PROFILE_DETAILED = 2, + // Unused, present to ensure 32 bits + QNN_CPU_PROFILE_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_ProfileType_t; + +/** + * @brief A struct which defines the Global infrastructure. + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + // Message + void (*reportMessage)(QnnCpuOpPackage_MsgType_t msgType, const char* msg, ...); + + // Profile + void (*profile)(QnnCpuOpPackage_ProfileType_t profileType, + QnnCpuOpPackage_ProfileData_t timeInMsec); +} QnnCpuOpPackage_GlobalInfra_t; + +// clang-format off +/// QnnCpuOpPackage_GlobalInfra_t initializer macro +#define QNN_CPU_OP_PACKAGE_GLOBAL_INFRA_INIT \ + { \ + NULL, /*reportMessage*/ \ + NULL /*profile*/ \ + } +// clang-format on + +typedef Qnn_ErrorHandle_t (*QnnCpuOpPackage_OpImplFn_t)(void* opPkgNodeData); + +/** + * @brief A struct which defines the OpImpl definition. + */ +typedef struct _QnnOpPackage_OpImpl_t { + QnnCpuOpPackage_OpImplFn_t opImplFn; + void* userData; +} QnnCpuOpPackage_OpImpl_t; + +// clang-format off +/// QnnCpuOpPackage_OpImpl_t initializer macro +#define QNN_CPU_OP_PACKAGE_OPIMPL_INIT \ + { \ + NULL, /*kernelFn*/ \ + NULL /*userData*/ \ + } +// clang-format on + +/** + * @brief A struct which describes the properties of a tensor. + * + */ +typedef struct { + QnnCpuOpPackage_TensorDataFormat_t dataFormat; + QnnCpuOpPackage_DataType_t dataType; + uint32_t rank; + uint32_t* maxDimensions; + uint32_t* currentDimensions; + void* data; + Qnn_QuantizeParams_t quantizeParams; +} QnnCpuOpPackage_Tensor_t; + +// clang-format off +/// QnnCpuOpPackage_Tensor_t initializer macro +#define QNN_CPU_OP_PACKAGE_TENSOR_INIT \ + { \ + QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, /*dataFormat*/ \ + QNN_CPU_DATATYPE_UNDEFINED, /*dataType*/ \ + 0, /*rank*/ \ + NULL, /*maxDimensions*/ \ + NULL, /*currentDimensions*/ \ + NULL, /*data*/ \ + QNN_QUANTIZE_PARAMS_INIT /*quantizeParams*/ \ + } +// clang-format on + +/** + * @brief A struct which describes the parameters of a node. + * + */ +typedef struct { + QnnCpuOpPackage_ParamType_t type; + const char* name; + union { + double scalarParam; + const char* string; + QnnCpuOpPackage_Tensor_t* tensorParam; + }; +} QnnCpuOpPackage_Param_t; + +// clang-format off +/// QnnCpuOpPackage_Param_t initializer macro +#define QNN_CPU_OP_PACKAGE_PARAM_INIT \ + { \ + QNN_CPU_PARAMTYPE_UNDEFINED, /*type*/ \ + NULL, /*name*/ \ + { \ + 0 /*scalarParam*/ \ + } \ + } +// clang-format on + +/** + * @brief A struct which describes the node. + * + */ +typedef struct _QnnOpPackage_Node_t { + const char* name; + const char* packageName; + const char* typeName; + uint32_t numOfParams; + QnnCpuOpPackage_Param_t** params; + uint32_t numOfInputs; + QnnCpuOpPackage_Tensor_t** inputs; + uint32_t numOfOutputs; + QnnCpuOpPackage_Tensor_t** outputs; +} QnnCpuOpPackage_Node_t; + +// clang-format off +/// QnnCpuOpPackage_Node_t initializer macro +#define QNN_CPU_OP_PACKAGE_NODE_INIT \ + { \ + NULL, /*name*/ \ + NULL, /*packageName*/ \ + NULL, /*typeName*/ \ + 0, /*numOfParams*/ \ + NULL, /*params*/ \ + 0, /*numOfInputs*/ \ + NULL, /*inputs*/ \ + 0, /*numOfOutputs*/ \ + NULL /*outputs*/ \ + } +// clang-format on + +/** + * @brief Graph infrastructure. 
+ * + */ +typedef _QnnOpPackage_GraphInfrastructure_t QnnCpuOpPackage_GraphInfrastructure_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // QNN_CPU_OP_PACKAGE_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h new file mode 100755 index 0000000000000..e2b6c69dffbdf --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h @@ -0,0 +1,108 @@ +//============================================================================= +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN DSP component Backend API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnBackend.h for DSP backend + */ + +#ifndef QNN_DSP_BACKEND_H +#define QNN_DSP_BACKEND_H + +#include "QnnBackend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +//============================================================================= +// Public Functions +//============================================================================= + +//------------------------------------------------------------------------------ +// Implementation Definition +//------------------------------------------------------------------------------ + +// clang-format off + +/* @brief Enum describing the set of custom configs supported by DSP backend. +*/ +typedef enum { + /// The accelerator will always attempt to fold relu activation + /// into the immediate preceding convolution operation. This optimization + /// is correct when quantization ranges for convolution are equal or + /// subset of the Relu operation. For graphs, where this cannot be + /// guaranteed, the client should set this option to true + QNN_DSP_BACKEND_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 0, + /// The accelerator will always attempt to all Convolution + /// operation using HMX instructions. Convolution that have + /// short depth and/or weights that are not symmetric could + /// exhibit inaccurate results. In such cases, clients must + /// set this option to true to guarantee correctness of the operation + QNN_DSP_BACKEND_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF = 1, + /// Every APP side user process that uses a DSP via FastRPC + /// has a corresponding dynamic user process domain on the DSP side. + /// QNN by default opens RPC session as unsigned PD, + /// in case this option is set to true, + /// RPC session will be opened as signed PD (requires signed .so). 
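+  /// A non-normative sketch of requesting a signed PD through this option
+  /// (the cfg variable and its wiring into the QnnBackend custom config
+  /// list are assumed):
+  /// @code
+  /// QnnDspBackend_CustomConfig_t cfg = QNN_DSP_BACKEND_CUSTOM_CONFIG_INIT;
+  /// cfg.option = QNN_DSP_BACKEND_CONFIG_OPTION_USE_SIGNED_PROCESS_DOMAIN;
+  /// cfg.useSignedProcessDomain = true;
+  /// @endcode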
+ QNN_DSP_BACKEND_CONFIG_OPTION_USE_SIGNED_PROCESS_DOMAIN = 2, + /// set QnnDspBackend_DspArch_t for offline prepare mode + QNN_DSP_BACKEND_CONFIG_OPTION_ARCH = 3, + /// UNKNOWN enum option that must not be used + QNN_DSP_BACKEND_CONFIG_OPTION_UNKNOWN = 0x7fffffff +} QnnDspBackend_ConfigOption_t; + +typedef enum { + QNN_DSP_BACKEND_DSP_ARCH_NONE = 0, + QNN_DSP_BACKEND_DSP_ARCH_V65 = 65, + QNN_DSP_BACKEND_DSP_ARCH_V66 = 66, + QNN_DSP_BACKEND_DSP_ARCH_V68 = 68, + QNN_DSP_BACKEND_DSP_ARCH_V69 = 69, + QNN_DSP_BACKEND_DSP_ARCH_V73 = 73, + QNN_DSP_BACKEND_DSP_ARCH_UNKNOWN = 0x7fffffff +} QnnDspBackend_DspArch_t; + +/** + * @brief Structure describing the set of configurations supported by the backend. + * Objects of this type are to be referenced through QnnBackend_CustomConfig_t. + */ +typedef struct QnnDspBackend_CustomConfig { + QnnDspBackend_ConfigOption_t option; + union UNNAMED { + bool foldReluActivationIntoConvOff; + bool shortDepthConvOnHmxOff; + bool useSignedProcessDomain; + QnnDspBackend_DspArch_t arch; + }; +} QnnDspBackend_CustomConfig_t ; + +/// QnnDspBackend_CustomConfig_t initializer macro +#define QNN_DSP_BACKEND_CUSTOM_CONFIG_INIT \ + { \ + QNN_DSP_BACKEND_CONFIG_OPTION_UNKNOWN, /*option*/ \ + { \ + false /*foldReluActivationIntoConvOff*/ \ + } \ + } + +// clang-format on +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h new file mode 100755 index 0000000000000..8b5ad49d04d6e --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h @@ -0,0 +1,61 @@ +//============================================================================= +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+//
+//=============================================================================

+/** @file
+ *  @brief QNN DSP Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for DSP backend
+ */

+#ifndef QNN_DSP_COMMON_H
+#define QNN_DSP_COMMON_H

+#include "QnnCommon.h"

+/// DSP Backend identifier
+#define QNN_BACKEND_ID_DSP 5

+/// DSP interface provider
+#define QNN_DSP_INTERFACE_PROVIDER_NAME "DSP_QTI_AISW"

+// DSP API Version values
+#define QNN_DSP_API_VERSION_MAJOR 5
+#define QNN_DSP_API_VERSION_MINOR 0
+#define QNN_DSP_API_VERSION_PATCH 1

+// clang-format off

+/// Macro to set Qnn_ApiVersion_t for DSP backend
+#define QNN_DSP_API_VERSION_INIT                             \
+  {                                                          \
+    {                                                        \
+      QNN_API_VERSION_MAJOR,     /*coreApiVersion.major*/    \
+      QNN_API_VERSION_MINOR,     /*coreApiVersion.minor*/    \
+      QNN_API_VERSION_PATCH      /*coreApiVersion.patch*/    \
+    },                                                       \
+    {                                                        \
+      QNN_DSP_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_DSP_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_DSP_API_VERSION_PATCH  /*backendApiVersion.patch*/ \
+    }                                                        \
+  }

+// clang-format on

+// DSP Binary Version values
+#define QNN_DSP_BINARY_VERSION_MAJOR 1
+#define QNN_DSP_BINARY_VERSION_MINOR 0
+#define QNN_DSP_BINARY_VERSION_PATCH 0

+// DSP Context blob Version values
+#define QNN_DSP_CONTEXT_BLOB_VERSION_MAJOR 1
+#define QNN_DSP_CONTEXT_BLOB_VERSION_MINOR 0
+#define QNN_DSP_CONTEXT_BLOB_VERSION_PATCH 0

+#endif // QNN_DSP_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h
new file mode 100755
index 0000000000000..eecf62f5cbc02
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h
@@ -0,0 +1,46 @@
+//=============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================

+/** @file
+ *  @brief QNN DSP component Device API.
+ * + * The interfaces in this file work with the top level QNN + * API and supplements QnnDevice.h for DSP backend + */ +#ifndef QNN_DSP_DEVICE_H +#define QNN_DSP_DEVICE_H + +#include "QnnDevice.h" +#include "QnnDspPerfInfrastructure.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _QnnDevice_Infrastructure_t { + QnnDspPerfInfrastructure_CreatePowerConfigIdFn_t createPowerConfigId; + QnnDspPerfInfrastructure_DestroyPowerConfigIdFn_t destroyPowerConfigId; + QnnDspPerfInfrastructure_SetPowerConfigFn_t setPowerConfig; + QnnDspPerfInfrastructure_SetMemoryConfigFn_t setMemoryConfig; + QnnDspPerfInfrastructure_SetThreadConfigFn_t setThreadConfig; +} QnnDspDevice_Infrastructure_t; + +#define QNN_DSP_DEVICE_INFRASTRUCTURE_INIT \ + { \ + NULL, /*createPowerConfigId*/ \ + NULL, /*destroyPowerConfigId*/ \ + NULL, /*setPowerConfig*/ \ + NULL, /*setMemoryConfig*/ \ + NULL /*setThreadConfig*/ \ + } + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h new file mode 100755 index 0000000000000..dd1c5220c8721 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h @@ -0,0 +1,171 @@ +//============================================================================= +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** + * @file + * @brief QNN DSP component Graph API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnGraph.h for DSP backend + */ + +#ifndef QNN_DSP_GRAPH_H +#define QNN_DSP_GRAPH_H + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief This enum provides different DSP graph optimization + * options that can be used to finalize the graph + * for optimum performance. + */ +typedef enum { + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD = 1, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES = 2, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG = 3, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC = 4, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN = 0x7fffffff +} QnnDspGraph_OptimizationType_t; + +// clang-format off + +/** + * @brief Struct describing the set of optimization types + * and the values associated with each optimization type. 
+ * + * Below is the Map between QnnDspGraph_OptimizationType_t and allowable values: + * + * \verbatim embed:rst:leading-asterisk + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | # | OptimizationType option | Allowable values | + * +====+============================================================+===========================================================+ + * | 1 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 2 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 3 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG | Defines the optimization strategy used by the HTP backend | + * | | | | + * | | | 1 = Faster preparation time, less optimal graph | + * | | | | + * | | | 2 = More optimal graph but may take longer to prepare | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 4 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnDspGraph_OptimizationType_t type; + float floatValue; +} QnnDspGraph_OptimizationOption_t; + +/// QnnDspGraph_OptimizationOption_t initializer macro +#define QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT \ + { \ + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/ \ + 0.0f /*floatValue*/ \ + } +// clang-format on + +/** + * @brief This enum provides different DSP graph configuration + * options associated with QnnGraph + */ +typedef enum { + QNN_DSP_GRAPH_CONFIG_OPTION_OPTIMIZATION = 1, + QNN_DSP_GRAPH_CONFIG_OPTION_ENCODING = 2, + QNN_DSP_GRAPH_CONFIG_OPTION_PRIORITY = 3, + QNN_DSP_GRAPH_CONFIG_OPTION_PRECISION = 4, + QNN_DSP_GRAPH_CONFIG_OPTION_UNKNOWN = 0x7fffffff +} QnnDspGraph_ConfigOption_t; + +typedef enum { + QNN_DSP_GRAPH_ENCODING_DYNAMIC = 1, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_DYNAMIC = QNN_DSP_GRAPH_ENCODING_DYNAMIC, + QNN_DSP_GRAPH_ENCODING_STATIC = 2, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_STATIC = QNN_DSP_GRAPH_ENCODING_STATIC, + QNN_DSP_GRAPH_ENCODING_UNKNOWN = 0x7fffffff, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_UNKNOW = QNN_DSP_GRAPH_ENCODING_UNKNOWN +} QnnDspGraph_Encoding_t; + +//============================================================================= +// Public Functions +//============================================================================= + +//------------------------------------------------------------------------------ +// Implementation Definition +//------------------------------------------------------------------------------ + +// clang-format off + +/** + * @brief Structure describing the set of configurations supported by graph. + * Objects of this type are to be referenced through QnnGraph_CustomConfig_t. + * + * The struct has two fields - option and a union of corresponding config values + * Based on the option corresponding item in the union can be used to specify + * config. 
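+ *
+ * For instance (an illustrative sketch, with variable names assumed), a
+ * client asking the backend for the more aggressive finalize optimization
+ * could populate:
+ * @code
+ * QnnDspGraph_OptimizationOption_t opt = QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT;
+ * opt.type       = QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+ * opt.floatValue = 2.0f;  // more optimal graph, longer prepare time
+ * QnnDspGraph_CustomConfig_t cfg = QNN_DSP_GRAPH_CUSTOM_CONFIG_INIT;
+ * cfg.option             = QNN_DSP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
+ * cfg.optimizationOption = opt;
+ * @endcode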
+ * + * Below is the Map between QnnDspGraph_ConfigOption_t and config value + * + * \verbatim embed:rst:leading-asterisk + * +----+------------------------------------------+------------------------------------+ + * | # | Config Option | Configuration Struct/value | + * +====+==========================================+====================================+ + * | 1 | QNN_DSP_GRAPH_CONFIG_OPTION_OPTIMIZATION | QnnDspGraph_OptimizationOption_t | + * +----+------------------------------------------+------------------------------------+ + * | 2 | QNN_DSP_GRAPH_CONFIG_OPTION_ENCODING | QnnDspGraph_Encoding_t | + * +----+------------------------------------------+------------------------------------+ + * | 3 | QNN_DSP_GRAPH_CONFIG_OPTION_PRECISION | Qnn_Precision_t | + * +----+------------------------------------------+------------------------------------+ + * | 4 | QNN_DSP_GRAPH_CONFIG_OPTION_PRIORITY | Qnn_Priority_t | + * +----+------------------------------------------+------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnDspGraph_ConfigOption_t option; + union { + QnnDspGraph_OptimizationOption_t optimizationOption; + QnnDspGraph_Encoding_t encoding; + Qnn_Priority_t priority; + Qnn_Precision_t precision; + }; +} QnnDspGraph_CustomConfig_t; + +// clang-format on +/// QnnDspGraph_CustomConfig_t initializer macro +#define QNN_DSP_GRAPH_CUSTOM_CONFIG_INIT \ + { \ + QNN_DSP_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/ \ + { \ + QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \ + } \ + } + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h new file mode 100755 index 0000000000000..c8760ecb6b798 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h @@ -0,0 +1,42 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef QNN_DSP_OP_PACKAGE_HPP +#define QNN_DSP_OP_PACKAGE_HPP + +#include "QnnOpPackage.h" +#include "QnnTypes.h" +#include "Udo/UdoImplDsp.h" + +/** + * @brief A struct which defines the Global infrastructure. + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + /// include the UdoMalloc, UdoFree and so on + Udo_DspGlobalInfrastructure_t* dspGlobalInfra; +} QnnDspOpPackage_GlobalInfrastructure_t; + +/** + * @brief A struct which defines the operation info. 
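+ *
+ *        A hypothetical sketch of one populated entry (the op name and the
+ *        MyCustomRelu_execute symbol are assumptions for illustration only):
+ *        @code
+ *        QnnDspOpPackage_OperationInfo_t info;
+ *        info.opType            = (char*)"MyCustomRelu";
+ *        info.numOfStaticParams = 0;
+ *        info.numOfInputs       = 1;
+ *        info.numOfOutputs      = 1;
+ *        info.executeOp         = MyCustomRelu_execute;
+ *        @endcode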
+ */ +typedef struct _QnnOpPackage_OperationInfo_t { + char* opType; + uint32_t numOfStaticParams; + uint32_t numOfInputs; + uint32_t numOfOutputs; + + Udo_CreateOpFactoryFunction_t createOpFactory; + Udo_CreateOperationFunction_t createOperation; + Udo_ExecuteOpFunction_t executeOp; + Udo_ReleaseOpFunction_t releaseOp; + Udo_ReleaseOpFactoryFunction_t releaseOpFactory; + Udo_ValidateOperationFunction_t validateOp; + Udo_QueryOperationFunction_t queryOp; +} QnnDspOpPackage_OperationInfo_t; + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h new file mode 100755 index 0000000000000..c9b1aa3020b9e --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h @@ -0,0 +1,448 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** @file + * @brief QNN DSP component Performance Infrastructure API + * + * Provides interface to the client to control performance and system + * settings of the QNN DSP Accelerator + */ + +#ifndef QNN_DSP_PERF_INFRASTRUCTURE_H +#define QNN_DSP_PERF_INFRASTRUCTURE_H + +#include "QnnCommon.h" +#include "QnnTypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// max rpc polling time allowed - 9999 us +#define QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME 9999 + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief QNN DSP PerfInfrastructure API result / error codes. + * + */ +typedef enum { + QNN_DSP_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE, + //////////////////////////////////////////////////////////////////////// + + QNN_DSP_PERF_INFRASTRUCTURE_NO_ERROR = QNN_SUCCESS, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 4, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_FAILED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 5, + + //////////////////////////////////////////////////////////////////////// + QNN_DSP_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE, + /// UNDEFINED value that must not be used by client + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNDEFINED = 0x7fffffff +} QnnDspPerfInfrastructure_Error_t; + +/** + * @brief Used to allow client start (non-zero value) or stop participating + * (zero value) in DCVS + * + */ +typedef uint32_t QnnDspPerfInfrastructure_DcvsEnable_t; + +/** + * @brief Allows client to set up the sleep latency in microseconds + * + */ +typedef uint32_t QnnDspPerfInfrastructure_SleepLatency_t; + +/** + * @brief Allows client to disable sleep or low power modes. 
+ * Pass a non-zero value to disable sleep in DSP + * + */ +typedef uint32_t QnnDspPerfInfrastructure_SleepDisable_t; + +/** + * @brief sets the minimum size by which user heap should grow + * when heap is exhausted. This API is expected to be + * called only once per backend and has a process wide impact + * + * Grow size provided in bytes and defaults to 16MB + */ +typedef uint32_t QnnDspPerfInfrastructure_MemGrowSize_t; + +/** + * @brief sets the vtcm size to use for graphs that + * are prepared offline. This API should be set up + * before users can finalize a graph offline. It allows + * the QNN DSP backend to configure the serialized + * context for the available vtcm on target + * + * VTCM size provided in MB and does not have a default + */ +typedef uint32_t QnnDspPerfInfrastructure_VtcmSize_t; + +/** + * @brief sets the number of HVX threads for QNN DSP + */ +typedef uint32_t QnnDspPerfInfrastructure_HvxThreadNumber_t; + +/** + * @brief These are the different voltage corners that can + * be requested by the client to influence the voting scheme + * for DCVS + * + */ +typedef enum { + /// Maps to HAP_DCVS_VCORNER_DISABLE. + /// Disable setting up voltage corner + DCVS_VOLTAGE_CORNER_DISABLE = 0x10, + /// Maps to HAP_DCVS_VCORNER_SVS2. + /// Set voltage corner to minimum value supported on platform + DCVS_VOLTAGE_VCORNER_MIN_VOLTAGE_CORNER = 0x20, + /// Maps to HAP_DCVS_VCORNER_SVS2. + /// Set voltage corner to SVS2 value for the platform + DCVS_VOLTAGE_VCORNER_SVS2 = 0x30, + /// Maps to HAP_DCVS_VCORNER_SVS. + /// Set voltage corner to SVS value for the platform + DCVS_VOLTAGE_VCORNER_SVS = 0x40, + /// Maps to HAP_DCVS_VCORNER_SVS_PLUS. + /// Set voltage corner to SVS_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_SVS_PLUS = 0x50, + /// Maps to HAP_DCVS_VCORNER_NOM. + /// Set voltage corner to NOMINAL value for the platform + DCVS_VOLTAGE_VCORNER_NOM = 0x60, + /// Maps to HAP_DCVS_VCORNER_NOM_PLUS. + /// Set voltage corner to NOMINAL_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_NOM_PLUS = 0x70, + /// Maps to HAP_DCVS_VCORNER_TURBO. + /// Set voltage corner to TURBO value for the platform + DCVS_VOLTAGE_VCORNER_TURBO = 0x80, + /// Maps to HAP_DCVS_VCORNER_TURBO_PLUS. + /// Set voltage corner to TURBO_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_TURBO_PLUS = 0x90, + /// Maps to HAP_DCVS_VCORNER_MAX. + /// Set voltage corner to maximum value supported on the platform + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER = 0xA0, + /// UNKNOWN value that must not be used by client + DCVS_VOLTAGE_VCORNER_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_VoltageCorner_t; + +/** + * @brief This enum defines all the possible power mode + * that a client can set to influence DCVS mode + */ +typedef enum { + /// Maps to HAP_DCVS_V2_ADJUST_UP_DOWN. + /// Allows for DCVS to adjust up and down + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN = 0x1, + /// Maps to HAP_DCVS_V2_ADJUST_ONLY_UP. + /// Allows for DCVS to adjust up only + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_ONLY_UP = 0x2, + /// Maps to HAP_DCVS_V2_POWER_SAVER_MODE. + /// Higher thresholds for power efficiency + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_MODE = 0x4, + /// Maps to HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE. + /// Higher thresholds for power efficiency with faster ramp down + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_AGGRESSIVE_MODE = 0x8, + /// Maps to HAP_DCVS_V2_PERFORMANCE_MODE. 
+  /// Lower thresholds for maximum performance
+  QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE = 0x10,
+  /// Maps to HAP_DCVS_V2_DUTY_CYCLE_MODE.
+  /// The value below applies only to HVX clients:
+  ///   - For streaming class clients:
+  ///     - detects periodicity based on HVX usage
+  ///     - lowers clocks in the no HVX activity region of each period.
+  ///   - For compute class clients:
+  ///     - lowers clocks when no HVX activity is detected and brings clocks
+  ///       back up when HVX activity is detected again.
+  ///     - Latency involved in bringing up the clock will be at max 1 to 2 ms.
+  QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_DUTY_CYCLE_MODE = 0x20,
+  /// UNKNOWN value that must not be used by client
+  QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnDspPerfInfrastructure_PowerMode_t;

+/**
+ * @brief This enum defines all the possible performance
+ *        options in Dsp Performance Infrastructure that
+ *        relate to setting up of power levels
+ */
+typedef enum {
+  /// config enum implies the usage of dcvsEnableConfig struct. For dcvs v2, if not provided, will
+  /// be set to false
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_ENABLE = 1,
+  /// config enum implies the usage of sleepLatencyConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_SLEEP_LATENCY = 2,
+  /// config enum implies the usage of sleepDisableConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_SLEEP_DISABLE = 3,
+  /// config enum implies the usage of dcvsPowerModeConfig struct. If not provided, power save mode
+  /// will be used
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_POWER_MODE = 4,
+  /// config enum implies the usage of dcvsVoltageCornerConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_VOLTAGE_CORNER = 5,
+  /// config enum implies the usage of busVoltageCornerConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_BUS_VOLTAGE_CORNER = 6,
+  /// config enum implies the usage of coreVoltageCornerConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_CORE_VOLTAGE_CORNER = 7,
+  /// config enum implies the usage of rpcControlLatencyConfig struct
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY = 9,
+  /// config enum implies the usage of rpcPollingTimeConfig struct
+  /// this config is only supported on V69 and later
+  /// if enabled, this config is applied to the entire process
+  /// max allowed is QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME us
+  QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME = 10,
+  /// config HMX timeout interval in us. The HMX is turned off after the set interval
+  /// time if there is no interaction with it after an inference is finished.
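+  /// A non-normative sketch (variable name assumed): keeping HMX powered for
+  /// one millisecond after each inference would be expressed as
+  /// @code
+  /// QnnDspPerfInfrastructure_PowerConfig_t hmx = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
+  /// hmx.config = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_TIMEOUT_INTERVAL_US;
+  /// hmx.hmxTimeoutIntervalUsConfig = 1000;  // microseconds
+  /// @endcode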
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_TIMEOUT_INTERVAL_US = 11, + /// UNKNOWN config option which must not be used + QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_PowerConfigOption_t; + +/** + * @brief Allows client to set up the RPC control latency in microseconds + * + */ +typedef uint32_t QnnDspPerfInfrastructure_RpcControlLatency_t; + +/** + * @brief Allows client to set up the RPC polling time in microseconds + */ +typedef uint32_t QnnDspPerfInfrastructure_RpcPollingTime_t; + +/** + * @brief Allows client to set up the HMX timeout interval in microseconds + */ +typedef uint32_t QnnDspPerfInfrastructure_HmxTimeoutIntervalUs_t; + +/** + * @brief This struct provides performance infrastructure configuration + * associated with setting up of power levels + */ +typedef struct { + QnnDspPerfInfrastructure_PowerConfigOption_t config; + union { + QnnDspPerfInfrastructure_DcvsEnable_t dcvsEnableConfig; + QnnDspPerfInfrastructure_SleepLatency_t sleepLatencyConfig; + QnnDspPerfInfrastructure_SleepDisable_t sleepDisableConfig; + QnnDspPerfInfrastructure_PowerMode_t dcvsPowerModeConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_RpcControlLatency_t rpcControlLatencyConfig; + QnnDspPerfInfrastructure_RpcPollingTime_t rpcPollingTimeConfig; + QnnDspPerfInfrastructure_HmxTimeoutIntervalUs_t hmxTimeoutIntervalUsConfig; + }; +} QnnDspPerfInfrastructure_PowerConfig_t; + +/// QnnDspPerfInfrastructure_PowerConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*dcvsEnableConfig*/ \ + } \ + } + +/** + * @brief This enum defines all the possible performance + * options in Dsp Performance Infrastructure that + * relate to system memory settings + */ +typedef enum { + /// sets memory grow size + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE = 1, + /// set the size of VTCM configuration (in MB) to use + /// This setting is applicable only for off target usage. 
+ /// For on-target usage, refer QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_USAGE_FACTOR + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_SIZE = 2, + /// set the vtcm usage factor on-target + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_USAGE_FACTOR = 3, + /// UNKNOWN config option that must not be used + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_MemoryConfigOption_t; + +/** + * @brief This enum defines all the possible performance + * options in Dsp Performance Infrastructure that + * relate to thread settings + */ +typedef enum { + /// sets number of HVX threads + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_NUMBER_OF_HVX_THREADS = 1, + /// UNKNOWN config option that must not be used + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_ThreadConfigOption_t; + +/** + * @brief This enum defines all the possible vtcm + * usage configuration. These settings apply only + * for on-target libraries + * + */ +typedef enum { + /// use all the vtcm available on target + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_FULL = 1, + /// use bare minimal vtcm available on target. This is + /// not supported in the current release. + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_MIN = 2, + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_VtcmUsageFactor_t; + +/** + * @brief Provides performance infrastructure configuration + * options that are memory specific + */ +typedef struct { + QnnDspPerfInfrastructure_MemoryConfigOption_t config; + union { + QnnDspPerfInfrastructure_MemGrowSize_t memGrowSizeConfig; + QnnDspPerfInfrastructure_VtcmSize_t vtcmSizeInMB; + QnnDspPerfInfrastructure_VtcmUsageFactor_t vtcmUsageConfig; + }; +} QnnDspPerfInfrastructure_MemoryConfig_t; + +/// QnnDspPerfInfrastructure_MemoryConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*memGrowSizeConfig*/ \ + } \ + } + +/** + * @brief Provides performance infrastructure configuration + * options that are thread specific + */ +typedef struct { + QnnDspPerfInfrastructure_ThreadConfigOption_t config; + union { + QnnDspPerfInfrastructure_HvxThreadNumber_t numHvxThreads; + }; +} QnnDspPerfInfrastructure_ThreadConfig_t; + +/// QnnDspPerfInfrastructure_ThreadConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*numHvxThreads*/ \ + } \ + } + +//============================================================================= +// API Methods +//============================================================================= + +/** + * @brief This API allows client to create power configuration id that + * has to be used to set different performance modes. + * Power configuration id has to be destroyed by client when not needed. + * + * @param[out] powerConfigId Pointer to power configuration id to be created. + * + * + * @return Error code + * \n QNN_SUCCESS: No error encountered + * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration + * id is NULL + */ +typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_CreatePowerConfigIdFn_t)( + uint32_t* powerConfigId); + +/** + * @brief This API allows client to destroy power configuration id. + * + * @param[in] powerConfigId A power configuration id to be destroyed. 
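+ *
+ * A non-normative end-to-end sketch (dspInfra is assumed to be a
+ * QnnDspDevice_Infrastructure_t pointer obtained through the device
+ * infrastructure query):
+ * @code
+ * uint32_t cfgId = 0;
+ * dspInfra->createPowerConfigId(&cfgId);
+ * QnnDspPerfInfrastructure_PowerConfig_t cfg = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
+ * cfg.config           = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_ENABLE;
+ * cfg.dcvsEnableConfig = 0;  // opt out of DCVS for stable clocks
+ * const QnnDspPerfInfrastructure_PowerConfig_t* cfgs[] = {&cfg, NULL};
+ * dspInfra->setPowerConfig(cfgId, cfgs);
+ * // ... run inferences ...
+ * dspInfra->destroyPowerConfigId(cfgId);
+ * @endcode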
+ *
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ *            id does not exist
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_DestroyPowerConfigIdFn_t)(
+    uint32_t powerConfigId);

+/**
+ * @brief This API allows client to set up system power configuration that
+ *        will enable different performance modes. This API uses the
+ *        HAP_power_dcvs_v3_payload struct to configure HAP power parameters.
+ *        For a detailed description of the HAP power parameters, refer to the
+ *        Hexagon SDK HAP_power_dcvs_v3_payload documentation.
+ *
+ * @param[in] powerConfigId A power client id to associate calls to system
+ *            power settings. A value of 0 implies a NULL power client id
+ *            and can override every other setting in the user process. To
+ *            enable power settings for multiple clients in the same
+ *            process, use a non-zero power client id.
+ *
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config option for performance configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ *            does not exist
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t powerConfigId, const QnnDspPerfInfrastructure_PowerConfig_t** config);

+/**
+ * @brief This API allows clients to set up configuration associated with
+ *        system memory
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config option for system memory configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetMemoryConfigFn_t)(
+    const QnnDspPerfInfrastructure_MemoryConfig_t** config);

+/**
+ * @brief This API allows clients to set up configuration for threads
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config option for thread configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @note This function should be called after QnnBackend_initialize and
+ *       before Context and Graph calls
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG if invalid
+ *            config or value passed
+ *         \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if config is NULL
+ *         \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT if unable to set the
+ *            settings in DSP
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetThreadConfigFn_t)(
+    const QnnDspPerfInfrastructure_ThreadConfig_t** config);

+#ifdef __cplusplus
+}  // extern "C"
+#endif

+#endif  // QNN_DSP_PERF_INFRASTRUCTURE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h
new file mode 100755
index 0000000000000..04c1897aa7e18
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h
@@ -0,0 +1,244 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================

+/**
+ *  @file
+ *  @brief QNN DSP Profile component API.
+ *
+ *         Requires DSP backend to be initialized.
+ *         Should be used with the QnnProfile API but has DSP backend
+ *         specific definition for different QnnProfile data structures
+ *
+ */

+#ifndef QNN_DSP_PROFILE_H
+#define QNN_DSP_PROFILE_H

+#include "QnnProfile.h"

+#ifdef __cplusplus
+extern "C" {
+#endif

+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary host rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_RPC_TIME_MICROSEC 1002

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the DSP processor
+ *        when client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary dsp rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_DSP_RPC_TIME_MICROSEC 1003

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to create the context on the
+ *        accelerator when client invokes QnnContext_createFromBinary.
+ *        The value returned is time in microseconds.
+ *
+ * @note context load binary accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize host rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_RPC_TIME_MICROSEC 2001

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the DSP processor
+ *        when client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize dsp rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_DSP_RPC_TIME_MICROSEC 2002

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to finalizing the graph on the accelerator
+ *        when client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
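+ *
+ *        These DSP-specific event types surface through the generic
+ *        QnnProfile getters; a hedged sketch (handle and variable names
+ *        assumed, API per QnnProfile.h):
+ *        @code
+ *        const QnnProfile_EventId_t* events = NULL;
+ *        uint32_t numEvents = 0;
+ *        QnnProfile_getEvents(profileHandle, &events, &numEvents);
+ *        QnnProfile_EventData_t data = QNN_PROFILE_EVENT_DATA_INIT;
+ *        QnnProfile_getEventData(events[0], &data);  // data.value: microseconds
+ *        @endcode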
+ *
+ * @note graph execute host rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_RPC_TIME_MICROSEC 3001

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the DSP processor
+ *        when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph execute dsp rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_DSP_RPC_TIME_MICROSEC 3002

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the number of processor cycles taken.
+ *
+ * @note graph execute accelerator time may be available only on
+ *       QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE.
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time taken in microseconds
+ *
+ * @note graph execute accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to time taken for miscellaneous work, i.e. time
+ *        that cannot be attributed to a node but is still needed to
+ *        execute the graph on the accelerator. This occurs when client invokes
+ *        QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time taken in microseconds
+ *
+ * @note graph execute misc accelerator time is available only on
+ *       QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to time taken for a graph yield instance to
+ *        release all its resources to the other graph.
+ *        The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RELEASE_TIME 3006

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to time a graph spends waiting for a higher
+ *        priority graph to finish execution.
+ *        The value returned is time taken in microseconds
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_WAIT_TIME 3007

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to time a graph spends re-acquiring resources
+ *        and restoring vtcm.
+ *        The value returned is time taken in microseconds
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RESTORE_TIME 3008

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the number of times that a yield occurred
+ *        during execution
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_COUNT 3009

+/**
+ * @brief QnnProfile_EventType_t definition for time a graph waits to get
+ *        VTCM. This should be constant UNLESS we need another graph to yield.
+ *        The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_VTCM_ACQUIRE_TIME 3010

+/**
+ * @brief QnnProfile_EventType_t definition for time a graph waits to get
+ *        HMX + HVX, and turn them all on.
+ *        The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_RESOURCE_POWER_UP_TIME 3011

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnContext_free, which in consequence
+ *        deinitializes the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit host rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_RPC_TIME_MICROSEC 4001

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the DSP processor
+ *        when client invokes QnnContext_free, which in consequence
+ *        deinitializes the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit dsp rpc time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_DSP_RPC_TIME_MICROSEC 4002

+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to deinitialize the graph on the
+ *        accelerator when client invokes QnnContext_free.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph deinit accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003

+#ifdef __cplusplus
+}
+#endif

+#endif // QNN_DSP_PROFILE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h
new file mode 100755
index 0000000000000..39669338e35f8
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h
@@ -0,0 +1,30 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================

+#ifndef QNN_DSP_PROPERTY_H
+#define QNN_DSP_PROPERTY_H

+#include "QnnProperty.h"

+#ifdef __cplusplus
+extern "C" {
+#endif

+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief Property key for determining whether a backend supports unsigned PD.
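+ *
+ *        A hedged usage sketch (per the capability API in QnnProperty.h):
+ *        @code
+ *        if (QNN_PROPERTY_SUPPORTED ==
+ *            QnnProperty_hasCapability(QNN_PROPERTY_CUSTOM_DSP_UNSIGNED_PD_SUPPORT)) {
+ *          // backend supports running in an unsigned PD
+ *        }
+ *        @endcode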
+ */ +#define QNN_PROPERTY_CUSTOM_DSP_UNSIGNED_PD_SUPPORT QNN_PROPERTY_GROUP_CUSTOM + 1 + +#ifdef __cplusplus +} +#endif + +#endif // QNN_DSP_PROPERTY_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h new file mode 100755 index 0000000000000..942e5997ab5ff --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h @@ -0,0 +1,509 @@ +//============================================================================== +// +// Copyright (c) 2019-2021 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef SNPE_UDO_BASE_H +#define SNPE_UDO_BASE_H + +#include + +// Provide values to use for API version. +#define API_VERSION_MAJOR 1 +#define API_VERSION_MINOR 6 +#define API_VERSION_TEENY 0 + +/** @addtogroup c_plus_plus_apis C++ +@{ */ + +// Defines a bitmask of enum values. +typedef uint32_t SnpeUdo_Bitmask_t; +typedef SnpeUdo_Bitmask_t Udo_Bitmask_t; + +// A string of characters, rather than an array of bytes. +// Assumed to be UTF-8. +typedef char* SnpeUdo_String_t; +typedef SnpeUdo_String_t Udo_String_t; + +// The maximum allowable length of a SnpeUdo_String_t in bytes, +// including null terminator. SNPE will truncate strings longer +// than this. +#define SNPE_UDO_MAX_STRING_SIZE 1024 + +/** + * An enum which holds the various error types. + * The error types are divided to classes : + * 0 - 99 : generic errors + * 100 - 200 : errors related to configuration + * + */ +typedef enum +{ + /// No Error + SNPE_UDO_NO_ERROR = 0, UDO_NO_ERROR = 0, + /// Unsupported value for core type + SNPE_UDO_WRONG_CORE = 1, UDO_WRONG_CORE = 1, + /// Invalid attribute/argument passed into UDO API + SNPE_UDO_INVALID_ARGUMENT = 2, UDO_INVALID_ARGUMENT = 2, + /// Unsupported feature error + SNPE_UDO_UNSUPPORTED_FEATURE = 3, UDO_UNSUPPORTED_FEATURE = 3, + /// Error relating to memory allocation + SNPE_UDO_MEM_ALLOC_ERROR = 4, UDO_MEM_ALLOC_ERROR = 4, + /* Configuration Specific errors */ + /// No op with given attributes available in library + SNPE_UDO_WRONG_OPERATION = 100, UDO_WRONG_OPERATION = 100, + /// Unsupported value for core type in UDO configuration + SNPE_UDO_WRONG_CORE_TYPE = 101, UDO_WRONG_CORE_TYPE = 101, + /// Wrong number of params in UDO definition + SNPE_UDO_WRONG_NUM_OF_PARAMS = 102, UDO_WRONG_NUM_OF_PARAMS = 102, + /// Wrong number of dimensions for tensor(s) in UDO definition + SNPE_UDO_WRONG_NUM_OF_DIMENSIONS = 103, UDO_WRONG_NUM_OF_DIMENSIONS = 103, + /// Wrong number of input tensors in UDO definition + SNPE_UDO_WRONG_NUM_OF_INPUTS = 104, UDO_WRONG_NUM_OF_INPUTS = 104, + /// Wrong number of output tensors in UDO definition + SNPE_UDO_WRONG_NUM_OF_OUTPUTS = 105, UDO_WRONG_NUM_OF_OUTPUTS = 105, + SNPE_UDO_PROGRAM_CACHE_NOT_FOUND = 106, UDO_PROGRAM_CACHE_NOT_FOUND = 106, + SNPE_UDO_UNKNOWN_ERROR = 0xFFFFFFFF, UDO_UNKNOWN_ERROR = 0xFFFFFFFF +} SnpeUdo_ErrorType_t; + +typedef SnpeUdo_ErrorType_t Udo_ErrorType_t; + +/** + * An enum which holds the various data types. + * Designed to be used as single values or combined into a bitfield parameter + * (0x1, 0x2, 0x4, etc) + * \n FIXED_XX types are targeted for data in tensors. 
+ * \n UINT / INT types are targeted for scalar params + */ +typedef enum +{ + /// data type: 16-bit floating point + SNPE_UDO_DATATYPE_FLOAT_16 = 0x01, UDO_DATATYPE_FLOAT_16 = 0x01, + /// data type: 32-bit floating point + SNPE_UDO_DATATYPE_FLOAT_32 = 0x02, UDO_DATATYPE_FLOAT_32 = 0x02, + /// data type: 4-bit fixed point + SNPE_UDO_DATATYPE_FIXED_4 = 0x04, UDO_DATATYPE_FIXED_4 = 0x04, + /// data type: 8-bit fixed point + SNPE_UDO_DATATYPE_FIXED_8 = 0x08, UDO_DATATYPE_FIXED_8 = 0x08, + /// data type: 16-bit fixed point + SNPE_UDO_DATATYPE_FIXED_16 = 0x10, UDO_DATATYPE_FIXED_16 = 0x10, + /// data type: 32-bit fixed point + SNPE_UDO_DATATYPE_FIXED_32 = 0x20, UDO_DATATYPE_FIXED_32 = 0x20, + /// data type: 8-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_8 = 0x100, UDO_DATATYPE_UINT_8 = 0x100, + /// data type: 16-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_16 = 0x200, UDO_DATATYPE_UINT_16 = 0x200, + /// data type: 32-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_32 = 0x400, UDO_DATATYPE_UINT_32 = 0x400, + /// data type: 8-bit signed integer + SNPE_UDO_DATATYPE_INT_8 = 0x1000, UDO_DATATYPE_INT_8 = 0x1000, + /// data type: 16-bit signed integer + SNPE_UDO_DATATYPE_INT_16 = 0x2000, UDO_DATATYPE_INT_16 = 0x2000, + /// data type: 32-bit signed integer + SNPE_UDO_DATATYPE_INT_32 = 0x4000, UDO_DATATYPE_INT_32 = 0x4000, + SNPE_UDO_DATATYPE_LAST = 0xFFFFFFFF, UDO_DATATYPE_LAST = 0xFFFFFFFF +} SnpeUdo_DataType_t; + +typedef SnpeUdo_DataType_t Udo_DataType_t; + +/** + * An enum which holds the various layouts. + * Designed to be used as single values or combined into a bitfield parameter + * (0x1, 0x2, 0x4, etc) + */ +typedef enum +{ + /// data layout (4D): NHWC (batch-height-width-channel) + SNPE_UDO_LAYOUT_NHWC = 0x01, UDO_LAYOUT_NHWC = 0x01, + /// data layout (4D): NCHW (batch-channel-height-width) + SNPE_UDO_LAYOUT_NCHW = 0x02, UDO_LAYOUT_NCHW = 0x02, + /// data layout (5D): NDHWC (batch-dimension-height-width-channel) + SNPE_UDO_LAYOUT_NDHWC = 0x04, UDO_LAYOUT_NDHWC = 0x04, + SNPE_UDO_LAYOUT_GPU_OPTIMAL1 = 0x08, UDO_LAYOUT_GPU_OPTIMAL1 = 0x08, + SNPE_UDO_LAYOUT_GPU_OPTIMAL2 = 0x10, UDO_LAYOUT_GPU_OPTIMAL2 = 0x10, + SNPE_UDO_LAYOUT_DSP_OPTIMAL1 = 0x11, UDO_LAYOUT_DSP_OPTIMAL1 = 0x11, + SNPE_UDO_LAYOUT_DSP_OPTIMAL2 = 0x12, UDO_LAYOUT_DSP_OPTIMAL2 = 0x12, + // Indicates no data will be allocated for this tensor. + // Used to specify optional inputs/outputs positionally. + SNPE_UDO_LAYOUT_NULL = 0x13, UDO_LAYOUT_NULL = 0x13, + SNPE_UDO_LAYOUT_LAST = 0xFFFFFFFF, UDO_LAYOUT_LAST = 0xFFFFFFFF +} SnpeUdo_TensorLayout_t; + +typedef SnpeUdo_TensorLayout_t Udo_TensorLayout_t; + +/** + * An enum which holds the UDO library Core type . 
+ * Designed to be used as single values or combined into a bitfield parameter + * (0x1, 0x2, 0x4, etc) + */ +typedef enum +{ + /// Library target IP Core is undefined + SNPE_UDO_CORETYPE_UNDEFINED = 0x00, UDO_CORETYPE_UNDEFINED = 0x00, + /// Library target IP Core is CPU + SNPE_UDO_CORETYPE_CPU = 0x01, UDO_CORETYPE_CPU = 0x01, + /// Library target IP Core is GPU + SNPE_UDO_CORETYPE_GPU = 0x02, UDO_CORETYPE_GPU = 0x02, + /// Library target IP Core is DSP + SNPE_UDO_CORETYPE_DSP = 0x04, UDO_CORETYPE_DSP = 0x04, + SNPE_UDO_CORETYPE_LAST = 0xFFFFFFFF, UDO_CORETYPE_LAST = 0xFFFFFFFF +} SnpeUdo_CoreType_t; + +typedef SnpeUdo_CoreType_t Udo_CoreType_t; + +/** + * An enum to specify the parameter type : Scalar or Tensor + */ +typedef enum +{ + /// UDO static param type: scalar + SNPE_UDO_PARAMTYPE_SCALAR = 0x00, UDO_PARAMTYPE_SCALAR = 0x00, + /// UDO static param type: string + SNPE_UDO_PARAMTYPE_STRING = 0x01, UDO_PARAMTYPE_STRING = 0x01, + /// UDO static param type: tensor + SNPE_UDO_PARAMTYPE_TENSOR = 0x02, UDO_PARAMTYPE_TENSOR = 0x02, + SNPE_UDO_PARAMTYPE_LAST = 0xFFFFFFFF, UDO_PARAMTYPE_LAST = 0xFFFFFFFF +} SnpeUdo_ParamType_t; + +typedef SnpeUdo_ParamType_t Udo_ParamType_t; + +/** + * An enum to specify quantization type + */ +typedef enum +{ + /// Tensor Quantization type: NONE. Signifies unquantized tensor data + SNPE_UDO_QUANTIZATION_NONE = 0x00, UDO_QUANTIZATION_NONE = 0x00, + /// Tensor Quantization type: Tensorflow-style + SNPE_UDO_QUANTIZATION_TF = 0x01, UDO_QUANTIZATION_TF = 0x01, + SNPE_UDO_QUANTIZATION_QMN = 0x02, UDO_QUANTIZATION_QMN = 0x02, + SNPE_UDO_QUANTIZATION_LAST = 0xFFFFFFFF, UDO_QUANTIZATION_LAST = 0xFFFFFFFF +} SnpeUdo_QuantizationType_t; + +typedef SnpeUdo_QuantizationType_t Udo_QuantizationType_t; + +/** + * @brief A struct which is used to provide a version number using 3 values : major, minor, teeny + * + */ +typedef struct +{ + /// version field: major - for backward-incompatible changes + uint32_t major; + /// version field: minor - for backward-compatible feature updates + uint32_t minor; + /// version field: teeny - for minor bug-fixes and clean-up + uint32_t teeny; +} SnpeUdo_Version_t; + +typedef SnpeUdo_Version_t Udo_Version_t; + +/** + * @brief A struct returned from version query, contains the Library version and API version + * + */ +typedef struct +{ + /// Version of UDO library. Controlled by users + SnpeUdo_Version_t libVersion; + /// Version of SNPE UDO API used in compiling library. Determined by SNPE + SnpeUdo_Version_t apiVersion; +} SnpeUdo_LibVersion_t; + +/** + * @brief A struct returned from version query, contains the package version + * + */ +typedef struct +{ + /// Version of UDO API used in package. + Udo_Version_t apiVersion; +} Udo_PkgVersion_t; + +/** + * @brief A union to hold the value of a generic type. Allows defining a parameter struct + * in a generic way, with a "value" location that holds the data regardless of the type. 
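+ *
+ * For example (a non-normative sketch), a float scalar parameter would be
+ * expressed through the structures below as:
+ * @code
+ * SnpeUdo_ScalarParam_t s;
+ * s.dataType             = SNPE_UDO_DATATYPE_FLOAT_32;
+ * s.dataValue.floatValue = 0.5f;
+ * @endcode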
+ * + */ +typedef union +{ + /// value type: float + float floatValue; + /// value type: unsigned 32-bit integer + uint32_t uint32Value; + /// value type: signed 32-bit integer + int32_t int32Value; + /// value type: unsigned 16-bit integer + uint16_t uint16Value; + /// value type: signed 16-bit integer + int16_t int16Value; + /// value type: unsigned 8-bit integer + uint8_t uint8Value; + /// value type: signed 8-bit integer + int8_t int8Value; +} SnpeUdo_Value_t; + +typedef SnpeUdo_Value_t Udo_Value_t; + +/** + * @brief A struct which defines a scalar parameter : name, data type, and union of values + * + */ +typedef struct +{ + /// The parameter data type : float, int, etc. + SnpeUdo_DataType_t dataType; + /// a union of specified type which holds the data + SnpeUdo_Value_t dataValue; +} SnpeUdo_ScalarParam_t; + +typedef SnpeUdo_ScalarParam_t Udo_ScalarParam_t; + +/** + * @brief A struct which defines the quantization parameters in case of Tensorflow style quantization + * + */ +typedef struct +{ + /// minimum value of the quantization range of data + float minValue; + /// maximum value of the quantization range of data + float maxValue; +} SnpeUdo_TFQuantize_t; + +typedef SnpeUdo_TFQuantize_t Udo_TFQuantize_t; + +/** + * @brief A struct which defines the quantization type, and union of supported quantization structs + * + */ +typedef struct +{ + /// quantization type (only TF-style currently supported) + SnpeUdo_QuantizationType_t quantizeType; + union + { + /// TF-style min-max quantization ranges + SnpeUdo_TFQuantize_t TFParams; + }; +} SnpeUdo_QuantizeParams_t; + +typedef SnpeUdo_QuantizeParams_t Udo_QuantizeParams_t; + +/** + * @brief A struct which defines the datatype associated with a specified core-type + * This should be used to denote the datatypes for a single tensor info, depending + * on the intended execution core. + * + */ +typedef struct +{ + /// The IP Core + SnpeUdo_CoreType_t coreType; + /// The associated datatype for this coreType + SnpeUdo_DataType_t dataType; +} SnpeUdo_PerCoreDatatype_t; + +typedef SnpeUdo_PerCoreDatatype_t Udo_PerCoreDatatype_t; + +/** + * @brief A struct which defines a tensor parameter : name, data type, layout, quantization, more. + * Also holds a pointer to the tensor data. + * + */ +typedef struct +{ + /// The maximum allowable dimensions of the tensor. The memory held in + /// _tensorData_ is guaranteed to be large enough for this. + uint32_t* maxDimensions; + /// The current dimensions of the tensor. An operation may modify the current + /// dimensions of its output, to indicate cases where the output has been + /// "resized". + /// Note that for static parameters, the current and max dimensions must + /// match. + uint32_t* currDimensions; + /// Quantization params applicable to the tensor. Currently only supports + /// Tensorflow quantization style. + SnpeUdo_QuantizeParams_t quantizeParams; + /// Number of dimensions to the tensor: 3D, 4D, etc. + uint32_t tensorRank; + /// The parameter data type: float, int, etc. + SnpeUdo_DataType_t dataType; + /// The tensor layout type: NCHW, NHWC, etc. + SnpeUdo_TensorLayout_t layout; + /// Opaque pointer to tensor data. User may be required to re-interpret the pointer + /// based on core-specific definitions. 
+ void* tensorData; +} SnpeUdo_TensorParam_t; + +typedef SnpeUdo_TensorParam_t Udo_TensorParam_t; + +/** + * @brief struct which defines a UDO parameter - a union of scalar, tensor and string parameters + * + */ +typedef struct +{ + /// Type is scalar or tensor + SnpeUdo_ParamType_t paramType; + /// The param name, for example : "offset", "activation_type" + SnpeUdo_String_t paramName; + union + { + /// scalar param value + SnpeUdo_ScalarParam_t scalarParam; + /// tensor param value + SnpeUdo_TensorParam_t tensorParam; + /// string param value + SnpeUdo_String_t stringParam; + }; +} SnpeUdo_Param_t; + +typedef SnpeUdo_Param_t Udo_Param_t; + +/** + * @brief A struct which defines Operation information which is specific for IP core (CPU, GPU, DSP ...) + * + */ +typedef struct +{ + /// The IP Core + SnpeUdo_CoreType_t udoCoreType; + /// Bitmask, defines supported internal calculation types (like FLOAT_32, etc) + /// Based on SnpeUdo_DataType + SnpeUdo_Bitmask_t operationCalculationTypes; +} SnpeUdo_OpCoreInfo_t; + +typedef SnpeUdo_OpCoreInfo_t Udo_OpCoreInfo_t; + +/** + * @brief A struct which defines the common and core-specific Operation information + * + */ +typedef struct +{ + /// Operation type + SnpeUdo_String_t operationType; + /// A bitmask describing which IP Cores (CPU, GPU, DSP ...) support this operation + /// Translated based on SnpeUdo_CoreType + SnpeUdo_Bitmask_t supportedByCores; + /// Number of static parameters defined by the op + uint32_t numOfStaticParams; + /// Array of static parameters. Can be scalar or tensor params + SnpeUdo_Param_t* staticParams; + /// Number of input tensors this op receives + uint32_t numOfInputs; + /// Array of input tensor names to this operation + SnpeUdo_String_t* inputNames; + /// Number of output tensors this op receives + uint32_t numOfOutputs; + /// Array of output tensor names to this operation + SnpeUdo_String_t* outputNames; + /// Number of cores that the op can execute on + uint32_t numOfCoreInfo; + /// Array of per-core information entries + SnpeUdo_OpCoreInfo_t* opPerCoreInfo; +} SnpeUdo_OperationInfo_t; + +typedef SnpeUdo_OperationInfo_t Udo_OperationInfo_t; + +/** + * @brief A struct which provides the implementation library info : type, name + * + */ +typedef struct +{ + /// Defines the IP Core that this implementation library is targeting + SnpeUdo_CoreType_t udoCoreType; + /// library name. will be looked at in the standard library path + SnpeUdo_String_t libraryName; +} SnpeUdo_LibraryInfo_t; + +typedef SnpeUdo_LibraryInfo_t Udo_LibraryInfo_t; + +/** + * @brief A struct returned by the registration library and contains information on the UDO package : + * name, operations, libraries, etc. + * + */ +typedef struct +{ + /// A string containing the package name + SnpeUdo_String_t packageName; + /// A bitmask describing supported IP cores (CPU, GPU, DSP ...) + /// Translated based on SnpeUdo_CoreType + SnpeUdo_Bitmask_t supportedCoreTypes; + /// The number of implementation libraries in the package + uint32_t numOfImplementationLib; + /// Array of implementation libraries names/types + SnpeUdo_LibraryInfo_t* implementationLib; + /// A string containing all operation types separated by space + SnpeUdo_String_t operationsString; + /// Number of supported operations + uint32_t numOfOperations; + /// Array of Operation info structs. 
Each entry describes one
+    /// Operation (name, params, inputs, outputs)
+    SnpeUdo_OperationInfo_t* operationsInfo;
+} SnpeUdo_RegInfo_t;
+
+typedef SnpeUdo_RegInfo_t Udo_RegInfo_t;
+
+/**
+* @brief A struct returned by the implementation library and contains information on the
+* specific library: name, IP Core, operations, etc.
+*
+*/
+typedef struct
+{
+    /// Defines the IP Core that this implementation library is targeting
+    SnpeUdo_CoreType_t udoCoreType;
+    /// A string containing the package name
+    SnpeUdo_String_t packageName;
+    /// A string containing all operation types separated by space
+    SnpeUdo_String_t operationsString;
+    /// Number of supported operations
+    uint32_t numOfOperations;
+} SnpeUdo_ImpInfo_t;
+
+typedef SnpeUdo_ImpInfo_t Udo_ImpInfo_t;
+
+/**
+ * @brief This struct defines an operation. It is used for validation
+ *        or creation of an operation.
+ *        In case of using it for creation, the static params which are tensors
+ *        contain pointers to the real data (weights, for example), and input/output
+ *        tensors also include pointers to the buffers used.
+ */
+typedef struct
+{
+    /// The IP Core that the operation is defined for - CPU, GPU, DSP...
+    SnpeUdo_CoreType_t udoCoreType;
+    /// Operation type
+    SnpeUdo_String_t operationType;
+    /// The number of static parameters provided in the staticParams array.
+    /// This number has to match the number provided by the UDO Registration library information
+    uint32_t numOfStaticParams;
+    /// Array of static parameters
+    SnpeUdo_Param_t* staticParams;
+    /// The number of input parameters provided in the inputs array.
+    /// This number has to match the number provided by the UDO Registration library information
+    uint32_t numOfInputs;
+    /// Array of input tensors, providing layout, data type, sizes, etc
+    /// When used to create an operation, also contains the initial location of the data
+    SnpeUdo_TensorParam_t* inputs;
+    /// The number of output parameters provided in the outputs array.
+    /// This number has to match the number provided by the UDO Registration library information
+    uint32_t numOfOutputs;
+    /// Array of output tensors, providing layout, data type, sizes, etc
+    /// When used to create an operation, also contains the initial location of the data
+    SnpeUdo_TensorParam_t* outputs;
+} SnpeUdo_OpDefinition_t;
+
+typedef SnpeUdo_OpDefinition_t Udo_OpDefinition_t;
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#endif //SNPE_UDO_BASE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h
new file mode 100755
index 0000000000000..84a8fe310908e
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h
@@ -0,0 +1,78 @@
+//==============================================================================
+//
+// Copyright (c) 2019 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#include "DSP/Udo/UdoBase.h"
+
+#define HVX_ALIGNMENT 128
+#define DSP_STRUCT_ALIGNMENT 8
+#define DSP_ALIGN(X, ALIGNMENT) (((X) + ALIGNMENT - 1) & (~((ALIGNMENT)-1)))
+
+typedef struct dspStaticParamsMeta {
+    uint32_t size;
+    uint32_t numParams;
+} dspStaticParamsMeta_t;
+
+typedef struct tensorParamInfo {
+    SnpeUdo_TensorLayout_t layout;
+    SnpeUdo_QuantizeParams_t quantizeInfo;
+    SnpeUdo_DataType_t dataType;
+    uint32_t paddingFor8byteAlignment;
+} tensorParamInfo_t;
+
+typedef struct udoString {
+    uint32_t sizeStruct; // aligned
+    uint32_t lengthString; // does not include null character
+    // followed by a string
+} udoString_t; // allocate mem for string for 8 byte alignment
+
+typedef struct dims {
+    uint32_t size;
+    uint32_t rank;
+    uint32_t ds; // rank # of max dimensions followed by rank # of current dimensions for tensors
+} dims_t;
+
+typedef struct tensorData {
+    uint32_t structSize;
+    uint32_t dataSize;
+    // followed by actual tensor data
+} tensorData_t;
+
+typedef struct dspStaticParamDescriptor {
+    uint32_t size; // including size of descriptor (including dims + data for tensors) (or including string for strings)
+    SnpeUdo_ParamType_t paramType;
+    union { // not used for string data
+        SnpeUdo_ScalarParam_t scalarInfo;
+        tensorParamInfo_t tensorInfo;
+    };
+    udoString_t name;
+    // followed by char*
+    // in case of tensor, followed by dim_stride and tensor_data
+    // in case of string, followed by udo_string and char*
+} dspStaticParamDescriptor_t;
+
+typedef struct paramSizes {
+    uint32_t descriptorSize;
+    uint32_t nameStructSize;
+    uint32_t dimsSize;
+    uint32_t dataStructSize;
+    uint32_t dataSize;
+    uint32_t stringDataStructSize;
+} paramSizes_t;
+
+typedef struct dspStaticParams {
+    dspStaticParamsMeta_t meta;
+    dspStaticParamDescriptor_t paramDesc;
+} dspStaticParams_t;
+
+
+int
+SnpeUdo_flattenStaticParams (SnpeUdo_Param_t** paramList, uint32_t numParams, uint32_t* flattenedSize, void** flattened);
+
+void
+SnpeUdo_freeFlattenedStaticParams (void** flattened);
+
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h
new file mode 100755
index 0000000000000..bcc767a3c4a0f
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h
@@ -0,0 +1,343 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef SNPE_UDO_IMPL_H
+#define SNPE_UDO_IMPL_H
+
+#include <stdbool.h>
+
+#include "DSP/Udo/UdoShared.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+typedef struct _SnpeUdo_OpFactory_t* SnpeUdo_OpFactory_t;
+typedef struct _SnpeUdo_Operation_t* SnpeUdo_Operation_t;
+
+typedef SnpeUdo_OpFactory_t Udo_OpFactory_t;
+typedef SnpeUdo_Operation_t Udo_Operation_t;
+
+/**
+ * @brief Initialize the shared library's data structures. Calling any other
+ *        library function before this one will result in error.
+ *
+ * @param[in] globalInfrastructure Global core-specific infrastructure to be
+ *        used by operations created in this library. The definition and
+ *        semantics of this object will be defined in the corresponding
+ *        implementation header for the core type.
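+ *
+ * Illustrative call sequence (editor's sketch, not part of the original
+ * header); `infra` stands for the core-specific pointer handed over by the
+ * framework:
+ * @code
+ *   SnpeUdo_ErrorType_t err = SnpeUdo_initImplLibrary(infra);
+ * @endcode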
+ * @return Error code + */ +SnpeUdo_ErrorType_t +SnpeUdo_initImplLibrary(void* globalInfrastructure); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_InitImplLibraryFunction_t)(void*); + +/** + * @brief A function to query the API version of the UDO implementation library. + * The function populates a SnpeUdo_LibVersion_t struct, which contains a SnpeUdo_Version_t + * struct for API version and library version. + * + * @param[in, out] version A pointer to struct which contains major, minor, teeny information for + * library and api versions. + * + * @return Error code + */ +SnpeUdo_ErrorType_t +SnpeUdo_getImplVersion(SnpeUdo_LibVersion_t** version); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_getImplVersion_t)(SnpeUdo_LibVersion_t** version); + +/** + * @brief Release the shared library's data structures, and invalidate any + * handles returned by the library. The behavior of any outstanding + * asynchronous calls made to this library when this function is called + * are undefined. All library functions (except SnpeUdo_initImplLibrary) will + * return an error after this function has been successfully called. + * + * It should be possible to call SnpeUdo_initImplLibrary after calling this + * function, and re-initialize the library. + * + * @return Error code + */ +SnpeUdo_ErrorType_t +SnpeUdo_terminateImplLibrary(void); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_TerminateImplLibraryFunction_t)(void); + + +/** + * @brief A function to query info on the UDO implementation library. + * The function populates a structure which contains information about + * operations that are part of this library + * + * @param[in, out] implementationInfo A pointer to struct which contains information + * on the operations + * + * @return error code + * + */ +SnpeUdo_ErrorType_t +SnpeUdo_getImpInfo(SnpeUdo_ImpInfo_t** implementationInfo); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_GetImpInfoFunction_t)(SnpeUdo_ImpInfo_t** implementationInfo); + +typedef SnpeUdo_GetImpInfoFunction_t Udo_GetImpInfoFunction_t; + +/** + * @brief A function to create an operation factory. + * The function receives the operation type, and an array of static parameters, + * and returns operation factory handler + * + * @param[in] udoCoreType The Core type to create the operation on. An error will + * be returned if this does not match the core type of the library. + * + * @param[in] perFactoryInfrastructure CreateOpFactory infrastructure appropriate to this + * core type. The definition and semantics of this object will be defined + * in the corresponding implementation header for the core type. + * + * @param[in] operationType A string containing Operation type. for example "MY_CONV" + * + * @param[in] numOfStaticParams The number of static parameters. + * + * @param[in] staticParams Array of static parameters + * + * @param[in,out] opFactory Handler to Operation Factory, to be used when creating operations + * + * @return Error Code + */ +SnpeUdo_ErrorType_t +SnpeUdo_createOpFactory(SnpeUdo_CoreType_t udoCoreType, + void* perFactoryInfrastructure, + SnpeUdo_String_t operationType, + uint32_t numOfStaticParams, + SnpeUdo_Param_t* staticParams, + SnpeUdo_OpFactory_t* opFactory); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_CreateOpFactoryFunction_t)(SnpeUdo_CoreType_t, + void*, + SnpeUdo_String_t, + uint32_t, + SnpeUdo_Param_t*, + SnpeUdo_OpFactory_t*); + +typedef SnpeUdo_CreateOpFactoryFunction_t Udo_CreateOpFactoryFunction_t; + +/** + * @brief A function to release the resources allocated for an operation factory + * created by this library. 
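+ *
+ * Typical create/release pairing (editor's illustrative sketch, not part of
+ * the original header; error handling elided, and `infra`, `nParams` and
+ * `params` are hypothetical):
+ * @code
+ *   SnpeUdo_OpFactory_t factory = NULL;
+ *   SnpeUdo_createOpFactory(SNPE_UDO_CORETYPE_DSP, infra, "MY_CONV",
+ *                           nParams, params, &factory);
+ *   // ... create and execute operations ...
+ *   SnpeUdo_releaseOpFactory(factory);
+ * @endcode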
+ * + * @param[in] opFactory The operation factory to release. Upon success this handle will be invalidated. + * + * @return Error Code + */ +SnpeUdo_ErrorType_t +SnpeUdo_releaseOpFactory(SnpeUdo_OpFactory_t opFactory); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_ReleaseOpFactoryFunction_t)(SnpeUdo_OpFactory_t); + +typedef SnpeUdo_ReleaseOpFactoryFunction_t Udo_ReleaseOpFactoryFunction_t; + +/** + * @brief A function to create an operation from the factory. + * The function receives array of inputs and array of outputs, and creates an operation + * instance, returning the operation instance handler. + * + * @param[in] opFactory OpFactory instance containing the parameters for this operation. + * + * @param[in] perOpInfrastructure Per-Op infrastructure for this operation. The definition + * and semantics of this object will be defined in the implementation header + * appropriate to this core type. + * + * @param[in] numOfInputs The number of input tensors this operation will receive. + * + * @param[in] inputs Array of input tensors, providing both the sizes and initial + * location of the data. + * + * @param[in] numOfOutputs Number of output tensors this operation will produce. + * + * @param[in] outputs Array of output tensors, providing both the sizes and + * initial location of the data. + * + * @param[in,out] operation Handle for newly created operation instance. + * + * @return Error Code + */ +SnpeUdo_ErrorType_t +SnpeUdo_createOperation(SnpeUdo_OpFactory_t opFactory, + void* perOpInfrastructure, + uint32_t numOfInputs, + SnpeUdo_TensorParam_t* inputs, + uint32_t numOfOutputs, + SnpeUdo_TensorParam_t* outputs, + SnpeUdo_Operation_t* operation); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_CreateOperationFunction_t)(SnpeUdo_OpFactory_t, + void*, + uint32_t, + SnpeUdo_TensorParam_t*, + uint32_t, + SnpeUdo_TensorParam_t*, + SnpeUdo_Operation_t*); + +typedef SnpeUdo_CreateOperationFunction_t Udo_CreateOperationFunction_t; + +/** + * @brief A pointer to notification function. + * + * The notification function supports the non-blocking (e.g. asynchronous) execution use-case. + * In case an "executeUdoOp" function is called with "blocking" set to zero, and a + * notify function, this function will be called by the implementation library at the + * end of execution. The implementation library will pass the notify function the ID + * that was provided to it when "executeUdoOp" was called. + * + * @param[in] ID 32-bit value, that was provided to executeUdoOp by the calling entity. + * Can be used to track the notifications, in case of multiple execute calls issued. + * + * @return Error code + * + */ +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_ExternalNotify_t)(const uint32_t ID); + +typedef SnpeUdo_ExternalNotify_t Udo_ExternalNotify_t; + +/** + * @brief Operation execution function. + * + * Calling this function will run the operation on set of inputs, generating a set of outputs. + * The call can be blocking (synchronous) or non-blocking (asynchronous). To support the + * non-blocking mode, the calling entity can pass an ID and a notification function. + * At the end of the execution this notification function would be called, passing it the ID. + * NOTE: Asynchronous execution mode not supported in this release. + * + * @param[in] operation handle to the operation on which execute is invoked + * @param[in] blocking flag to indicate execution mode. + * If set, execution is blocking, + * e.g SnpeUdo_executeOp call does not return until execution is done. 
+ * If not set, SnpeUdo_executeOp returns immediately, and the
+ * library will call the notification function (if set) when execution is done.
+ *
+ * @param[in] ID 32-bit number that can be used by the calling entity to track execution
+ *        in case of non-blocking execution.
+ *        For example, it can be a sequence number, increased by one on each call.
+ *
+ * @param[in] notifyFunc Pointer to notification function. If the pointer is set, and execution is
+ *        non-blocking, the library will call this function at end of execution,
+ *        passing the number provided as ID
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_executeOp(SnpeUdo_Operation_t operation,
+                  bool blocking,
+                  const uint32_t ID,
+                  SnpeUdo_ExternalNotify_t notifyFunc);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ExecuteOpFunction_t)(SnpeUdo_Operation_t,
+                               bool,
+                               const uint32_t,
+                               SnpeUdo_ExternalNotify_t);
+
+typedef SnpeUdo_ExecuteOpFunction_t Udo_ExecuteOpFunction_t;
+
+/**
+ * @brief A function to set the inputs & outputs. Part of the SnpeUdo_Operation struct,
+ *        returned from creation of a new operation instance.
+ *        Not supported in this release.
+ *
+ * This function allows the calling entity to change some of the inputs and outputs
+ * between calls to execute.
+ * Note that the change is limited to changing the pointer to the tensor data only.
+ * Any other change may be rejected by the implementation library, causing
+ * immediate invalidation of the operation instance.
+ *
+ * @param[in] operation Operation on which IO tensors are set
+ *
+ * @param[in] inputs array of tensor parameters. The calling entity may provide a subset of the
+ *        operation inputs, providing only those that it wants to change.
+ *
+ * @param[in] outputs array of tensor parameters. The calling entity may provide a subset of the
+ *        operation outputs, providing only those that it wants to change.
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_setOpIO(SnpeUdo_Operation_t operation,
+                SnpeUdo_TensorParam_t* inputs,
+                SnpeUdo_TensorParam_t* outputs);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_SetOpIOFunction_t)(SnpeUdo_Operation_t,
+                             SnpeUdo_TensorParam_t*,
+                             SnpeUdo_TensorParam_t*);
+
+typedef SnpeUdo_SetOpIOFunction_t Udo_SetOpIOFunction_t;
+
+/**
+ * @brief A function to return execution times.
+ *
+ * This function can be called to query the operation execution times on the IP core
+ * on which the operation is run. The time is provided in micro-seconds
+ *
+ * @param[in] operation Handle to operation whose execution time is being profiled
+ *
+ * @param[in,out] executionTime pointer to a uint32 value. This function writes the operation
+ *        execution time in usec into this value.
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_profileOp(SnpeUdo_Operation_t operation, uint32_t *executionTime);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ProfileOpFunction_t)(SnpeUdo_Operation_t, uint32_t*);
+
+typedef SnpeUdo_ProfileOpFunction_t Udo_ProfileOpFunction_t;
+
+/**
+ * @brief A function to release the operation instance
+ * \n When it is called, the implementation library needs to release all resources
+ * allocated for this operation instance.
+ * \n Note that all function pointers which are part of SnpeUdo_Operation become
+ * invalid once releaseUdoOp call returns.
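+ *
+ * Teardown sketch (editor's illustration, not part of the original header;
+ * `op` is a hypothetical operation handle):
+ * @code
+ *   SnpeUdo_executeOp(op, true, 0, NULL); // blocking execute
+ *   SnpeUdo_releaseOp(op);                // op is invalid from here on
+ * @endcode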
+ *
+ * @param[in] operation Handle to operation to be released
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_releaseOp(SnpeUdo_Operation_t operation);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ReleaseOpFunction_t)(SnpeUdo_Operation_t);
+
+typedef SnpeUdo_ReleaseOpFunction_t Udo_ReleaseOpFunction_t;
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif //SNPE_UDO_IMPL_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h
new file mode 100755
index 0000000000000..522c6050a402d
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h
@@ -0,0 +1,199 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+// Header to be used by a DSP Hexnn UDO Implementation library
+
+#ifndef SNPE_UDO_IMPL_DSP_H
+#define SNPE_UDO_IMPL_DSP_H
+#include <stdio.h>
+#include "DSP/Udo/UdoImpl.h"
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+/**
+ * @brief A function to validate that a set of params is supported by an operation
+ *        This function is HexNN specific, use case is when registration library is not in use.
+ *        Optional function.
+ *
+ * @param[in] operationType Operation type
+ * @param[in] numOfStaticParams Number of static params defined by the op
+ * @param[in] staticParams Array of static params to the op
+ * @return Error code, indicating if the operation can be created on this set of configuration or not.
+ *
+ */
+
+SnpeUdo_ErrorType_t
+SnpeUdo_validateOperation (SnpeUdo_String_t operationType,
+                           uint32_t numOfStaticParams,
+                           const SnpeUdo_Param_t* staticParams);
+
+typedef SnpeUdo_ErrorType_t (*SnpeUdo_ValidateOperationFunction_t) (SnpeUdo_String_t,
+                                                                    uint32_t,
+                                                                    const SnpeUdo_Param_t*);
+
+typedef SnpeUdo_ValidateOperationFunction_t Udo_ValidateOperationFunction_t;
+
+// enum used for indicating input/output tensor data layouts on DSP, plain vs d32
+typedef enum {
+    SNPE_UDO_DSP_TENSOR_LAYOUT_PLAIN = 0x00, UDO_DSP_TENSOR_LAYOUT_PLAIN = 0x00,
+    SNPE_UDO_DSP_TENSOR_LAYOUT_D32 = 0x01, UDO_DSP_TENSOR_LAYOUT_D32 = 0x01
+} SnpeUdo_HexNNTensorLayout_t;
+
+typedef SnpeUdo_HexNNTensorLayout_t Udo_HexNNTensorLayout_t;
+
+/**
+ * @brief A function to query numbers of inputs and outputs,
+ *        quantization type of each input and each output as arrays,
+ *        and data layout (plain vs d32) of each input and each output as arrays
+ *        of an operation.
+ *        inputsQuantTypes and inputsLayouts should point to arrays of size numOfInputs
+ *        outputsQuantTypes and outputsLayouts should point to arrays of size numOfOutputs
+ *
+ *        Note: inputsLayouts and outputsLayouts can point to NULL, in this case, it is
+ *        assumed all inputs and/or outputs have plain data layouts, i.e.
no D32 + * + * @param[in] operationType Operation type + * @param[in] numOfStaticParams Number of static params defined by the op + * @param[in] staticParams Array of static params to the op + * @param[in,out] numOfInputs Number of input tensors to the op + * @param[in,out] inputsQuantTypes Array of Quantization info for each input tensor + * @param[in,out] inputsLayouts Array of layout type for each input tensor + * @param[in,out] numOfOutputs Number of output tensors to the op + * @param[in,out] outputsQuantTypes Array of Quantization info for each output tensor + * @param[in,out] outputsLayouts Array of layout type for each output tensor + * @return error code, indicating status of query + */ + +SnpeUdo_ErrorType_t +SnpeUdo_queryOperation (SnpeUdo_String_t operationType, + uint32_t numOfStaticParams, + const SnpeUdo_Param_t* staticParams, + uint32_t* numOfInputs, + SnpeUdo_QuantizationType_t** inputsQuantTypes, + SnpeUdo_HexNNTensorLayout_t** inputsLayouts, + uint32_t* numOfOutputs, + SnpeUdo_QuantizationType_t** outputsQuantTypes, + SnpeUdo_HexNNTensorLayout_t** outputsLayouts); + +typedef SnpeUdo_ErrorType_t (*SnpeUdo_QueryOperationFunction_t) (SnpeUdo_String_t, + uint32_t, + const SnpeUdo_Param_t*, + uint32_t*, + SnpeUdo_QuantizationType_t**, + SnpeUdo_HexNNTensorLayout_t**, + uint32_t*, + SnpeUdo_QuantizationType_t**, + SnpeUdo_HexNNTensorLayout_t**); + +typedef SnpeUdo_QueryOperationFunction_t Udo_QueryOperationFunction_t; + +// Global infrastructure functions supported by Hexagon-NN v2 +typedef void (*workerThread_t) (void* perOpInfrastructure, void* userData); +typedef int (*udoSetOutputTensorSize_t) (void* perOpInfrastructure, uint32_t outIdx, uint32_t size); +typedef int (*udoGetInputD32Paddings_t) (void* perOpInfrastructure, uint32_t inIdx, + uint32_t* heightPadBefore, uint32_t* heightPadAfter, + uint32_t* widthPadBefore, uint32_t* widthPadAfter, + uint32_t* depthPadBefore, uint32_t* depthPadAfter); +typedef int (*udoSetOutputD32ShapeSizePaddings_t) (void* perOpInfrastructure, uint32_t outIdx, + uint32_t batch, + uint32_t height, uint32_t heightPadBefore, uint32_t heightPadAfter, + uint32_t width, uint32_t widthPadBefore, uint32_t widthPadAfter, + uint32_t depth, uint32_t depthPadBefore, uint32_t depthPadAfter, + SnpeUdo_DataType_t dataType); +typedef void* (*udoMemalign_t) (size_t n, size_t size); +typedef void* (*udoMalloc_t) (size_t size); +typedef void* (*udoCalloc_t) (size_t n, size_t size); +typedef void (*udoFree_t) (void* ptr); +typedef uint32_t (*udoGetVtcmSize_t) (void* perOpInfrastructure); +typedef void* (*udoGetVtcmPtr_t) (void* perOpInfrastructure); +typedef uint32_t (*udoVtcmIsReal_t) (void* perOpInfrastructure); +typedef void (*udoRunWorkerThreads_t) (void* perOpInfrastructure, uint32_t nThreads, workerThread_t w, void* userData); + +typedef struct hexNNv2GlobalInfra { + udoSetOutputTensorSize_t udoSetOutputTensorSize; + udoGetInputD32Paddings_t udoGetInputD32Paddings; + udoSetOutputD32ShapeSizePaddings_t udoSetOutputD32ShapeSizePaddings; + udoMemalign_t udoMemalign; + udoMalloc_t udoMalloc; + udoCalloc_t udoCalloc; + udoFree_t udoFree; + udoGetVtcmSize_t udoGetVtcmSize; + udoGetVtcmPtr_t udoGetVtcmPtr; + udoVtcmIsReal_t udoVtcmIsReal; + udoRunWorkerThreads_t udoRunWorkerThreads; +} SnpeUdo_HexNNv2GlobalInfra_t; + +typedef SnpeUdo_HexNNv2GlobalInfra_t Udo_HexNNv2GlobalInfra_t; + +// hexnn types +typedef enum hexnnInfraType { + UDO_INFRA_HEXNN_V2, + UDO_INFRA_HEXNN_V3 // reserved, do not use +} SnpeUdo_HexNNInfraType_t; + +typedef SnpeUdo_HexNNInfraType_t 
Udo_HexNNInfraType_t;
+
+typedef struct {
+    Udo_CreateOpFactoryFunction_t create_op_factory;
+    Udo_CreateOperationFunction_t create_operation;
+    Udo_ExecuteOpFunction_t execute_op;
+    Udo_ReleaseOpFunction_t release_op;
+    Udo_ReleaseOpFactoryFunction_t release_op_factory;
+    Udo_ValidateOperationFunction_t validate_op;
+    Udo_QueryOperationFunction_t query_op;
+} udo_func_package_t;
+
+/**
+ * @brief Infrastructures needed by a developer of DSP Hexnn UDO Implementation library.
+ *
+ * The framework/runtime which loads the Hexnn UDO implementation library provides
+ * this infrastructure to the loaded library by calling "SnpeUdo_initImplLibrary"
+ * function, and passing it (cast to void*). The Hexnn UDO library is expected
+ * to cast it back to this structure.
+ *
+ */
+typedef struct dspGlobalInfrastructure {
+    SnpeUdo_Version_t dspInfraVersion; // api version
+    SnpeUdo_HexNNInfraType_t infraType;
+    SnpeUdo_HexNNv2GlobalInfra_t hexNNv2Infra;
+} SnpeUdo_DspGlobalInfrastructure_t;
+
+typedef SnpeUdo_DspGlobalInfrastructure_t Udo_DspGlobalInfrastructure_t;
+
+/**
+ * hexnn v2 per op factory infrastructure
+ *
+ * The framework/runtime passes per op factory infrastructure as a void pointer
+ * to HexNN UDO implementation library by calling function "SnpeUdo_createOpFactory".
+ * UDO implementation library is expected to cast it back to this following struct.
+ *
+ */
+typedef struct hexnnv2OpFactoryInfra {
+    unsigned long graphId;
+} SnpeUdo_HexNNv2OpFactoryInfra_t;
+
+typedef SnpeUdo_HexNNv2OpFactoryInfra_t Udo_HexNNv2OpFactoryInfra_t;
+
+/**
+ * hexnn v2 per operation infrastructure
+ *
+ * The framework/runtime passes per operation infrastructure as a void pointer
+ * to HexNN UDO implementation library by calling function "SnpeUdo_createOperation".
+ * UDO implementation library is expected to cast it to the following type and save it.
+ *
+ * This is needed to be passed back into some functions from global infrastructure.
+ *
+ */
+typedef void* SnpeUdo_HexNNv2OpInfra_t;
+
+typedef SnpeUdo_HexNNv2OpInfra_t Udo_HexNNv2OpInfra_t;
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#endif // SNPE_UDO_IMPL_DSP_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h
new file mode 100755
index 0000000000000..8c17c1d5b35f1
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h
@@ -0,0 +1,48 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef SNPE_UDO_SHARED_H
+#define SNPE_UDO_SHARED_H
+
+#include "DSP/Udo/UdoBase.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+/**
+ * @brief A function to return the various versions as they relate to the UDO
+ *        The function returns a struct containing the following:
+ *        libVersion: the version of the implementation library compiled for the UDO. Set by user
+ *        apiVersion: the version of the UDO API used in compiling the implementation library.
+ *        Set by SNPE
+ *
+ * @param[in, out] version A pointer to Version struct of type SnpeUdo_LibVersion_t
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_getVersion (SnpeUdo_LibVersion_t** version);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_GetVersionFunction_t) (SnpeUdo_LibVersion_t** version);
+
+typedef SnpeUdo_GetVersionFunction_t Udo_GetVersionFunction_t;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#endif // SNPE_UDO_SHARED_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h
new file mode 100755
index 0000000000000..d7050c875f6db
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h
@@ -0,0 +1,71 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines the QNN GPU specialization of the QnnBackend.h interface.
+ */
+
+#ifndef QNN_GPU_BACKEND_H
+#define QNN_GPU_BACKEND_H
+
+#ifdef __cplusplus
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif
+
+#include "QnnBackend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+* @brief This enum defines QNN GPU custom Backend config options.
+*/
+typedef enum {
+  /// If non-zero, tuning mode will be enabled
+  QNN_GPU_BACKEND_CONFIG_OPTION_ENABLE_TUNING_MODE = 0,
+  /// The Performance cache directory. Must be non-null
+  QNN_GPU_BACKEND_CONFIG_OPTION_PERFORMANCE_CACHE_DIR = 1,
+  /// If non-zero, the performance cache will be ignored when initializing
+  QNN_GPU_BACKEND_CONFIG_OPTION_INVALIDATE_PERFORMANCE_CACHE = 2,
+  /// Unused, present to ensure 32 bits.
+  QNN_GPU_BACKEND_CONFIG_OPTION_UNDEFINED = 0x7FFFFFFF,
+} QnnGpuBackend_ConfigOption_t;
+
+/**
+ * @brief A struct which defines the QNN GPU Backend custom configuration options.
+ *        Objects of this type are to be referenced through QnnBackend_CustomConfig_t.
+ */
+typedef struct {
+  QnnGpuBackend_ConfigOption_t option;
+  union UNNAMED {
+    uint8_t enableTuningMode;
+    const char* performanceCacheDir;
+    uint8_t invalidatePerformanceCache;
+  };
+} QnnGpuBackend_CustomConfig_t;
+
+// clang-format off
+/// QnnGpuBackend_CustomConfig_t initializer macro
+#define QNN_GPU_BACKEND_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_GPU_BACKEND_CONFIG_OPTION_UNDEFINED, /*option*/ \
+    { \
+      false /*enableTuningMode*/ \
+    } \
+  }
+// clang-format on
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h
new file mode 100755
index 0000000000000..8fd9c18afb46b
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h
@@ -0,0 +1,49 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines common QNN GPU macros.
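+ *
+ * Editor's illustrative sketch (not part of the original header): the
+ * version-init macro defined below can seed a Qnn_ApiVersion_t directly.
+ * @code
+ *   Qnn_ApiVersion_t version = QNN_GPU_API_VERSION_INIT;
+ * @endcode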
+ */
+
+#ifndef QNN_GPU_COMMON_H
+#define QNN_GPU_COMMON_H
+
+#include "QnnCommon.h"
+
+/// GPU Backend identifier
+#define QNN_BACKEND_ID_GPU 4
+
+/// GPU interface provider
+#define QNN_GPU_INTERFACE_PROVIDER_NAME "GPU_QTI_AISW"
+
+// GPU API Version values
+#define QNN_GPU_API_VERSION_MAJOR 3
+#define QNN_GPU_API_VERSION_MINOR 7
+#define QNN_GPU_API_VERSION_PATCH 0
+
+// clang-format off
+
+/// Macro to set Qnn_ApiVersion_t for GPU backend
+#define QNN_GPU_API_VERSION_INIT \
+  { \
+    { \
+      QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \
+      QNN_API_VERSION_MINOR, /*coreApiVersion.minor*/ \
+      QNN_API_VERSION_PATCH /*coreApiVersion.patch*/ \
+    }, \
+    { \
+      QNN_GPU_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_GPU_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_GPU_API_VERSION_PATCH /*backendApiVersion.patch*/ \
+    } \
+  }
+
+// clang-format on
+
+#endif // QNN_GPU_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h
new file mode 100755
index 0000000000000..42599e4280971
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h
@@ -0,0 +1,78 @@
+//==============================================================================
+//
+// Copyright (c) 2021-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines the QNN GPU specialization of the QnnContext.h interface.
+ */
+
+#ifndef QNN_GPU_CONTEXT_H
+#define QNN_GPU_CONTEXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief This enum defines QNN GPU custom context config options.
+ */
+typedef enum {
+  /// Sets performance hint options via QnnGpuContext_PerfHint_t
+  QNN_GPU_CONTEXT_CONFIG_OPTION_PERF_HINT = 0,
+  /// If non-zero, OpenGL buffers will be used
+  QNN_GPU_CONTEXT_CONFIG_OPTION_USE_GL_BUFFERS = 1,
+  /// The kernel disk cache directory. Must be non-null
+  QNN_GPU_CONTEXT_CONFIG_OPTION_KERNEL_REPO_DIR = 2,
+  /// If non-zero, the kernel disk cache will be ignored when initializing
+  QNN_GPU_CONTEXT_CONFIG_OPTION_INVALIDATE_KERNEL_REPO = 3,
+  /// Unused, present to ensure 32 bits.
+  QNN_GPU_CONTEXT_CONFIG_OPTION_UNDEFINED = 0x7FFFFFFF
+} QnnGpuContext_ConfigOption_t;
+
+/**
+ * @brief An enum which defines the different GPU performance hint options.
+ */
+typedef enum {
+  /// Sets the GPU performance hint to high performance, this is the default
+  QNN_GPU_CONTEXT_PERF_HINT_HIGH = 0,
+  /// Sets the GPU performance hint to normal performance
+  QNN_GPU_CONTEXT_PERF_HINT_NORMAL = 1,
+  /// Sets the GPU performance hint to low performance
+  QNN_GPU_CONTEXT_PERF_HINT_LOW = 2
+} QnnGpuContext_PerfHint_t;
+
+/**
+ * @brief A struct which defines the QNN GPU context custom configuration options.
+ *        Objects of this type are to be referenced through QnnContext_CustomConfig_t.
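+ *
+ * Editor's illustrative sketch (not part of the original header), using the
+ * initializer macro defined further below:
+ * @code
+ *   QnnGpuContext_CustomConfig_t cfg = QNN_GPU_CONTEXT_CUSTOM_CONFIG_INIT;
+ *   cfg.option   = QNN_GPU_CONTEXT_CONFIG_OPTION_PERF_HINT;
+ *   cfg.perfHint = QNN_GPU_CONTEXT_PERF_HINT_NORMAL;
+ * @endcode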
+ */
+typedef struct {
+  QnnGpuContext_ConfigOption_t option;
+  union UNNAMED {
+    QnnGpuContext_PerfHint_t perfHint;
+    uint8_t useGLBuffers;
+    const char* kernelRepoDir;
+    uint8_t invalidateKernelRepo;
+  };
+} QnnGpuContext_CustomConfig_t;
+
+// clang-format off
+/// QnnGpuContext_CustomConfig_t initializer macro
+#define QNN_GPU_CONTEXT_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_GPU_CONTEXT_CONFIG_OPTION_UNDEFINED, /*option*/ \
+    { \
+      QNN_GPU_CONTEXT_PERF_HINT_HIGH /*perfHint*/ \
+    } \
+  }
+// clang-format on
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h
new file mode 100755
index 0000000000000..e0652d44883ef
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h
@@ -0,0 +1,72 @@
+//==============================================================================
+//
+// Copyright (c) 2020-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines the QNN GPU specialization of the QnnGraph.h interface.
+ */
+
+#ifndef QNN_GPU_GRAPH_H
+#define QNN_GPU_GRAPH_H
+
+#ifdef __cplusplus
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif
+
+#include "QnnGraph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief An enum which defines the different tensor optimization options. A
+ *        tensor may be optimized to the specified QnnGpu_Precision_t when it
+ *        is a graph tensor that is not a graph input or a graph output and
+ *        does not connect two operations from different op packages.
+ */
+typedef enum {
+  /// Sets the precision mode to floating point 32-bit (FP32)
+  QNN_GPU_PRECISION_FP32 = 0,
+  /// Sets the precision mode to floating point 16-bit (FP16)
+  QNN_GPU_PRECISION_FP16 = 1,
+  /// Sets the precision mode to FP16 for storage and FP32 for calculations
+  QNN_GPU_PRECISION_HYBRID = 2,
+  /// Uses the tensor data type provided by the user (default)
+  QNN_GPU_PRECISION_USER_PROVIDED = 3,
+} QnnGpu_Precision_t;
+
+/**
+ * @brief A struct which defines the QNN GPU graph custom configuration options.
+ *        Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ */
+typedef struct {
+  QnnGpu_Precision_t precision;
+  uint8_t disableMemoryOptimizations;
+  uint8_t disableNodeOptimizations;
+  uint8_t disableQueueRecording;
+} QnnGpuGraph_CustomConfig_t;
+
+// clang-format off
+/// QnnGpuGraph_CustomConfig_t initializer macro
+#define QNN_GPU_GRAPH_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_GPU_PRECISION_USER_PROVIDED, /*precision*/ \
+    0u, /*disableMemoryOptimizations*/ \
+    0u, /*disableNodeOptimizations*/ \
+    0u /*disableQueueRecording*/ \
+  }
+// clang-format on
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h
new file mode 100755
index 0000000000000..1c6cd5c3e032a
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h
@@ -0,0 +1,52 @@
+//==============================================================================
+//
+// Copyright (c) 2024 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines the QNN GPU specialization of the QnnMem.h interface.
+ */
+
+#ifndef QNN_GPU_MEM_H
+#define QNN_GPU_MEM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* QnnGpuMem_Buffer_t;
+
+/**
+ * @brief This enum defines QNN GPU memory type
+ */
+typedef enum { QNN_GPU_MEM_OPENCL = 0, QNN_GPU_MEM_UNDEFINED = 0x7FFFFFF } QnnGpu_MemType_t;
+
+/**
+ * @brief A struct which defines the QNN GPU memory preallocated by the client.
+ *        Objects of this type are to be referenced through Qnn_MemInfoCustom_t.
+ */
+typedef struct {
+  QnnGpu_MemType_t memType;
+  union {
+    QnnGpuMem_Buffer_t buffer;
+  };
+} QnnGpu_MemInfoCustom_t;
+
+// clang-format off
+/// QnnGpu_MemInfoCustom_t initializer macro
+#define QNN_GPU_MEMINFO_CUSTOM_INIT \
+  { \
+    QNN_GPU_MEM_UNDEFINED, /*memType*/ \
+    NULL /* buffer*/ \
+  }
+// clang-format on
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h
new file mode 100755
index 0000000000000..5413f50ba2267
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h
@@ -0,0 +1,682 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief A header which defines the QNN GPU specialization of the QnnOpPackage.h interface.
+ */
+
+#ifndef QNN_GPU_OP_PACKAGE_H
+#define QNN_GPU_OP_PACKAGE_H
+
+#ifdef __cplusplus
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif
+
+#include "GPU/QnnGpuCommon.h"
+#include "GPU/QnnGpuGraph.h"
+#include "QnnOpPackage.h"
+#include "QnnTypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// QnnOpPackage_GlobalInfrastructure_t specialization.
+//============================================================================= + +/** + * @brief A struct which is used to communicate device constant properties + */ +typedef struct { + /// GPU device version string + char deviceVersion[128]; + /// GPU driver interface version {major, minor} + uint32_t interfaceVersion[2]; + /// GPU Adreno(TM) tier string + char tierName[8]; + /// GPU driver version {product, major, minor, patch} + uint32_t compilerVersion[4]; + /// GPU device max work group size + size_t maxWorkGroupSize; + /// GPU device image 2D max width + size_t image2dMaxWidth; + /// GPU device image 2D max height + size_t image2dMaxHeight; + /// GPU device max memory allocation size + size_t maxBufferAllocSize; + /// GPU device addr alignment in bits + uint32_t baseAddrAlignment; + /// GPU device image 2D Array max width + size_t image2dArrayMaxWidth; + /// GPU device image 2D Array max height + size_t image2dArrayMaxHeight; + /// GPU device image 2D Array max depth + size_t image2dArrayMaxDepth; +} QnnGpu_DeviceProperties_t; + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_GlobalInfrastructure_t + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + /// GPU backend version (as returned by QnnBackend_getApiVersion()) + const Qnn_ApiVersion_t* sdkApiVersion; + /// GPU device properties + const QnnGpu_DeviceProperties_t* deviceProperties; + /// Null terminated path to the OpenCL driver used by the backend + const char* driverPath; +} QnnGpuOpPackage_GlobalInfrastructure_t; + +//============================================================================= +// QnnOpPackage_PackageInfo_t specialization. +//============================================================================= + +/** + * @brief A struct having op package specific information + */ +typedef struct _QnnOpPackage_PackageInfo_t { + /// Null terminated hash key string of all kernel sources + const char* kernelRepoHash; +} QnnGpuOpPackage_PackageInfo_t; + +//============================================================================= +// QnnOpPackage_Optimization_t specialization. +//============================================================================= + +/** + * @brief An enum to specify the QNN GPU optimization type + * + */ +typedef enum { + /// Undefined option only used for QNN_GPU_OP_PACKAGE_OPTIMIZATION_INIT + QNN_GPU_OPTIMIZATION_TYPE_UNDEFINED = 0, + /// Super node optimization + QNN_GPU_OPTIMIZATION_TYPE_SUPER_NODE = 2, +} QnnGpuOpPackage_OptimizationType_t; + +/** + * @brief A struct representing a super node connection constraint. + */ +typedef struct { + /// Producer node corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations + uint32_t producer; + /// Output tensor index corresponding to the producer node + uint32_t producerOutputIndex; + /// Consumer node corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations + uint32_t consumer; + /// Output tensor index corresponding to the consumer node + uint32_t consumerInputIndex; +} QnnGpuOpPackage_SuperNodeConnectionConstraint_t; + +/** + * @brief An enum to specify the source of a tensor in an op def for a tensor constraint. + * + */ +typedef enum { + /// Tensor is an op def output + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_SOURCE_OUTPUT = 1, + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_SOURCE_INPUT = 2, +} QnnGpuOpPackage_TensorConstraintSource_t; + +/** + * @brief An enum to specify the tensor constraint type. + * + */ +typedef enum { + /// Add a Qnn_DataType_t to the whitelist of allowable types. 
+  /// If no data type constraint is present for a tensor, all data types are allowed.
+  QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_DATA_TYPE = 1,
+  /// Tensor must match its rank
+  QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_RANK = 2,
+  /// Tensor must match one of its dimensions
+  QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_DIMENSION = 3,
+  /// Add a Qnn_TensorType_t to the whitelist of allowable tensor types.
+  /// If no tensor type constraint is present for a tensor, all types are allowed.
+  QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_TENSOR_TYPE = 4,
+} QnnGpuOpPackage_TensorConstraintType_t;
+
+/**
+ * @brief A struct representing a tensor constraint.
+ */
+typedef struct {
+  /// Operation corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations
+  uint32_t operationIndex;
+  /// Source of the tensor in the Qnn_OpConfig_t
+  QnnGpuOpPackage_TensorConstraintSource_t source;
+  union {
+    /// Tensor index in the Qnn_OpConfig_t, used only for inputs and outputs
+    uint32_t index;
+    /// Tensor parameter name in the Qnn_OpConfig_t, used only for parameters
+    const char* name;
+  };
+  /// Type of tensor constraint
+  QnnGpuOpPackage_TensorConstraintType_t type;
+  union {
+    /// Tensor data type for Qnn_DataType_t constraints
+    Qnn_DataType_t dataType;
+    /// Tensor type for Qnn_TensorType_t constraints
+    Qnn_TensorType_t tensorType;
+    /// Tensor rank for rank constraints
+    uint32_t rank;
+    struct {
+      /// Tensor dimension index for dimension constraints
+      uint32_t index;
+      /// Tensor dimension size for dimension constraints
+      uint32_t size;
+    } dimension;
+  };
+} QnnGpuOpPackage_TensorConstraint_t;
+
+typedef struct {
+  /// Null-terminated array of comma separated lists of operations used for matching super node ops.
+  /// An asterisk (*) may be used to represent any operation type.
+  const char** operations;
+  /// Null-terminated array of pointers to super node connection constraints
+  QnnGpuOpPackage_SuperNodeConnectionConstraint_t** connectionConstraints;
+  /// Null-terminated array of pointers to super node tensor constraints
+  QnnGpuOpPackage_TensorConstraint_t** tensorConstraints;
+} QnnGpuOpPackage_SuperNodeOptimization_t;
+
+// clang-format off
+/// QnnGpuOpPackage_SuperNodeOptimization_t initializer macro
+#define QNN_GPU_OP_PACKAGE_SUPER_NODE_OPTIMIZATION_INIT \
+  { \
+    NULL, /*operations*/ \
+    NULL, /*connectionConstraints*/ \
+    NULL, /*tensorConstraints*/ \
+  }
+// clang-format on
+
+/**
+ * @brief A struct representing a QNN GPU optimization.
+ */
+typedef struct _QnnOpPackage_Optimization_t {
+  /// Type of optimization
+  QnnGpuOpPackage_OptimizationType_t type;
+  /// Op package assigned name of the optimization
+  const char* name;
+  union {
+    /// Super node optimization, used when type is QNN_GPU_OPTIMIZATION_TYPE_SUPER_NODE
+    const QnnGpuOpPackage_SuperNodeOptimization_t* superNode;
+  };
+} QnnGpuOpPackage_Optimization_t;
+
+/// QnnGpuOpPackage_Optimization_t initializer macro
+#define QNN_GPU_OP_PACKAGE_OPTIMIZATION_INIT \
+  { \
+    QNN_GPU_OPTIMIZATION_TYPE_UNDEFINED, NULL, { NULL } \
+  }
+
+//=============================================================================
+// QnnOpPackage_GraphInfrastructure_t specialization.
+//============================================================================= + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_GraphInfrastructure_t + */ +typedef struct _QnnOpPackage_GraphInfrastructure_t { + /// GPU precision mode, user-supplied hint used for optimal kernel selection + QnnGpu_Precision_t precisionMode; +} QnnGpuOpPackage_GraphInfrastructure_t; + +//============================================================================= +// QNN GPU Memory Object +//============================================================================= + +/** + * @brief An enum to specify the QNN GPU memory object type + * + */ +typedef enum { + /// Host memory, only used for Qnn_Param_t tensors + QNN_GPU_MEM_OBJ_TYPE_HOST = 0, + /// GPU driver buffer memory object + QNN_GPU_MEM_OBJ_TYPE_BUFFER = 1, + /// GPU driver image 2D memory object + QNN_GPU_MEM_OBJ_TYPE_IMAGE2D = 2, + /// GPU driver image 2D array memory object + QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY = 3, + /// Aggregation of GPU driver image 2D memory objects + QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D = 4, + /// Aggregation of GPU driver image 2D array memory objects + QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY = 5, + /// Memory type is unclaimed and can be specified by the op package via the \n + /// QnnGpu_OutputClaim_t struct + QNN_GPU_MEM_OBJ_TYPE_UNCLAIMED = 6, +} QnnGpu_MemoryObjectType_t; + +/** + * @brief An enum to specify the QNN GPU memory layout + * + */ +typedef enum { + /// HWC layout + QNN_GPU_MEM_LAYOUT_HWC = 0, + /// HCW layout + QNN_GPU_MEM_LAYOUT_HCW = 1, + /// CHW layout + QNN_GPU_MEM_LAYOUT_CHW = 2, + /// Undefined + QNN_GPU_MEM_LAYOUT_UNDEFINED = 0x7FFFFFFF, +} QnnGpu_MemoryLayout_t; + +/** + * @brief A struct to specify blockSize for weight Tensor and tensorId for weight Param tensor + */ +typedef struct { + // Block Quantization, block Sizes + uint32_t* bqBlockSize; + /// Tensor Id for Quantization encodings + uint32_t bqEncodingTensorId; +} QnnGpu_BlockEncodingInfo_t; + +// clang-format off +/// QnnGpu_MemoryObject_t initializer macro +#define QNN_GPU_BLOCK_ENCODING_INFO_INIT \ + { \ + NULL, /*bqBlockSize*/ \ + 0u /*bqEncodingTensorId*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specifying a memory object + * This struct is used with the following kernel argument types: + * - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ + * - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE + * - QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE + */ +typedef struct { + /// Type of memory object + QnnGpu_MemoryObjectType_t type; + /// Data type of the memory object + Qnn_DataType_t dataType; + /// Memory object dimensions \n + /// Size is numDimensions. Uses the following type dependent format: \n + /// QNN_GPU_MEM_OBJ_TYPE_BUFFER -> {numElements} \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D -> {height,width} \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY -> {height,width,array_size} \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D -> {num_batches,height,width} \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY -> {num_batches,height,width,array_size} + uint32_t* dimensions; + /// Memory object offsets \n + /// Size is numDimensions. \n + /// Indicates where the data store starts in the memory object. \n + uint32_t* offsets; + /// Number of dimensions in memory object \n + /// Size is numDimensions. 
Has the following type dependent size: \n + /// QNN_GPU_MEM_OBJ_TYPE_BUFFER -> 1 \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D -> 2 \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY -> 3 \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D -> 3 \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY -> 4 + uint32_t numDimensions; + /// Memory object layout \n + /// Op package specific layout identifier \n + /// Default is QNN_GPU_MEM_LAYOUT_UNDEFINED if not already specified by a prior operation + QnnGpu_MemoryLayout_t layout; + /// Block Quantization Tensor Information + QnnGpu_BlockEncodingInfo_t blockEncodingInfo; +} QnnGpu_MemoryObject_t; + +// clang-format off +/// QnnGpu_MemoryObject_t initializer macro +#define QNN_GPU_MEMORY_OBJECT_INIT \ + { \ + QNN_GPU_MEM_OBJ_TYPE_UNCLAIMED, /*type*/ \ + QNN_DATATYPE_UNDEFINED, /*dataType*/ \ + NULL, /*dimensions*/ \ + NULL, /*offsets*/ \ + 0u, /*numDimensions*/ \ + QNN_GPU_MEM_LAYOUT_UNDEFINED, /*layout*/ \ + QNN_GPU_BLOCK_ENCODING_INFO_INIT /*blockEncodingInfo*/ \ + } +// clang-format on + +//============================================================================= +// QnnOpPackage_Node_t specialization. +//============================================================================= + +/** + * @brief A QNN GPU struct specifying a storage tensor + */ +typedef struct { + /// Tensor ID + uint32_t id; + /// Tensor's associated memory object + const QnnGpu_MemoryObject_t* memoryObject; +} QnnGpu_TensorStorageType_t; + +// clang-format off +/// QnnGpu_TensorStorageType_t initializer macro +#define QNN_GPU_TENSOR_STORAGE_TYPE_INIT \ + { \ + 0u, /*id*/ \ + NULL /*memoryObject*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_Node_t + */ +typedef struct _QnnOpPackage_Node_t { + /// Optimization index, see QnnOpPackage_Info_t, ignore when only one op config provided + uint32_t optimization; + /// Null-terminated array of operation config pointers + /// Only one pointer provided when no optimizations performed + const Qnn_OpConfig_t** configs; + /// Null-terminated array of tensor storage type pointers called out in the config + const QnnGpu_TensorStorageType_t** storageTypes; + /// Kernel variant index, if set then used by OpPackage to determine kernel selection + int32_t kernelVariant; +} QnnGpuOpPackage_Node_t; + +//============================================================================= +// QnnOpPackage_OpImpl_t specialization. +//============================================================================= + +/** + * @brief A QNN GPU struct specifying an output tensor claim. Using the principle + * of least work, operations must output a memory object type that is most + * convenient for itself. Only QNN_TENSOR_TYPE_NATIVE tensor types may + * be claimed. + */ +typedef struct { + /// Index into the Qnn_OpConfig_t provided in QnnGpuOpPackage_Node_t + uint32_t opConfigIndex; + /// Index into the operation outputs to identify the tensor + uint32_t outputIndex; + /// Specification of the claimed memory object + const QnnGpu_MemoryObject_t* memoryObject; +} QnnGpu_OutputClaim_t; + +// clang-format off +/// QnnGpu_OutputClaim_t initializer macro +#define QNN_GPU_OUTPUT_CLAIM_INIT \ + { \ + 0u, /*opConfigIndex*/ \ + 0u, /*outputIndex*/ \ + NULL /*memoryObject*/ \ + } +// clang-format on + +/** + * @brief An enum to specify the kernel argument type. 
+ *
+ */
+typedef enum {
+  /// Operation input tensor used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ = 0,
+  /// Operation input tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE = 1,
+  /// Operation output tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE = 2,
+  /// Operation internal tensor used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ = 3,
+  /// Operation internal tensor used as kernel input/output
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE = 4,
+  /// Operation internal tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE = 5,
+  /// Plain old data kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_DATA = 6,
+  /// Local memory kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_LOCAL = 7,
+  /// Null pointer kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR = 8,
+  /// Operation tensor parameter used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_OP_TENSOR_PARAM = 9,
+} QnnGpu_KernelArgType_t;
+
+/**
+ * @brief A QNN GPU struct specifying a kernel argument corresponding to a tensor.
+ *        This struct is used with the following kernel argument types:
+ *          - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ
+ *          - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE
+ *          - QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE
+ *          - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ
+ *          - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE
+ *          - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE
+ */
+typedef struct {
+  /// Index into the Qnn_OpConfig_t provided in QnnGpuOpPackage_Node_t, ignored for INTERNAL types
+  uint32_t opConfigIndex;
+  /// Index into the operation input or output list or the internal tensor list
+  uint32_t tensorIndex;
+  /// Batch element index for aggregated tensor types
+  uint32_t element;
+} QnnGpu_TensorKernelArg_t;
+
+// clang-format off
+/// QnnGpu_TensorKernelArg_t initializer macro
+#define QNN_GPU_TENSOR_KERNEL_ARG_INIT \
+  { \
+    0u, /*opConfigIndex*/ \
+    0u, /*tensorIndex*/ \
+    0u /*element*/ \
+  }
+// clang-format on
+
+/**
+ * @brief An enum to specify the kernel data argument type.
+ *
+ */
+typedef enum {
+  QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR = 0,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_UCHAR = 1,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_SHORT = 2,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_USHORT = 3,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_INT = 4,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_UINT = 5,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_LONG = 6,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_ULONG = 7,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT = 8,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_DOUBLE = 9,
+} QnnGpu_DataKernelArgType_t;
+
+/**
+ * @brief A QNN GPU struct specifying a kernel argument corresponding to a plain old data.
+ *        This struct is used only with the QNN_GPU_KERNEL_ARG_TYPE_DATA arg type.
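+ *
+ * Editor's illustrative sketch (not part of the original header), using the
+ * initializer macro defined below:
+ * @code
+ *   QnnGpu_DataKernelArg_t arg = QNN_GPU_DATA_KERNEL_ARG_INIT;
+ *   arg.type     = QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT;
+ *   arg.qnnFloat = 1.0f;
+ * @endcode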
+ */ +typedef struct { + /// Data type of the data + QnnGpu_DataKernelArgType_t type; + union { + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR + int8_t qnnChar; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_UCHAR + uint8_t qnnUChar; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_SHORT + int16_t qnnShort; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_USHORT + uint16_t qnnUShort; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_INT + int32_t qnnInt; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_UINT + uint32_t qnnUInt; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_LONG + int64_t qnnLong; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_ULONG + uint64_t qnnULong; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT + float qnnFloat; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_DOUBLE + double qnnDouble; + }; +} QnnGpu_DataKernelArg_t; + +/// QnnGpu_DataKernelArg_t initializer macro +#define QNN_GPU_DATA_KERNEL_ARG_INIT \ + { \ + QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR, /*type*/ \ + { \ + 0 /*qnnChar*/ \ + } \ + } + +/** + * @brief A QNN GPU struct specifying a kernel argument corresponding to a local memory type. + * This struct is used only with the QNN_GPU_KERNEL_ARG_TYPE_LOCAL arg type. + */ +typedef struct { + /// Size of the memory requested in bytes + uint32_t size; +} QnnGpu_LocalKernelArg_t; + +/// QnnGpu_LocalKernelArg_t initializer macro +#define QNN_GPU_LOCAL_KERNEL_ARG_INIT \ + { 0u /*size*/ } + +/** + * @brief A QNN GPU struct specifying a kernel argument. + * Note that the QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR type does not have an entry in + * the union. + */ +typedef struct { + /// Type of kernel argument + QnnGpu_KernelArgType_t type; + union { + /// Tensor type argument + QnnGpu_TensorKernelArg_t tensor; + /// Plain old data argument + QnnGpu_DataKernelArg_t data; + /// Local memory argument + QnnGpu_LocalKernelArg_t local; + }; +} QnnGpu_KernelArg_t; + +/// QnnGpu_KernelArg_t initializer macro +#define QNN_GPU_KERNEL_ARG_INIT \ + { \ + QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR, /*type*/ \ + { \ + QNN_GPU_TENSOR_KERNEL_ARG_INIT /*tensor*/ \ + } \ + } + +/** + * @brief An enum to specify the kernel source type. + * + */ +typedef enum { + QNN_GPU_KERNEL_SOURCE_TYPE_TEXT = 0, + QNN_GPU_KERNEL_SOURCE_TYPE_BINARY = 1, +} QnnGpu_KernelSourceType_t; + +/** + * @brief This enum defines QNN GPU kernel tuning options. + */ +typedef enum { + /// local work size tuning + QNN_GPU_KERNEL_TUNING_LOCAL_WORK_SIZE = 0, + QNN_GPU_KERNEL_TUNING_UNDEFINED = 0x7FFFFFFF +} QnnGpu_KernelTuningOption_t; + +/** + * @brief This struct provides local-work-size tuning configuration. + */ +typedef struct { + uint32_t minValue[3]; + uint32_t maxValue[3]; + uint32_t stepSize[3]; +} QnnGpu_KernelLocalWorkSizeTuning_t; + +/** + * @brief This struct provides QNN GPU kernel tuning configuration. + */ +typedef struct { + QnnGpu_KernelTuningOption_t option; + union UNNAMED { + QnnGpu_KernelLocalWorkSizeTuning_t lws; + }; +} QnnGpu_KernelTuningConfig_t; + +/** + * @brief A QNN GPU struct specifying a kernel. 
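+ *        A hedged example of populating the struct for a trivial OpenCL-style
+ *        kernel; the source string, work sizes and empty argument list are
+ *        assumptions for illustration only:
+ * @code
+ *   static const char src[] = "__kernel void noop(__global float* out) {}";
+ *   QnnGpu_Kernel_t k    = QNN_GPU_KERNEL_INIT;
+ *   k.kernelSource       = src;
+ *   k.sourceLength       = sizeof(src);
+ *   k.sourceType         = QNN_GPU_KERNEL_SOURCE_TYPE_TEXT;
+ *   k.name               = "noop";
+ *   k.globalWorkDim      = 1u;
+ *   k.globalWorkSizes[0] = 1024u;
+ *   // k.args would point to a null-terminated QnnGpu_KernelArg_t* array
+ * @endcode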
+ */ +typedef struct { + /// Kernel source code or binary + const void* kernelSource; + /// Length of kernel source/binary in bytes + size_t sourceLength; + /// Type of kernel source + QnnGpu_KernelSourceType_t sourceType; + /// Null terminated build options string used for kernel compilation + const char* buildOptions; + /// Rank of the globalWorkSizes + size_t globalWorkDim; + /// Global work sizes used by enqueuing the kernel + size_t globalWorkSizes[3]; + /// Rank of the localWorkSizes + size_t localWorkDim; + /// Local work sizes used by enqueuing the kernel + size_t localWorkSizes[3]; + /// Null-terminated array of kernel arguments in the order they appear in the kernel function + QnnGpu_KernelArg_t** args; + /// Null terminated name of the kernel + const char* name; + /// If non-zero, kernel will be enqueued during execute even if it is static + uint32_t isDynamic; + /// Null-terminated array to provide kernel tuning configurations. + QnnGpu_KernelTuningConfig_t** tuningConfigs; + /// Reserved field, must be null + void* reserved; +} QnnGpu_Kernel_t; + +// clang-format off +/// QnnGpu_Kernel_t initializer macro +#define QNN_GPU_KERNEL_INIT \ + { \ + NULL, /*kernelSource*/ \ + 0u, /*sourceLength*/ \ + QNN_GPU_KERNEL_SOURCE_TYPE_TEXT, /*sourceType*/ \ + NULL, /*buildOptions*/ \ + 0u, /*globalWorkDim*/ \ + {0u}, /*globalWorkSizes*/ \ + 0u, /*localWorkDim*/ \ + {0u}, /*localWorkSizes*/ \ + NULL, /*args*/ \ + NULL, /*name*/ \ + 0u, /*isDynamic*/ \ + NULL, /*tuningConfigs*/ \ + NULL /*reserved*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specifying an operation. + */ +typedef struct _QnnOpPackage_OpImpl_t { + /// Null-terminated array of output claims + QnnGpu_OutputClaim_t** outputClaims; + /// Null-terminated array of tensor requests + QnnGpu_MemoryObject_t** memoryObjects; + /// Null-terminated array of kernels + QnnGpu_Kernel_t** kernels; +} QnnGpu_Operation_t; + +// clang-format off +/// QnnGpu_Operation_t initializer macro +#define QNN_GPU_OPERATION_INIT \ + { \ + NULL, /*outputClaims*/ \ + NULL, /*memoryObjects*/ \ + NULL, /*kernels*/ \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h new file mode 100755 index 0000000000000..3adb43819b8b3 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h @@ -0,0 +1,50 @@ +//============================================================================= +// +// Copyright (c) 2024 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN GenAiTransformer Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for the GenAiTransformer backend
+ */
+
+#ifndef QNN_GENAI_TRANSFORMER_COMMON_H
+#define QNN_GENAI_TRANSFORMER_COMMON_H
+
+#include "QnnCommon.h"
+
+/// GenAiTransformer Backend identifier
+#define QNN_BACKEND_ID_GENAI_TRANSFORMER 14
+
+/// GenAiTransformer interface provider
+#define QNN_GENAI_TRANSFORMER_INTERFACE_PROVIDER_NAME "GENAI_TRANSFORMER_QTI_AISW"
+
+// GenAiTransformer API Version values
+#define QNN_GENAI_TRANSFORMER_API_VERSION_MAJOR 1
+#define QNN_GENAI_TRANSFORMER_API_VERSION_MINOR 0
+#define QNN_GENAI_TRANSFORMER_API_VERSION_PATCH 0
+
+// clang-format off
+/// Macro to set Qnn_ApiVersion_t for GENAI_TRANSFORMER backend
+#define QNN_GENAI_TRANSFORMER_API_VERSION_INIT                             \
+  {                                                                        \
+    {                                                                      \
+      QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/                      \
+      QNN_API_VERSION_MINOR, /*coreApiVersion.minor*/                      \
+      QNN_API_VERSION_PATCH  /*coreApiVersion.patch*/                      \
+    },                                                                     \
+    {                                                                      \
+      QNN_GENAI_TRANSFORMER_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_GENAI_TRANSFORMER_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_GENAI_TRANSFORMER_API_VERSION_PATCH  /*backendApiVersion.patch*/ \
+    }                                                                      \
+  }
+
+// clang-format on
+
+#endif // QNN_GENAI_TRANSFORMER_COMMON_H
\ No newline at end of file
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h
new file mode 100755
index 0000000000000..e756b8042ec09
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h
@@ -0,0 +1,76 @@
+//=============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTA component Backend API.
+ *
+ *         The interfaces in this file work with the top level QNN
+ *         API and supplement QnnBackend.h for the HTA backend
+ */
+
+#ifndef QNN_HTA_BACKEND_H
+#define QNN_HTA_BACKEND_H
+
+#include "QnnBackend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/* @brief Enum describing the set of features supported by the HTA backend.
+   This is used as a bitmask, so assign a unique bit to each entry.
+*/
+typedef enum {
+  /// The accelerator will always attempt to fold a relu activation
+  /// into the immediately preceding convolution operation. This optimization
+  /// is correct when the quantization ranges for the convolution are equal
+  /// to or a subset of those of the Relu operation.
+  /// For graphs where this cannot be
+  /// guaranteed, the client should set this flag.
+  QNN_HTA_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 1 << 0,
+  /// UNKNOWN enum value that must not be used
+  QNN_HTA_BACKEND_FEATURES_UNKNOWN = 0x7fffffff
+} QnnHtaBackend_Features_t;
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/**
+ * @brief Structure describing the set of configurations supported by the backend.
+ *        Objects of this type are to be referenced through QnnBackend_CustomConfig_t.
+ */
+typedef struct {
+  /// field to save the features that are passed
+  /// via QnnHtaBackend_Features_t
+  uint32_t bitmaskFeatures;
+} QnnHtaBackend_CustomConfig_t;
+
+/// QnnHtaBackend_CustomConfig_t initializer macro
+#define QNN_HTA_BACKEND_CUSTOM_CONFIG_INIT \
+  { 0 /*bitmaskFeatures*/ }
+
+// clang-format on
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h
new file mode 100755
index 0000000000000..1eb8e1f0a99a4
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h
@@ -0,0 +1,62 @@
+//=============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTA Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for the HTA backend
+ */
+
+#ifndef QNN_HTA_COMMON_H
+#define QNN_HTA_COMMON_H
+
+#include "QnnCommon.h"
+
+/// HTA Backend identifier
+#define QNN_BACKEND_ID_HTA 7
+
+/// HTA interface provider
+#define QNN_HTA_INTERFACE_PROVIDER_NAME "HTA_QTI_AISW"
+
+// HTA API Version values
+
+#define QNN_HTA_API_VERSION_MAJOR 2
+#define QNN_HTA_API_VERSION_MINOR 0
+#define QNN_HTA_API_VERSION_PATCH 0
+
+// clang-format off
+
+/// Macro to set Qnn_ApiVersion_t for HTA backend
+#define QNN_HTA_API_VERSION_INIT                             \
+  {                                                          \
+    {                                                        \
+      QNN_API_VERSION_MAJOR,     /*coreApiVersion.major*/    \
+      QNN_API_VERSION_MINOR,     /*coreApiVersion.minor*/    \
+      QNN_API_VERSION_PATCH      /*coreApiVersion.patch*/    \
+    },                                                       \
+    {                                                        \
+      QNN_HTA_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_HTA_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_HTA_API_VERSION_PATCH  /*backendApiVersion.patch*/ \
+    }                                                        \
+  }
+
+// clang-format on
+
+// HTA Binary Version values
+#define QNN_HTA_BINARY_VERSION_MAJOR 2
+#define QNN_HTA_BINARY_VERSION_MINOR 0
+#define QNN_HTA_BINARY_VERSION_PATCH 0
+
+// HTA Context blob Version values
+#define QNN_HTA_CONTEXT_BLOB_VERSION_MAJOR 1
+#define QNN_HTA_CONTEXT_BLOB_VERSION_MINOR 1
+#define QNN_HTA_CONTEXT_BLOB_VERSION_PATCH 0
+
+#endif // QNN_HTA_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h
new file mode 100755
index 0000000000000..d31f5232e21f3
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h
@@ -0,0 +1,41 @@
+//=============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTA component Device API.
+ *
+ *         The interfaces in this file work with the top level QNN
+ *         API and supplement QnnDevice.h for the HTA backend
+ */
+#ifndef QNN_HTA_DEVICE_H
+#define QNN_HTA_DEVICE_H
+
+#include "QnnDevice.h"
+#include "QnnHtaPerfInfrastructure.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _QnnDevice_Infrastructure_t {
+  QnnHtaPerfInfrastructure_SetPowerConfigFn_t setPowerConfig;
+} QnnHtaDevice_Infrastructure_t;
+
+// clang-format off
+/// QnnHtaDevice_Infrastructure_t initializer macro
+#define QNN_HTA_DEVICE_INFRASTRUCTURE_INIT \
+  {                                        \
+    NULL, /*setPowerConfig*/               \
+  }
+// clang-format on
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
\ No newline at end of file
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h
new file mode 100755
index 0000000000000..0abbb9bc5114d
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h
@@ -0,0 +1,123 @@
+//=============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTA component Graph API.
+ *
+ *         The interfaces in this file work with the top level QNN
+ *         API and supplement QnnGraph.h for the HTA backend
+ */
+
+#ifndef QNN_HTA_GRAPH_H
+#define QNN_HTA_GRAPH_H
+
+#include "QnnGraph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief This enum provides different HTA graph optimization
+ *        options that can be used to finalize the graph
+ *        for optimum performance
+ */
+typedef enum QnnHtaGraph_OptimizationType {
+  QNN_HTA_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD = 1,
+  QNN_HTA_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES   = 2,
+  QNN_HTA_GRAPH_OPTIMIZATION_TYPE_UNKNOWN            = 0x7fffffff
+} QnnHtaGraph_OptimizationType_t;
+
+/* @brief Struct describing the set of optimization types
+ *        and the value associated with each optimization
+ */
+typedef struct QnnHtaGraph_OptimizationOption {
+  QnnHtaGraph_OptimizationType_t type;
+  float floatValue;
+} QnnHtaGraph_OptimizationOption_t;
+
+// clang-format off
+/// QnnHtaGraph_OptimizationOption_t initializer macro
+#define QNN_HTA_GRAPH_OPTIMIZATION_OPTION_INIT              \
+  {                                                         \
+    QNN_HTA_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/       \
+    0.0f                                     /*floatValue*/ \
+  }
+// clang-format on
+
+/**
+ * @brief This enum provides different HTA graph configuration
+ *        options associated with QnnGraph
+ */
+typedef enum QnnHtaGraph_ConfigOption {
+  QNN_HTA_GRAPH_CONFIG_OPTION_OPTIMIZATION = 1,
+  QNN_HTA_GRAPH_CONFIG_OPTION_PRIORITY     = 2,
+  QNN_HTA_GRAPH_CONFIG_OPTION_UNKNOWN      = 0x7fffffff
+} QnnHtaGraph_ConfigOption_t;
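+
+/**
+ * Illustrative usage sketch (not part of the original header; the threshold
+ * value is an assumption chosen for demonstration): an optimization option
+ * is populated first and then wrapped in the graph custom config defined
+ * below.
+ * @code
+ *   QnnHtaGraph_OptimizationOption_t opt = QNN_HTA_GRAPH_OPTIMIZATION_OPTION_INIT;
+ *   opt.type       = QNN_HTA_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD;
+ *   opt.floatValue = 0.5f;  // assumed threshold, not an SDK-documented value
+ * @endcode
+ */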
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/**
+ * @brief Structure describing the set of configurations supported by graph.
+ *        Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ *
+ *        The struct has two fields - option and a union of corresponding config values.
+ *        Based on the option, the corresponding item in the union can be used to
+ *        specify the config.
+ *        Below is the Map between QnnHtaGraph_ConfigOption_t and config value
+ *
+ * \verbatim embed:rst:leading-asterisk
+ * +----+------------------------------------------+------------------------------------+
+ * | #  | Config Option                            | Configuration Struct/value         |
+ * +====+==========================================+====================================+
+ * | 1  | QNN_HTA_GRAPH_CONFIG_OPTION_OPTIMIZATION | QnnHtaGraph_OptimizationOption_t   |
+ * +----+------------------------------------------+------------------------------------+
+ * | 2  | QNN_HTA_GRAPH_CONFIG_OPTION_PRIORITY     | Qnn_Priority_t                     |
+ * +----+------------------------------------------+------------------------------------+
+ * \endverbatim
+ */
+typedef struct {
+  QnnHtaGraph_ConfigOption_t option;
+  union {
+    QnnHtaGraph_OptimizationOption_t optimizationOption;
+    Qnn_Priority_t priority;
+  };
+} QnnHtaGraph_CustomConfig_t;
+
+
+/// QnnHtaGraph_CustomConfig_t initializer macro
+#define QNN_HTA_GRAPH_CUSTOM_CONFIG_INIT                            \
+  {                                                                 \
+    QNN_HTA_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/                 \
+    {                                                               \
+      QNN_HTA_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \
+    }                                                               \
+  }
+
+// clang-format on
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h
new file mode 100755
index 0000000000000..4f6e0c22c274b
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h
@@ -0,0 +1,134 @@
+//==============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/** @file
+ *  @brief QNN HTA component Performance Infrastructure API
+ *
+ *         Provides an interface for the client to control performance and system
+ *         settings of the QNN HTA Accelerator
+ */
+
+#ifndef QNN_HTA_PERF_INFRASTRUCTURE_H
+#define QNN_HTA_PERF_INFRASTRUCTURE_H
+
+#include "QnnCommon.h"
+#include "QnnTypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief QNN HTA PerfInfrastructure API result / error codes.
+ *
+ */
+typedef enum {
+  QNN_HTA_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE,
+  ////////////////////////////////////////////////////////////////////////
+
+  QNN_HTA_PERF_INFRASTRUCTURE_NO_ERROR                 = QNN_SUCCESS,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT      = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_TRANSPORT          = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3,
+
+  ////////////////////////////////////////////////////////////////////////
+  QNN_HTA_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE
+} QnnHtaPerfInfrastructure_Error_t;
+
+/**
+ * @brief This enum defines all the possible performance
+ *        options in the HTA Performance Infrastructure that
+ *        relate to setting up power levels
+ */
+typedef enum {
+  /// This config option implies the usage of the powerModeConfig struct;
+  /// if not provided, it is used as a type identifier
+  QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_POWER_MODE = 1,
+  /// UNKNOWN config option which must not be used
+  QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff
+} QnnHtaPerfInfrastructure_PowerConfigOption_t;
+
+/**
+ * @brief This enum defines all the possible power modes
+ *        that a client can set
+ */
+typedef enum {
+  /// default mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_DEFAULT = 0,
+  /// low power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_LOW_POWER_SAVER = 1,
+  /// power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER = 2,
+  /// high power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_HIGH_POWER_SAVER = 3,
+  /// balanced mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_BALANCED = 4,
+  /// high performance mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_HIGH_PERFORMANCE = 5,
+  /// burst mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_BURST = 6,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnHtaPerfInfrastructure_PowerMode_t;
+
+/**
+ * @brief This struct provides performance infrastructure configuration
+ *        associated with setting up of power levels
+ */
+typedef struct {
+  QnnHtaPerfInfrastructure_PowerConfigOption_t config;
+  // Organized as a union for future expansion flexibility, selected by PowerConfigOption_t
+  union {
+    QnnHtaPerfInfrastructure_PowerMode_t powerModeConfig;
+  };
+} QnnHtaPerfInfrastructure_PowerConfig_t;
+
+/// QnnHtaPerfInfrastructure_PowerConfig_t initializer macro
+#define QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT                   \
+  {                                                                     \
+    QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/  \
+    {                                                                   \
+      QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN /*powerModeConfig*/ \
+    }                                                                   \
+  }
+
+//=============================================================================
+// API Methods
+//=============================================================================
+
+/**
+ * @brief This API allows a client to set up the system power configuration that
+ *        will enable different performance modes.
+ *
+ * @param[in] clientId A power client id to associate calls to system
+ *            power settings. A value of 0 implies a NULL power client id
+ *            and can override every other setting in the user process. To
+ *            enable power settings for multiple clients in the same
+ *            process, use a non-zero power client id.
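+ *
+ *            A hedged calling sketch (the variable names and chosen power
+ *            mode are illustrative assumptions; the function pointer comes
+ *            from QnnHtaDevice_Infrastructure_t):
+ * @code
+ *   QnnHtaPerfInfrastructure_PowerConfig_t cfg = QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
+ *   cfg.config          = QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_POWER_MODE;
+ *   cfg.powerModeConfig = QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_BURST;
+ *   const QnnHtaPerfInfrastructure_PowerConfig_t* cfgs[] = {&cfg, NULL};
+ *   Qnn_ErrorHandle_t err = htaInfra.setPowerConfig(0 /*clientId*/, cfgs);
+ * @endcode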
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config options for performance configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtaPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t clientId, const QnnHtaPerfInfrastructure_PowerConfig_t** config);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // QNN_HTA_PERF_INFRASTRUCTURE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h
new file mode 100755
index 0000000000000..f069dbbedf6b7
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h
@@ -0,0 +1,199 @@
+//==============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN HTA Profile component API.
+ *
+ *        Requires the HTA backend to be initialized.
+ *        Should be used with the QnnProfile API but has HTA-backend-specific
+ *        definitions for different QnnProfile data structures
+ *
+ */
+
+#ifndef QNN_HTA_PROFILE_H
+#define QNN_HTA_PROFILE_H
+
+#include "QnnProfile.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_TIME_MICROSEC 1002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HTA_TIME_MICROSEC 1003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to create the context on the
+ *        accelerator when the client invokes QnnContext_createFromBinary.
+ *        The value returned is time in microseconds.
+ *
+ * @note context load binary accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_TIME_MICROSEC 2001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HTA_TIME_MICROSEC 2002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to finalizing the graph on the accelerator
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph execute host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_TIME_MICROSEC 3001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph execute HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HTA_TIME_MICROSEC 3002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the number of processor cycles taken.
+ *
+ * @note graph execute accelerator time may be available only on
+ *       QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE.
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the time taken in microseconds
+ *
+ * @note graph execute accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC.
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken for miscellaneous work, i.e. time
+ *        that cannot be attributed to a node but is still needed to
+ *        execute the graph on the accelerator. This occurs when the client invokes
+ *        QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the time taken in microseconds
+ *
+ * @note graph execute misc accelerator time is available only on
+ *       QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnContext_free, which consequently deinitializes
+ *        the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_TIME_MICROSEC 4001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnContext_free, which consequently deinitializes
+ *        the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_HTA_TIME_MICROSEC 4002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to deinitialize the graph on the
+ *        accelerator when the client invokes QnnContext_free, which consequently
+ *        deinitializes the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // QNN_HTA_PROFILE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h
new file mode 100755
index 0000000000000..8b1d458a04b8e
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h
@@ -0,0 +1,98 @@
+//=============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTP Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for the HTP backend
+ */
+
+#ifndef QNN_HTP_COMMON_H
+#define QNN_HTP_COMMON_H
+
+#include "QnnCommon.h"
+
+/// HTP Backend identifier
+#define QNN_BACKEND_ID_HTP 6
+
+/// HTP interface provider
+#define QNN_HTP_INTERFACE_PROVIDER_NAME "HTP_QTI_AISW"
+
+// HTP API Version values
+#define QNN_HTP_API_VERSION_MAJOR 5
+#define QNN_HTP_API_VERSION_MINOR 34
+#define QNN_HTP_API_VERSION_PATCH 0
+
+// clang-format off
+
+/// Macro to set Qnn_ApiVersion_t for HTP backend
+#define QNN_HTP_API_VERSION_INIT                             \
+  {                                                          \
+    {                                                        \
+      QNN_API_VERSION_MAJOR,     /*coreApiVersion.major*/    \
+      QNN_API_VERSION_MINOR,     /*coreApiVersion.minor*/    \
+      QNN_API_VERSION_PATCH      /*coreApiVersion.patch*/    \
+    },                                                       \
+    {                                                        \
+      QNN_HTP_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_HTP_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_HTP_API_VERSION_PATCH  /*backendApiVersion.patch*/ \
+    }                                                        \
+  }
+
+// clang-format on
+
+// DSP Context blob Version values
+#define QNN_HTP_CONTEXT_BLOB_VERSION_MAJOR 3
+#define QNN_HTP_CONTEXT_BLOB_VERSION_MINOR 2
+#define QNN_HTP_CONTEXT_BLOB_VERSION_PATCH 3
+
+/* ==== CDSP Security Library Versioning ==== */
+/* ==== This information is only intended for OEMs ==== */
+
+/* Security versioning for DSP libraries is supported V73 onwards */
+#define QNN_HTP_NATIVE_LIB_SECURITY_VERSIONING_MIN_ARCH 73
+
+/* Here we will define CDSP library versions for different targets
+ * Version is increased whenever there is a security fix from CDSP
+ * The versioning will start from 1.0.0 for each new target
+ * */
+
+/* V73 Security Issues:
+ * List of security issues fixed for V73 and the fixed version
+ * */
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V75 Security Issues:
+ * List of security issues fixed for V75 and the fixed version
+ * */
+// HTP Native library version values for V75
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V79 Security Issues:
+ * List of security issues fixed for V79 and the fixed version
+ * */
+// HTP Native library version values for V79
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V81 Security Issues:
+ * List of security issues fixed for V81 and the fixed version
+ * */
+// HTP Native library version values for V81
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+#endif // QNN_HTP_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h
new file mode 100755
index 0000000000000..8266817e2dc41
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h
@@ -0,0 +1,164 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All rights reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN HTP component Context API.
+ *
+ *        The interfaces in this file work with the top level QNN
+ *        API and supplement QnnContext.h for the HTP backend
+ */
+
+#ifndef QNN_HTP_CONTEXT_H
+#define QNN_HTP_CONTEXT_H
+
+#include "QnnContext.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief This enum provides different HTP context configuration
+ *        options associated with QnnContext
+ */
+typedef enum {
+  QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED            = 1,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_REGISTER_MULTI_CONTEXTS           = 2,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_FILE_READ_MEMORY_BUDGET           = 3,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_DSP_MEMORY_PROFILING_ENABLED      = 4,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_SHARE_RESOURCES                   = 5,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_IO_MEM_ESTIMATION                 = 6,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_PREPARE_ONLY                      = 7,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_INIT_ACCELERATION                 = 8,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_SKIP_VALIDATION_ON_BINARY_SECTION = 9,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_UNKNOWN                           = 0x7fffffff
+} QnnHtpContext_ConfigOption_t;
+
+typedef struct {
+  // Handle referring to the first context associated with a group. When a new
+  // group is to be registered, the following value must be 0.
+  Qnn_ContextHandle_t firstGroupHandle;
+  // Max spill-fill buffer to be allocated for the group of contexts, in bytes.
+  // The value that is passed during the registration of the first context to
+  // a group is taken. Subsequent configuration of this value is disregarded.
+  uint64_t maxSpillFillBuffer;
+} QnnHtpContext_GroupRegistration_t;
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/**
+ * @brief Structure describing the set of configurations supported by context.
+ *        Objects of this type are to be referenced through QnnContext_CustomConfig_t.
+ *
+ *        The struct has two fields - option and a union of config values.
+ *        Based on the option, the corresponding item in the union can be used to
+ *        specify the config.
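+ *
+ *        For example, enabling weight sharing (an illustrative sketch; whether
+ *        this option applies is target- and use-case-dependent):
+ * @code
+ *   QnnHtpContext_CustomConfig_t cfg = QNN_HTP_CONTEXT_CUSTOM_CONFIG_INIT;
+ *   cfg.option               = QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED;
+ *   cfg.weightSharingEnabled = true;
+ * @endcode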
+ *
+ *        Below is the Map between QnnHtpContext_CustomConfig_t and config value
+ *
+ * \verbatim embed:rst:leading-asterisk
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | #  | Config Option                                                    | Configuration Struct/value          |
+ * +====+==================================================================+=====================================+
+ * | 1  | QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED             | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 2  | QNN_HTP_CONTEXT_CONFIG_OPTION_REGISTER_MULTI_CONTEXTS            | QnnHtpContext_GroupRegistration_t   |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 3  | QNN_HTP_CONTEXT_CONFIG_OPTION_FILE_READ_MEMORY_BUDGET            | uint64_t                            |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 4  | QNN_HTP_CONTEXT_CONFIG_OPTION_DSP_MEMORY_PROFILING_ENABLED       | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 5  | QNN_HTP_CONTEXT_CONFIG_OPTION_SHARE_RESOURCES                    | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 6  | QNN_HTP_CONTEXT_CONFIG_OPTION_IO_MEM_ESTIMATION                  | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 7  | QNN_HTP_CONTEXT_CONFIG_OPTION_PREPARE_ONLY                       | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 8  | QNN_HTP_CONTEXT_CONFIG_OPTION_INIT_ACCELERATION                  | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * | 9  | QNN_HTP_CONTEXT_CONFIG_OPTION_SKIP_VALIDATION_ON_BINARY_SECTION  | bool                                |
+ * +----+------------------------------------------------------------------+-------------------------------------+
+ * \endverbatim
+ */
+typedef struct QnnHtpContext_CustomConfig {
+  QnnHtpContext_ConfigOption_t option;
+  union UNNAMED {
+    // This field sets weight sharing, which is false by default
+    bool weightSharingEnabled;
+    QnnHtpContext_GroupRegistration_t groupRegistration;
+    // - Init time may be impacted depending on the value set below
+    // - Value should be greater than 0 and less than or equal to the file size
+    // - If set to 0, the feature is not utilized
+    // - If set to greater than the file size, min(fileSize, fileReadMemoryBudgetInMb) is used
+    // - As an example, if value 2 is passed, it would translate to (2 * 1024 * 1024) bytes
+    uint64_t fileReadMemoryBudgetInMb;
+    bool dspMemoryProfilingEnabled;
+    // This field enables resource sharing across different contexts, enhancing RAM and virtual
+    // address (VA) space utilization. When this flag is activated, graphs are expected to execute
+    // sequentially. Note that this configuration option is only supported when using the
+    // QnnContext_createFromBinaryListAsync API.
+    bool shareResources;
+    // This field enables I/O memory estimation during the QnnContext_createFromBinary API when
+    // multiple PDs are available. When enabled, it estimates the total size of the I/O tensors
+    // required by the context to ensure sufficient space on the PD before deserialization.
+    // This feature helps with memory registration failures in large models.
+    // Note that enabling this feature increases peak RAM usage during the context initialization
+    // phase in QnnContext_createFromBinary, but sustained RAM remains unaffected.
+    bool ioMemEstimation;
+    // This field enables model preparation without mapping its content on the DSP side. It is
+    // useful when a model needs to be prepared on the device but executed through a serialized
+    // binary method. This prevents extra mapping onto the DSP VA space. Set this flag only when
+    // creating the context.
+    bool isPrepareOnly;
+    // This field enables initialization acceleration, which is disabled by default.
+    // If set to true, the DSP will utilize all hardware threads to accelerate deserialization.
+    // It is not recommended to execute graphs simultaneously, as this will significantly degrade
+    // performance.
+    // Note that this feature may not be effective for small graphs with a small number of ops.
+    bool initAcceleration;
+    // This field enables skipping the crc32 check in LoRA super adapter apply; it is disabled by
+    // default. If set to true, the crc32 check for non-base adapters in the super adapter apply
+    // use case will be skipped to improve time cost.
+    // Note that the base adapter in a super adapter never does a crc32 check; therefore, its
+    // apply time cost won't improve by turning this config option on.
+    bool skipValidationOnBinarySection;
+  };
+} QnnHtpContext_CustomConfig_t;
+
+/// QnnHtpContext_CustomConfig_t initializer macro
+#define QNN_HTP_CONTEXT_CUSTOM_CONFIG_INIT            \
+  {                                                   \
+    QNN_HTP_CONTEXT_CONFIG_OPTION_UNKNOWN, /*option*/ \
+    {                                                 \
+      false /*weightSharingEnabled*/                  \
+    }                                                 \
+  }
+
+// clang-format on
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h
new file mode 100755
index 0000000000000..e70c23577264b
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h
@@ -0,0 +1,178 @@
+//=============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN HTP Device components
+ *
+ *         This file defines structures and supplements QnnDevice.h for the QNN HTP device
+ */
+
+#pragma once
+
+#include "QnnCommon.h"
+#include "QnnDevice.h"
+#include "QnnHtpPerfInfrastructure.h"
+#include "QnnTypes.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * This is used to represent the HTP hardware architecture.
+ * Since QnnDevice only supports V68 or newer, using a legacy ARCH will result in an error.
+ */
+typedef enum {
+  QNN_HTP_DEVICE_ARCH_NONE    = 0,
+  QNN_HTP_DEVICE_ARCH_V68     = 68,
+  QNN_HTP_DEVICE_ARCH_V69     = 69,
+  QNN_HTP_DEVICE_ARCH_V73     = 73,
+  QNN_HTP_DEVICE_ARCH_V75     = 75,
+  QNN_HTP_DEVICE_ARCH_V79     = 79,
+  QNN_HTP_DEVICE_ARCH_V81     = 81,
+  QNN_HTP_DEVICE_ARCH_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_Arch_t;
+
+/**
+ * Data structure to configure a device to set the minimum HTP arch;
+ * the driver will use ops compatible with this HTP arch.
+ */
+typedef struct {
+  uint32_t deviceId;
+  QnnHtpDevice_Arch_t arch;
+} QnnHtpDevice_Minimum_Arch_t;
+
+/**
+ * Data structure to configure a device to run in the signed/unsigned domain.
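+ *
+ * Illustrative sketch (deviceId 0 is an assumption for a single-device setup):
+ * @code
+ *   QnnHtpDevice_UseSignedProcessDomain_t signedPd;
+ *   signedPd.deviceId               = 0;
+ *   signedPd.useSignedProcessDomain = true;  // request a signed PD
+ * @endcode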
+ */
+typedef struct {
+  uint32_t deviceId;
+  bool useSignedProcessDomain;
+} QnnHtpDevice_UseSignedProcessDomain_t;
+
+typedef void* QnnHtpDevice_UseCustomSetting_t;
+
+/**
+ * Enum listing the available custom configurations.
+ */
+typedef enum {
+  QNN_HTP_DEVICE_CONFIG_OPTION_SOC      = 0,
+  QNN_HTP_DEVICE_CONFIG_OPTION_ARCH     = 1,
+  QNN_HTP_DEVICE_CONFIG_OPTION_SIGNEDPD = 2,
+  QNN_HTP_DEVICE_CONFIG_OPTION_CUSTOM   = 3,
+  QNN_HTP_DEVICE_CONFIG_OPTION_RESERVED = 0x7fff0000,
+  QNN_HTP_DEVICE_CONFIG_OPTION_UNKNOWN  = 0x7fffffff
+} QnnHtpDevice_ConfigOption_t;
+
+/**
+ * Data structure for custom configuration.
+ */
+typedef struct {
+  QnnHtpDevice_ConfigOption_t option;
+  union UNNAMED {
+    // This field sets the SoC model
+    uint32_t socModel;
+    // This field updates the minimum HTP arch
+    QnnHtpDevice_Minimum_Arch_t arch;
+    // This structure is used to enable/disable the signed/unsigned PD
+    QnnHtpDevice_UseSignedProcessDomain_t useSignedProcessDomain;
+    // This structure is used to enable a custom setting
+    QnnHtpDevice_UseCustomSetting_t useCustomSetting;
+    // Reserved for internal purposes
+    void* reserved;
+  };
+} QnnHtpDevice_CustomConfig_t;
+
+// For deviceType in QnnDevice_HardwareDeviceInfoV1_t
+typedef enum {
+  QNN_HTP_DEVICE_TYPE_ON_CHIP = 0, // HTP cores are inside the SoC
+  QNN_HTP_DEVICE_TYPE_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_DeviceType_t;
+
+/**
+ * @brief QNN HTP Device core type
+ *        This enumeration provides information about the core type inside the SOC.
+ *
+ *        For online operation, the caller should retrieve this information from
+ *        `QnnDevice_getPlatformInfo`. For offline operation, the caller needs to create a
+ *        `QnnDevice_CoreInfo_t` with the correct core type, and then use it to create the
+ *        `QnnDevice_PlatformInfo_t`.
+ */
+typedef enum {
+  QNN_HTP_CORE_TYPE_NSP   = 0,
+  QNN_HTP_CORE_TYPE_HPASS = 1,
+
+  // supported coreType values are < QNN_CORE_TYPE_MAX
+  QNN_HTP_CORE_TYPE_MAX,
+  QNN_HTP_CORE_TYPE_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_CoreType_t;
+
+/**
+ * This structure provides info about the NSP device inside the SoC.
+ * For online operation, the caller should get this info from QnnDevice_getPlatformInfo.
+ * For offline operation, the caller needs to create this structure and fill in the correct
+ * information for QnnDevice_create.
+ */
+typedef struct {
+  size_t vtcmSize;           // The VTCM size for this device in megabytes;
+                             // the user cannot request a VTCM size exceeding this value
+  uint32_t socModel;         // An enum value defined in the QNN header that represents the SoC model
+  bool signedPdSupport;      // This field is true if the device supports signed PD
+  bool dlbcSupport;          // This field is true if the device supports DLBC
+  QnnHtpDevice_Arch_t arch;  // This field shows the architecture of this device
+} QnnHtpDevice_OnChipDeviceInfoExtension_t;
+
+/**
+ * This structure is used in QnnDevice_HardwareDeviceInfoV1_t;
+ * QnnDevice_getPlatformInfo uses this structure to list the supported device features/info.
+ */
+typedef struct _QnnDevice_DeviceInfoExtension_t {
+  QnnHtpDevice_DeviceType_t devType;
+  union UNNAMED {
+    QnnHtpDevice_OnChipDeviceInfoExtension_t onChipDevice;
+  };
+} QnnHtpDevice_DeviceInfoExtension_t;
+
+/**
+ * @brief QNN HTP Device PerfInfrastructure specialization structure.
+ *        Objects of this type are to be referenced through QnnDevice_getInfrastructure.
+ *
+ *        Contains function pointers for each interface method for
+ *        Htp PerfInfrastructure.
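+ *
+ *        A hedged access sketch (assumes the infrastructure pointer was
+ *        obtained via QnnDevice_getInfrastructure; error handling omitted):
+ * @code
+ *   const QnnHtpDevice_Infrastructure_t* infra = deviceInfra;  // assumed handle
+ *   if (infra->infraType == QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) {
+ *     // infra->perfInfra.createPowerConfigId / setPowerConfig / ... may be used
+ *   }
+ * @endcode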
+ */
+typedef struct {
+  QnnHtpPerfInfrastructure_CreatePowerConfigIdFn_t createPowerConfigId;
+  QnnHtpPerfInfrastructure_DestroyPowerConfigIdFn_t destroyPowerConfigId;
+  QnnHtpPerfInfrastructure_SetPowerConfigFn_t setPowerConfig;
+  QnnHtpPerfInfrastructure_SetMemoryConfigFn_t setMemoryConfig;
+} QnnHtpDevice_PerfInfrastructure_t;
+
+/// QnnHtpDevice_PerfInfrastructure_t initializer macro
+#define QNN_HTP_DEVICE_PERF_INFRASTRUCTURE_INIT \
+  {                                             \
+    NULL, /*createPowerConfigId*/               \
+    NULL, /*destroyPowerConfigId*/              \
+    NULL, /*setPowerConfig*/                    \
+    NULL  /*setMemoryConfig*/                   \
+  }
+
+typedef enum {
+  QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF    = 0,
+  QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_InfrastructureType_t;
+
+typedef struct _QnnDevice_Infrastructure_t {
+  QnnHtpDevice_InfrastructureType_t infraType;
+  union UNNAMED {
+    QnnHtpDevice_PerfInfrastructure_t perfInfra;
+  };
+} QnnHtpDevice_Infrastructure_t;
+
+// clang-format on
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h
new file mode 100755
index 0000000000000..f7e49e9fb8bc3
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h
@@ -0,0 +1,299 @@
+//=============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/**
+ * @file
+ * @brief QNN HTP component Graph API.
+ *
+ *        The interfaces in this file work with the top level QNN
+ *        API and supplement QnnGraph.h for the HTP backend
+ */
+
+#ifndef QNN_HTP_GRAPH_H
+#define QNN_HTP_GRAPH_H
+
+#include "QnnGraph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnHtpGraph config value macro. Indicates that the maximum
+ *        available amount of the resource should be used.
+ *
+ *        Currently only applicable for QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE.
+ */
+#define QNN_HTP_GRAPH_CONFIG_OPTION_MAX 0
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief This enum provides different HTP graph optimization
+ *        options that can be used to finalize the graph
+ *        for optimum performance.
+ */
+typedef enum {
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD                = 1,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES                  = 2,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG        = 3,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC                       = 4,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC_WEIGHTS               = 5,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SPARSE_WEIGHTS_COMPRESSION = 6,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR              = 7,
+  QNN_HTP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN                           = 0x7fffffff
+} QnnHtpGraph_OptimizationType_t;
+
+// clang-format off
+
+/**
+ * @brief Struct describing the set of optimization types
+ *        and the values associated with each optimization type.
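+ *
+ *        For example (an illustrative sketch; the flag value is an assumption
+ *        and must match the optimization strategies documented for the HTP
+ *        backend):
+ * @code
+ *   QnnHtpGraph_OptimizationOption_t opt = QNN_HTP_GRAPH_OPTIMIZATION_OPTION_INIT;
+ *   opt.type       = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+ *   opt.floatValue = 3.0f;  // assumed example value
+ * @endcode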
+ * + * Below is the Map between QnnHtpGraph_OptimizationType_t and allowable values: + * + * \verbatim embed:rst:leading-asterisk + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | # | OptimizationType option | Allowable values | + * +====+====================================================================+=====================================================================+ + * | 1 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 2 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 3 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG | Defines the optimization strategy used by the HTP backend | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 4 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 5 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC_WEIGHTS | Enables DLBC weights compression | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 6 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SPARSE_WEIGHTS_COMPRESSION | Enables Weight Sparsity Compression | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 7 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR | Enables System Level Cache Allocator usage | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnHtpGraph_OptimizationType_t type; + float floatValue; +} QnnHtpGraph_OptimizationOption_t; + +/** + * @brief This struct encapsulates all the VTCM configurations for parallel graph execution. + * + * @code + * |<-- (1) 8MB Total Hardware VTCM -->| + * |<-- (2) 7MB Addressable -->| + * +------+------+------+------+------+------+------+------+ + * | CV | | | | | | | | + * +------+------+------+------+------+------+------+------+ + * |<-- (4) Graph A -->|<-- (4) Graph B -->| + * + * A |> 0 MB (3) Graph Offset + * B |-------------------> 3 MB + * @endcode + */ +typedef struct { + /// (4) above, the amount of VTCM used by a graph + uint32_t sizeInBytes; + /// (3) above, where in the addressable region to start VTCM. + /// Note: (3) + (4) <= (2) + uint32_t offsetInBytes; + /// (2) Addressable portion of VTCM. + /// Set to less than hardware size so Graph(s) can coexist with other VTCM clients. + uint32_t sizeTotalInBytes; + + // For ABI compatibility in the future. + // Set to 0 for now. + uint32_t reserved[3]; +} QnnHtpGraph_VtcmConfig_t; + +/** + * @brief This enum defines whether graph concurrency (i.e. multiple graphs running concurrently) + * is possible, and how to behave when circumstances for concurrency aren't possible. 
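+ *
+ *        Concurrent graphs partition VTCM through QnnHtpGraph_VtcmConfig_t
+ *        (defined above). As an illustrative sketch (the sizes are assumptions,
+ *        not SDK recommendations), "Graph B" from the diagram above could use:
+ * @code
+ *   QnnHtpGraph_VtcmConfig_t vtcm;
+ *   vtcm.sizeInBytes      = 3u * 1024u * 1024u;  // (4) this graph's VTCM share
+ *   vtcm.offsetInBytes    = 3u * 1024u * 1024u;  // (3) start after Graph A
+ *   vtcm.sizeTotalInBytes = 7u * 1024u * 1024u;  // (2) addressable VTCM
+ *   vtcm.reserved[0] = vtcm.reserved[1] = vtcm.reserved[2] = 0u;
+ * @endcode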
+ */
+typedef enum {
+  /// This graph will not be able to run concurrently with other graphs.
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_NONE    = 0,
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_DEFAULT = QNN_HTP_GRAPH_CONCURRENCY_OPTION_NONE,
+  /// Graph will try to run concurrently, sharing all resources on the DSP (VTCM, HMX, HVX, etc).
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_ALL_SHARED = 1,
+  // Unused, present to ensure 32 bits.
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_UNKNOWN = 0x7fffffff
+} QnnHtpGraph_ConcurrencyOption_t;
+
+/**
+ * @brief This struct encapsulates all the configurations for parallel graph execution.
+ */
+typedef struct {
+  QnnHtpGraph_ConcurrencyOption_t concurrency;
+  QnnHtpGraph_VtcmConfig_t vtcmConfig;
+
+  // For ABI compatibility in the future.
+  // Set to 0 for now.
+  uint32_t reserved[4];
+} QnnHtpGraph_ParallelGraphExecutionConfig_t;
+/// The settings in this struct are only applicable
+/// for DSP architectures >= V81.
+/// Using them on other SoCs will return an error.
+///
+/// Values will be defaulted to their SoC's TURBO frequency
+/// (SoC as identified by Qnn_DeviceHandle_t).
+///
+/// On automotive SDKs, HMX OP Bounding will be enabled by default.
+///
+/// On non-automotive SDKs, using this setting will enable
+/// HMX OP Bounding. It is off by default.
+typedef struct QnnHtp_HmxBoundingInfo {
+  /// Target HMX freq in Hz.
+  /// Can be derived from sysMonApp (HexagonSDK) or QProfiler.
+  float targetHmxFreqHz;
+  /// Target DSP Core freq in Hz.
+  /// Can be derived from sysMonApp (HexagonSDK) or QProfiler.
+  float targetDspCoreFreq;
+} QnnHtp_HmxBoundingInfo_t;
+
+/// QnnHtpGraph_OptimizationOption_t initializer macro
+#define QNN_HTP_GRAPH_OPTIMIZATION_OPTION_INIT              \
+  {                                                         \
+    QNN_HTP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/       \
+    0.0f                                     /*floatValue*/ \
+  }
+// clang-format on
+
+/**
+ * @brief This enum provides different HTP graph configuration
+ *        options associated with QnnGraph
+ */
+typedef enum {
+  QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION                       = 1,
+  QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION                          = 2,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB                    = 3,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB,
+  QNN_HTP_GRAPH_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 4,
+  QNN_HTP_GRAPH_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF        = 5,
+  QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS                    = 6,
+  QNN_HTP_GRAPH_CONFIG_OPTION_FINALIZE_CONFIG                    = 7,
+  QNN_HTP_GRAPH_CONFIG_OPTION_NUM_CORES                          = 8,
+  QNN_HTP_GRAPH_CONFIG_OPTION_PARALLEL_GRAPH_EXECUTION_CONFIG    = 9,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_BYTES                 = 10,
+  QNN_HTP_GRAPH_CONFIG_OPTION_HMX_BOUNDING                       = 11,
+  QNN_HTP_GRAPH_CONFIG_OPTION_WEIGHTS_PACKING                    = 12,
+  QNN_HTP_GRAPH_CONFIG_OPTION_ASSUME_SAME_QUANT                  = 13,
+  QNN_HTP_GRAPH_CONFIG_OPTION_RESERVED                           = 0x7fff0000,
+  QNN_HTP_GRAPH_CONFIG_OPTION_UNKNOWN                            = 0x7fffffff
+} QnnHtpGraph_ConfigOption_t;
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+/**
+ * @brief A struct for different config parameters in a key value format.
+ */
+typedef struct {
+  const char* key;
+  Qnn_Scalar_t value;
+} QnnHtpGraph_FinalizeConfig_t;
+
+/**
+ * @brief Structure describing the set of configurations supported by graph.
+ * Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ *
+ * The struct has two fields: option and a union of the corresponding config values.
+ * Based on the option, the corresponding item in the union can be used to specify
+ * the config.
+ *
+ * Below is the Map between QnnHtpGraph_ConfigOption_t and config value
+ *
+ * \verbatim embed:rst:leading-asterisk
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | #  | Config Option                                                                       | Configuration Struct/value                     |
+ * +====+=====================================================================================+================================================+
+ * | 1  | QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION                                            | QnnHtpGraph_OptimizationOption_t               |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 2  | QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION                                               | Qnn_Precision_t                                |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 3  | QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB/QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE   | uint32_t                                       |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 4  | QNN_HTP_GRAPH_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF                      | bool                                           |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 5  | QNN_HTP_GRAPH_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF                             | bool                                           |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 6  | QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS                                         | uint32_t                                       |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 7  | QNN_HTP_GRAPH_CONFIG_OPTION_FINALIZE_CONFIG                                         | QnnHtpGraph_FinalizeConfig_t                   |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 8  | QNN_HTP_GRAPH_CONFIG_OPTION_NUM_CORES                                               | uint32_t                                       |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 9  | QNN_HTP_GRAPH_CONFIG_OPTION_PARALLEL_GRAPH_EXECUTION_CONFIG                         | QnnHtpGraph_ParallelGraphExecutionConfig_t     |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 10 | QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_BYTES                                      | uint32_t                                       |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 11 | QNN_HTP_GRAPH_CONFIG_OPTION_HMX_BOUNDING                                            | uint32_t                                       |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 12 | QNN_HTP_GRAPH_CONFIG_OPTION_WEIGHTS_PACKING                                         | bool                                           |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * | 13 | QNN_HTP_GRAPH_CONFIG_OPTION_ASSUME_SAME_QUANT                                       | bool                                           |
+ * +----+-------------------------------------------------------------------------------------+------------------------------------------------+
+ * +-------------------------+----------------------------------------------------------------+------------------------------------------------+
+ * | 0x7fff0000 - 0x7ffffffe | QNN_HTP_GRAPH_CONFIG_OPTION_RESERVED                           | These are reserved for internal purposes       |
+ * +-------------------------+----------------------------------------------------------------+------------------------------------------------+
+ * \endverbatim
+ *
+ * NOTE: Option #6 (i.e. QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS) can only be
+ *       set prior to the first execution of the graph. If the user changes it
+ *       after the first execution, subsequent executions will not pick up the
+ *       updated value.
+ */
+typedef struct {
+  QnnHtpGraph_ConfigOption_t option;
+  union {
+    QnnHtpGraph_OptimizationOption_t optimizationOption;
+    Qnn_Precision_t precision;
+    uint32_t vtcmSizeInMB;
+    bool foldReluActivationIntoConvOff;
+    bool shortDepthConvOnHmxOff;
+    uint64_t numHvxThreads;
+    void* reserved;
+    QnnHtpGraph_FinalizeConfig_t finalizeConfig;
+    uint32_t numCores;
+    QnnHtpGraph_ParallelGraphExecutionConfig_t parallelGraphExecutionConfig;
+    uint32_t vtcmSizeInBytes;
+    QnnHtp_HmxBoundingInfo_t hmxBoundingInfo;
+    bool weightsPacking;
+    bool assumeSameQuant;
+  };
+} QnnHtpGraph_CustomConfig_t;
+
+// clang-format on
+/// QnnHtpGraph_CustomConfig_t initializer macro
+#define QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT                            \
+  {                                                                 \
+    QNN_HTP_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/                 \
+    {                                                               \
+      QNN_HTP_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \
+    }                                                               \
+  }
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h
new file mode 100755
index 0000000000000..adc9ef2c52504
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h
@@ -0,0 +1,85 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef QNN_HTP_MEMORY_INFRASTRUCTURE_2_H
+#define QNN_HTP_MEMORY_INFRASTRUCTURE_2_H
+
+#include "QnnCommon.h"
+
+/**
+ *  @file
+ *  @brief QNN HTP Memory Infrastructure component API.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// VTCM
+//=============================================================================
+
+// clang-format off
+
+/**
+ * @brief Raw memory address that exists ONLY on the QURT
+ *        side.
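+ *        A QURT address is only meaningful to code running on the DSP;
+ *        host-side code should treat it as an opaque 32-bit value rather
+ *        than something it can dereference.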
+ */ +typedef uint32_t QnnHtpMem_QurtAddress_t; + +/** + * @brief Configuration for custom shared buffer memory type + * This shared buffer is a contiguous chunk of memory identified + * by a single file descriptor which will be used by multiple tensors + * based on the offset provided + * Each QnnMem_register call with different offset will return a + * unique memory handle + */ +typedef struct { + // File descriptor for memory, must be set to QNN_MEM_INVALID_FD if not applicable + int32_t fd; + // Offset to be used in contiguous shared buffer + uint64_t offset; +} QnnHtpMem_SharedBufferConfig_t; + +// clang-format off + +/** + * @brief QNN Memory Type + */ +typedef enum { + QNN_HTP_MEM_QURT = 0, + QNN_HTP_MEM_SHARED_BUFFER = 1, + QNN_HTP_MEM_UNDEFINED = 0x7FFFFFFF +} QnnHtpMem_Type_t; + +// clang-format off + +/** + * @brief descriptor used for the QNN API + */ +typedef struct { + // Memory type identified by QnnHtpMem_Type_t + QnnHtpMem_Type_t type; + // Total size of the buffer + // For memory type QURT, it would be size of a tensor + // For memory type SHARED BUFFER, it would be the total size of the buffer + uint64_t size; + + union { + QnnHtpMem_QurtAddress_t qurtAddress; + QnnHtpMem_SharedBufferConfig_t sharedBufferConfig; + }; +} QnnMemHtp_Descriptor_t; + +// clang-format on +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h new file mode 100755 index 0000000000000..f92317ac94bf2 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h @@ -0,0 +1,511 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** @file + * @brief QNN HTP component Performance Infrastructure API + * + * Provides interface to the client to control performance and system + * settings of the QNN HTP Accelerator + */ + +#ifndef QNN_HTP_PERF_INFRASTRUCTURE_H +#define QNN_HTP_PERF_INFRASTRUCTURE_H + +#include "QnnCommon.h" +#include "QnnTypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// max rpc polling time allowed - 9999 us +#define QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME 9999 + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief QNN HTP PerfInfrastructure API result / error codes. 
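+ *
+ *        All functions declared later in this file report status through
+ *        these codes via Qnn_ErrorHandle_t. A minimal checking sketch
+ *        (createPowerConfigId stands in for any of those function pointers;
+ *        it is an assumed local variable, not an SDK symbol):
+ *
+ * @code
+ * Qnn_ErrorHandle_t err = createPowerConfigId(deviceId, coreId, &powerConfigId);
+ * if (err != QNN_HTP_PERF_INFRASTRUCTURE_NO_ERROR) {
+ *   // inspect err against the QnnHtpPerfInfrastructure_Error_t values below
+ * }
+ * @endcode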
+ * + */ +typedef enum { + QNN_HTP_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE, + //////////////////////////////////////////////////////////////////////// + + QNN_HTP_PERF_INFRASTRUCTURE_NO_ERROR = QNN_SUCCESS, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 4, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_MEM_ALLOC = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 5, + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_FAILED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 6, + + //////////////////////////////////////////////////////////////////////// + QNN_HTP_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE, + /// UNDEFINED value that must not be used by client + QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNDEFINED = 0x7fffffff +} QnnHtpPerfInfrastructure_Error_t; + +/** + * @brief Allows client to consider (non-zero value) DCVS enable/disable + * and option parameters, otherwise (zero value) + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SetDcvsEnable_t; + +/** + * @brief Allows client to start (non-zero value) or stop (zero value) + * participating in DCVS + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_DcvsEnable_t; + +/** + * @brief Allows client to consider (non-zero value) latency parameter, + * otherwise (zero value) + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SetSleepLatency_t; + +/** + * @brief Allows client to set up the sleep latency in microseconds + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SleepLatency_t; + +/** + * @brief Allows client to consider (non-zero value) sleep disable + * parameter, otherwise (zero value) + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SetSleepDisable_t; + +/** + * @brief Allows client to disable sleep or low power modes. + * Pass a non-zero value to disable sleep in HTP + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SleepDisable_t; + +/** + * @brief Allows client to consider (non-zero value) bus clock + * params, otherwise (zero value) + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SetBusParams_t; + +/** + * @brief Allows client consider (non-zero value) core clock + * params, otherwise (zero value) + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_SetCoreParams_t; + +/** + * @brief Allows client to set up the RPC control latency in microseconds + * + */ +typedef uint32_t QnnHtpPerfInfrastructure_RpcControlLatency_t; + +/** + * @brief Allows client to set up the RPC polling time in microseconds + */ +typedef uint32_t QnnHtpPerfInfrastructure_RpcPollingTime_t; + +/** + * @brief Allows client to set up the adaptive polling time in microseconds + */ +typedef uint32_t QnnHtpPerfInfrastructure_AdaptivePollingTime_t; + +/** + * @brief Allows client to set up the HMX timeout interval in microseconds + */ +typedef uint32_t QnnHtpPerfInfrastructure_HmxTimeoutIntervalUs_t; + +/** + * @brief sets the minimum size by which user heap should grow + * when heap is exhausted. 
This API is expected to be
+ *        called only once per backend and has a process-wide impact.
+ *
+ *        The grow size is provided in bytes and defaults to 16MB.
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_MemGrowSize_t;
+
+/**
+ * @brief Allows client to set default values for HMX frequency.
+ *        If enabled (1), the HMX vote will scale with the DCVS corner;
+ *        if 0, the HMX vote needs to be specified manually.
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_HmxDefault_Vote_t;
+
+/**
+ * @brief Perf modes to specify the clock frequency level within the
+ *        target voltage corner; currently applies only to the HMX config.
+ */
+typedef enum {
+  // To select max frequency at target voltage corner.
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_HIGH = 0,
+  // To select min frequency at target voltage corner.
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_LOW,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_ClkPerfMode_t;
+
+/**
+ * @brief These are the different voltage corners that can
+ *        be requested by the client to influence the voting scheme
+ *        for DCVS
+ *
+ */
+typedef enum {
+  /// Maps to HAP_DCVS_VCORNER_DISABLE.
+  /// Disable setting up voltage corner
+  DCVS_VOLTAGE_CORNER_DISABLE = 0x10,
+  /// Maps to HAP_DCVS_VCORNER_SVS2.
+  /// Set voltage corner to minimum value supported on platform
+  DCVS_VOLTAGE_VCORNER_MIN_VOLTAGE_CORNER = 0x20,
+  /// Maps to HAP_DCVS_VCORNER_SVS2.
+  /// Set voltage corner to SVS2 value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS2 = 0x30,
+  /// Maps to HAP_DCVS_VCORNER_SVS.
+  /// Set voltage corner to SVS value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS = 0x40,
+  /// Maps to HAP_DCVS_VCORNER_SVS_PLUS.
+  /// Set voltage corner to SVS_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS_PLUS = 0x50,
+  /// Maps to HAP_DCVS_VCORNER_NOM.
+  /// Set voltage corner to NOMINAL value for the platform
+  DCVS_VOLTAGE_VCORNER_NOM = 0x60,
+  /// Maps to HAP_DCVS_VCORNER_NOM_PLUS.
+  /// Set voltage corner to NOMINAL_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_NOM_PLUS = 0x70,
+  /// Maps to HAP_DCVS_VCORNER_TURBO.
+  /// Set voltage corner to TURBO value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO = 0x80,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_PLUS.
+  /// Set voltage corner to TURBO_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_PLUS = 0x90,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_L2.
+  /// Set voltage corner to TURBO_L2 value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_L2 = 0x92,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_L3.
+  /// Set voltage corner to TURBO_L3 value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_L3 = 0x93,
+  /// Maps to HAP_DCVS_VCORNER_MAX.
+  /// Set voltage corner to maximum value supported on the platform
+  DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER = 0xA0,
+  /// UNKNOWN value that must not be used by client
+  DCVS_VOLTAGE_VCORNER_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_VoltageCorner_t;
+
+/**
+ * @brief These are the expanded voltage corners that can
+ *        be requested by the client to influence the voting scheme
+ *        for DCVS
+ *
+ */
+typedef enum {
+  /// Maps to HAP_DCVS_EXP_VCORNER_DISABLE.
+  /// Disable setting up voltage corner
+  DCVS_EXP_VCORNER_DISABLE = 0,
+  /// Maps to HAP_DCVS_EXP_VCORNER_MIN.
+  /// Set voltage corner to minimum value supported on platform
+  DCVS_EXP_VCORNER_MIN = 0x100,
+  /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS_D2.
+  /// Set voltage corner to LOWSVS_D2 value for the platform
+  DCVS_EXP_VCORNER_LOW_SVS_D2 = 0x134,
+  /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS_D1.
+ /// Set voltage corner to LOWSVS_D1 value for the platform + DCVS_EXP_VCORNER_LOW_SVS_D1 = 0x138, + /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS. + /// Set voltage corner to LOWSVS value for the platform + DCVS_EXP_VCORNER_LOW_SVS = 0x140, + /// Maps to HAP_DCVS_EXP_VCORNER_SVS. + /// Set voltage corner to SVS value for the platform + DCVS_EXP_VCORNER_SVS = 0x180, + /// Maps to HAP_DCVS_EXP_VCORNER_SVS_L1. + /// Set voltage corner to SVS_L1 value for the platform + DCVS_EXP_VCORNER_SVS_L1 = 0x1C0, + /// Maps to HAP_DCVS_EXP_VCORNER_NOM. + /// Set voltage corner to NOM value for the platform + DCVS_EXP_VCORNER_NOM = 0x200, + /// Maps to HAP_DCVS_EXP_VCORNER_NOM_L1. + /// Set voltage corner to NOM_L1 value for the platform + DCVS_EXP_VCORNER_NOM_L1 = 0x240, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR. + /// Set voltage corner to TURBO value for the platform + DCVS_EXP_VCORNER_TUR = 0x280, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L1. + /// Set voltage corner to TURBO_L1 value for the platform + DCVS_EXP_VCORNER_TUR_L1 = 0x2A0, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L2. + /// Set voltage corner to TURBO_L2 value for the platform + DCVS_EXP_VCORNER_TUR_L2 = 0x2B0, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L3. + /// Set voltage corner to TURBO_L3 value for the platform + DCVS_EXP_VCORNER_TUR_L3 = 0x2C0, + /// Maps to HAP_DCVS_EXP_VCORNER_MAX. + /// Selects the maximum voltage corner defined for the chipset + DCVS_EXP_VCORNER_MAX = 0xFFFF, + /// UNKNOWN value that must not be used by client + DCVS_EXP_VCORNER_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_ExpVoltageCorner_t; + +/** + * @brief This enum defines all the possible power mode + * that a client can set to influence DCVS mode + */ +typedef enum { + /// Maps to HAP_DCVS_V2_ADJUST_UP_DOWN. + /// Allows for DCVS to adjust up and down + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN = 0x1, + /// Maps to HAP_DCVS_V2_ADJUST_ONLY_UP. + /// Allows for DCVS to adjust up only + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_ONLY_UP = 0x2, + /// Maps to HAP_DCVS_V2_POWER_SAVER_MODE. + /// Higher thresholds for power efficiency + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_MODE = 0x4, + /// Maps to HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE. + /// Higher thresholds for power efficiency with faster ramp down + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_AGGRESSIVE_MODE = 0x8, + /// Maps to HAP_DCVS_V2_PERFORMANCE_MODE. + /// Lower thresholds for maximum performance + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE = 0x10, + /// Maps to HAP_DCVS_V2_DUTY_CYCLE_MODE. + /// The below value applies only for HVX clients: + /// - For streaming class clients: + /// - detects periodicity based on HVX usage + /// - lowers clocks in the no HVX activity region of each period. + /// - For compute class clients: + /// - Lowers clocks on no HVX activity detects and brings clocks up on detecting HVX activity + /// again. + /// - Latency involved in bringing up the clock will be at max 1 to 2 ms. 
+  QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_DUTY_CYCLE_MODE = 0x20,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_PowerMode_t;
+
+/**
+ * @brief This struct provides performance infrastructure configuration
+ *        associated with setting up DCVS v3, which allows selecting the
+ *        bus and core operating corners separately
+ */
+typedef struct {
+  uint32_t contextId;
+  QnnHtpPerfInfrastructure_SetDcvsEnable_t setDcvsEnable;
+  QnnHtpPerfInfrastructure_DcvsEnable_t dcvsEnable;
+  QnnHtpPerfInfrastructure_PowerMode_t powerMode;
+  QnnHtpPerfInfrastructure_SetSleepLatency_t setSleepLatency;
+  QnnHtpPerfInfrastructure_SleepLatency_t sleepLatency;
+  QnnHtpPerfInfrastructure_SetSleepDisable_t setSleepDisable;
+  QnnHtpPerfInfrastructure_SleepDisable_t sleepDisable;
+  QnnHtpPerfInfrastructure_SetBusParams_t setBusParams;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerMin;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerMax;
+  QnnHtpPerfInfrastructure_SetCoreParams_t setCoreParams;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerMin;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerMax;
+} QnnHtpPerfInfrastructure_DcvsV3_t;
+
+/**
+ * @brief This struct provides performance infrastructure configuration
+ *        associated with setting up HMX v2, which allows selecting the
+ *        HMX corner separately. If hmxPickDefault is 1, all voltage-corner
+ *        params will be ignored. Be sure to use the same contextId as used
+ *        for the DCVS vote.
+ */
+typedef struct {
+  QnnHtpPerfInfrastructure_HmxDefault_Vote_t hmxPickDefault;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerMin;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerMax;
+  QnnHtpPerfInfrastructure_ClkPerfMode_t hmxPerfMode;
+} QnnHtpPerfInfrastructure_HmxV2_t;
+
+/**
+ * @brief This enum defines all the possible performance
+ *        options in Htp Performance Infrastructure that
+ *        relate to setting up power levels
+ */
+typedef enum {
+  /// config enum implies the usage of Dcvs v3
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3 = 1,
+  /// config enum implies the usage of rpcControlLatencyConfig struct
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY = 2,
+  /// config enum implies the usage of rpcPollingTimeConfig struct
+  /// this config is only supported on V69 and later
+  /// if enabled, this config is applied to entire process
+  /// max allowed is QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME us
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME = 3,
+  /// config HMX timeout interval in us. The HMX is turned off once the set
+  /// interval elapses with no interaction with it after an inference is finished.
+ QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_TIMEOUT_INTERVAL_US = 4, + /// config HMX V2 voting parameters only on supported chips + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_V2 = 5, + /// config enum implies the usage of adaptivePollingTime struct + /// this config can only be enabled in the RPC polling mode + /// if enabled, this config is applied to the entire process + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_ADAPTIVE_POLLING_TIME = 6, + /// UNKNOWN config option which must not be used + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_PowerConfigOption_t; + +/** + * @brief This struct provides performance infrastructure configuration + * associated with setting up of power levels + */ +typedef struct { + QnnHtpPerfInfrastructure_PowerConfigOption_t option; + union UNNAMED { + QnnHtpPerfInfrastructure_DcvsV3_t dcvsV3Config; + QnnHtpPerfInfrastructure_RpcControlLatency_t rpcControlLatencyConfig; + QnnHtpPerfInfrastructure_RpcPollingTime_t rpcPollingTimeConfig; + QnnHtpPerfInfrastructure_HmxTimeoutIntervalUs_t hmxTimeoutIntervalUsConfig; + QnnHtpPerfInfrastructure_HmxV2_t hmxV2Config; + QnnHtpPerfInfrastructure_AdaptivePollingTime_t adaptivePollingTimeConfig; + }; +} QnnHtpPerfInfrastructure_PowerConfig_t; + +/// QnnHtpPerfInfrastructure_PowerConfig_t initializer macro +#define QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT \ + { \ + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*dcvsV3Config*/ \ + } \ + } + +/** + * @brief This enum defines all the possible performance + * options in Htp Performance Infrastructure that + * relate to system memory settings + */ +typedef enum { + /// sets memory grow size + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE = 1, + /// UNKNOWN config option that must not be used + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_MemoryConfigOption_t; + +/** + * @brief Provides performance infrastructure configuration + * options that are memory specific + */ +typedef struct { + QnnHtpPerfInfrastructure_MemoryConfigOption_t option; + union UNNAMED { + QnnHtpPerfInfrastructure_MemGrowSize_t memGrowSizeConfig; + }; +} QnnHtpPerfInfrastructure_MemoryConfig_t; + +/// QnnHtpPerfInfrastructure_MemoryConfig_t initializer macro +#define QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT \ + { \ + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*memGrowSizeConfig*/ \ + } \ + } + +//============================================================================= +// API Methods +//============================================================================= + +/** + * @brief This API allows client to create power configuration id that + * has to be used to set different performance modes. + * Power configuration id has to be destroyed by client when not needed. + * + * @param[in] deviceId Hardware Device on which this config id needs to be created. + * + * @param[in] coreId Core/NSP on which this config id needs to be created. + * + * @param[out] powerConfigId Pointer to power configuration id to be created. 
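+ *
+ * A typical sequence, sketched under the assumption that createPowerConfigId
+ * and setPowerConfig are local variables holding function pointers of the
+ * QnnHtpPerfInfrastructure_CreatePowerConfigIdFn_t and
+ * QnnHtpPerfInfrastructure_SetPowerConfigFn_t types obtained from the HTP
+ * device infrastructure (error checks omitted):
+ *
+ * @code
+ * uint32_t powerConfigId = 0;
+ * createPowerConfigId(deviceId, coreId, &powerConfigId);
+ *
+ * QnnHtpPerfInfrastructure_PowerConfig_t cfg = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
+ * cfg.option                     = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
+ * cfg.dcvsV3Config.contextId     = powerConfigId;
+ * cfg.dcvsV3Config.setDcvsEnable = 1;
+ * cfg.dcvsV3Config.dcvsEnable    = 0;  // opt out of DCVS scaling
+ * cfg.dcvsV3Config.powerMode     = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
+ *
+ * const QnnHtpPerfInfrastructure_PowerConfig_t *configs[] = {&cfg, NULL};
+ * setPowerConfig(powerConfigId, configs);
+ * @endcode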
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if deviceId/coreId
+ *            or power configuration id is NULL
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_CreatePowerConfigIdFn_t)(
+    uint32_t deviceId, uint32_t coreId, uint32_t* powerConfigId);
+
+/**
+ * @brief This API allows client to destroy power configuration id.
+ *
+ * @param[in] powerConfigId A power configuration id to be destroyed.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ *            id does not exist
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_DestroyPowerConfigIdFn_t)(
+    uint32_t powerConfigId);
+
+/**
+ * @brief This API allows client to set up system power configuration that
+ *        will enable different performance modes. This API uses the
+ *        HAP_power_dcvs_v3_payload struct to configure HAP power parameters.
+ *        For a detailed description of the HAP power parameters, please refer
+ *        to the Hexagon SDK HAP_power_dcvs_v3_payload documentation.
+ *
+ * @param[in] powerConfigId A power client id to associate calls to system
+ *            power settings. A value of 0 implies NULL power client id
+ *            and can override every other setting in the user process. To
+ *            enable power settings for multiple clients in the same
+ *            process, use a non-zero power client id.
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config option for performance configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ *            does not exist
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t powerConfigId, const QnnHtpPerfInfrastructure_PowerConfig_t** config);
+
+/**
+ * @brief This API allows clients to set up configuration associated with
+ *        system memory on a specific device
+ *
+ * @param[in] deviceId Hardware Device on which this config needs to be applied.
+ *
+ * @param[in] coreId Core/NSP on which this config needs to be applied.
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ *            of config option for system memory configuration.
+ *            NULL is allowed and indicates no config options are provided.
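+ *
+ * A sketch (the 32MB value is an illustrative assumption, and setMemoryConfig
+ * stands for a QnnHtpPerfInfrastructure_SetMemoryConfigFn_t pointer obtained
+ * from the HTP device infrastructure):
+ *
+ * @code
+ * QnnHtpPerfInfrastructure_MemoryConfig_t memCfg = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT;
+ * memCfg.option            = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE;
+ * memCfg.memGrowSizeConfig = 32 * 1024 * 1024;  // grow the DSP-side heap in 32MB steps
+ *
+ * const QnnHtpPerfInfrastructure_MemoryConfig_t *memConfigs[] = {&memCfg, NULL};
+ * setMemoryConfig(deviceId, coreId, memConfigs);
+ * @endcode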
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ *         \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if deviceId/coreId
+ *            or memory configuration does not exist
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ *         \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_SetMemoryConfigFn_t)(
+    uint32_t deviceId, uint32_t coreId, const QnnHtpPerfInfrastructure_MemoryConfig_t** config);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // QNN_HTP_PERF_INFRASTRUCTURE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h
new file mode 100755
index 0000000000000..92381d17b0440
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h
@@ -0,0 +1,567 @@
+//==============================================================================
+//
+//  Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ *  @file
+ *  @brief QNN HTP Profile component API.
+ *
+ *         Requires the HTP backend to be initialized.
+ *         Should be used with the QnnProfile API, but has HTP-backend-specific
+ *         definitions for different QnnProfile data structures
+ *
+ */
+
+#ifndef QNN_HTP_PROFILE_H
+#define QNN_HTP_PROFILE_H
+
+#include "QnnProfile.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note  context load binary host rpc time may be available on both
+ *        QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_RPC_TIME_MICROSEC 1002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTP processor
+ *        when client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note  context load binary htp rpc time may be available on both
+ *        QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HTP_RPC_TIME_MICROSEC 1003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to create the context on the
+ *        accelerator when client invokes QnnContext_createFromBinary.
+ *        The value returned is time in microseconds.
+ *
+ * @note  context load binary accelerator time may be available on both
+ *        QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ * + * @note graph finalize host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_RPC_TIME_MICROSEC 2001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnGraph_finalize. + * The value returned is time in microseconds. + * + * @note graph finalize htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HTP_RPC_TIME_MICROSEC 2002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to finalize the graph on the accelerator + * when client invokes QnnGraph_finalize. + * The value returned is time in microseconds. + * + * @note graph finalize accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003 + +/* Graph Performance Estimate Support + * + **/ +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to Performance Estimates for the graph + * when client invokes QnnGraph_finalize. + * This is just a dummy event which will print only the heading + * with no value or unit. + * @note HTP Performance Estimates maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE 2004 + +/** + * @brief QnnProfile_EventType_t definition to get perf mode at which + * the perf estimates are collected during QnnGraph_finalize. + * The value returned is the perf mode in string with no unit. + * + * @note Perf mode maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MODE 2005 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to simulated execution cycles during + * QnnGraph_finalize. + * The value returned is number of cycles. + * + * @note Simulated execution cycles maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_CYCLES 2006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to a lower estimate of simulated execution + * cycles during QnnGraph_finalize. + * The value returned is number of cycles. + * + * @note Simulated execution cycles lower estimate maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_LOWER_CYCLES 2007 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to a upper estimate of simulated execution + * cycles during QnnGraph_finalize. + * The value returned is number of cycles. 
+ * + * @note Simulated execution cycles upper estimate maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_UPPER_CYCLES 2008 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to DDR information for each HTP during + * QnnGraph_finalize. + * This is just a dummy event which will print only the heading + * with no value or unit. + * + * @note DDR Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_BANDWIDTH_STATS 2009 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the HTP ID on chip during QnnGraph_finalize. + * The value returned is the HTP ID with no unit. + * + * @note HTP ID's maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_BANDWIDTH_STATS_HTP_ID 2010 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the Graph defined inputs or the total reads + * (in bytes) from DDR for graph input related tensors (weights, + * bias, activations) which do not have predecessors. + * The value returned is the num of blocks in bytes. + * + * @note Graph defined inputs for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INPUT_FILL 2011 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total reads (in bytes) from DDR for + * compiler generated fill operators which have predecessors and + * successors and originate on the same HTP. + * The value returned is the num of blocks in bytes. + * + * @note Intermediate Fill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTERMEDIATE_FILL 2012 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) from DDR for + * compiler generated fill operators which have predecessors and + * successors and originate on the same HTP. + * The value returned is the num of blocks in bytes. + * + * @note Intermediate Spill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTERMEDIATE_SPILL 2013 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total reads (in bytes) from DDR for + * fills which were generated by a different HTP core and do not + * have a predecessor, but have a successor. + * The value returned is the num of blocks in bytes. 
+ * + * @note Inter HTP Fill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTER_HTP_FILL 2014 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) from DDR for + * fills which were generated by a different HTP core and do not + * have a successor, but have a predecessor. + * The value returned is the num of blocks in bytes. + * + * @note Inter HTP Spill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTER_HTP_SPILL 2015 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) to DDR for + * graph output related tensors which do not have successors. + * The value returned is the num of blocks in bytes. + * + * @note Graph output related tensors for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_OUTPUT_SPILL 2016 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total number of missing ops which do + * not have any cost associated with them while getting the graph + * performance estimates. + * The value returned is the num of missing ops with no unit. + * + * @note Number of missing cost ops maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MISSING_COST_OPS 2017 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the op ids of the missing ops which do + * not have any cost associated with them while getting the graph + * performance estimates. + * The value returned is the opname along with the op id (decimal + * format) of the ops which does not have any costs associated + * with them. + * + * @note Opname and Op ids of missing cost ops are available only with + * QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MISSING_COST_OPID 2018 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the ARM processor + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time in microseconds. + * + * @note graph execute host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_RPC_TIME_MICROSEC 3001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time in microseconds. + * + * @note graph execute htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HTP_RPC_TIME_MICROSEC 3002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. 
+ * The value returned is number of processor cycles taken. + * + * @note graph execute accelerator time maybe available only on + * QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE. + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value indicates execute including wait/resource acquisition + * time on the accelerator, if applicable in multi-threaded scenarios. + * The value returned is time taken in microseconds + * + * @note graph execute accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time taken for miscellaneous work i.e. time + * that cannot be attributed to a node but are still needed to + * execute the graph on the accelerator. This occurs when client invokes + * QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time taken in microseconds + * + * @note graph execute misc accelerator time is available only on + * QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time taken for a graph yield instance to + * release all its resources to the other graph. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RELEASE_TIME 3006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time a graph spends waiting for a higher + * priority graph to finish execution. + * The value returned is time taken in microseconds + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_WAIT_TIME 3007 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time a graph spends re-acquiring resources + * and restoring vtcm. + * The value returned is time taken in microseconds + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RESTORE_TIME 3008 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the number of times that a yield occured + * during execution + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_COUNT 3009 + +/** + * @brief QnnProfile_EventType_t definition for time a graph waits to get + * VTCM. This should be constant UNLESS we need another graph to yield. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_VTCM_ACQUIRE_TIME 3010 + +/** + * @brief QnnProfile_EventType_t definition for time a graph waits to get + * HMX + HVX, and turn them all on. + * The value returned is time taken in microseconds. 
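+ *
+ * As a sketch of how these event-type codes are typically consumed
+ * (retrieving eventType and value through the generic QnnProfile event
+ * APIs is assumed and omitted here):
+ *
+ * @code
+ * uint64_t htpRpcTimeUs = 0, accelTimeUs = 0;
+ * switch (eventType) {
+ *   case QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HTP_RPC_TIME_MICROSEC:
+ *     htpRpcTimeUs = value;
+ *     break;
+ *   case QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC:
+ *     accelTimeUs = value;
+ *     break;
+ *   default:
+ *     break;  // ignore event types this client does not track
+ * }
+ * @endcode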
+ */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_RESOURCE_POWER_UP_TIME 3011 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value indicates execute excluding wait/resource acquisition + * time on the accelerator, if applicable in multi-threaded scenarios. + * The value returned is time taken in microseconds + * + * @note graph execute accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_EXCL_WAIT_TIME_MICROSEC 3012 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the ARM processor + * when client invokes QnnContext_free which in consequence deinit graph. + * The value returned is time in microseconds. + * + * @note graph deinit host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_RPC_TIME_MICROSEC 4001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnContext_free which in consequence deinit graph. + * The value returned is time in microseconds. + * + * @note graph deinit htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HTP_RPC_TIME_MICROSEC 4002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the time taken to deinit graph on the + * accelerator when client invokes QnnContext_free which in consequence + * deinit graph. The value returned is time in microseconds. + * + * @note graph deinit accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the amount of time an op spends + * waiting for execution on the main thread since the last op on the main + * thread due to scheduling and can be interpreted appropriately in + * conjunction with the unit. + * + * @note node wait information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT 5001 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the amount of time at least one + * background op is running during the execution of an op on the main thread + * and can be interpreted appropriately in conjunction with the unit. + * + * @note node overlap information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_OVERLAP 5002 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. 
This value represents the amount of time at least one + * background op that is not being waited upon to finish is running during + * the wait period of an op on the main thread and can be interpreted + * appropriately in conjunction with the unit. + * + * @note node wait overlap information is available on QNN_HTP_PROFILE_LEVEL_LINTING + * level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT_OVERLAP 5003 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents a bitmask denoting the resources + * an op uses. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_RESOURCEMASK 5004 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the ID of an op running in parallel to + * an op running on the main thread or on HMX. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_CRITICAL_BG_OP_ID 5005 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the ID of an op running on threads other + * than the main or the HMX thread when the main and the HMX threads are not + * executing any op. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT_BG_OP_ID 5006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph's critical path on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is number of processor cycles taken. + * + * @note graph execute accelerator time maybe available only on + * QNN_HTP_PROFILE_LEVEL_LINTING levels + * + * @note When QNN_HTP_PROFILE_LEVEL_LINTING is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE. + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_CRITICAL_ACCEL_TIME_CYCLE 6001 + +/** + * @brief Linting QnnProfile_Level_t definition that allows collecting in-depth + * performance metrics for each op in the graph including main thread + * execution time and time spent on parallel background ops. + */ +#define QNN_HTP_PROFILE_LEVEL_LINTING 7001 + +/** + * @brief QnnProfile_EventType_t definition to get number of HVX threads + * configured by a graph. Different graphs can have a different + * value. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_NUMBER_OF_HVX_THREADS 8001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the total time the entire API takes. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_QNN 9001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the time of callTransport. + * The value returned is time taken in microseconds. 
+ */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_RPC 9002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the remote procedure call on the HTP processor. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_QNN_ACC 9003 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the Hexnn call + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_ACC 9004 + + + +#ifdef __cplusplus +} +#endif + +#endif // QNN_HTP_PROFILE_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h new file mode 100755 index 0000000000000..51440061dc611 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h @@ -0,0 +1,30 @@ +//============================================================================== +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef QNN_HTP_PROPERTY_H +#define QNN_HTP_PROPERTY_H + +#include "QnnProperty.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= +/** + * @brief Property key for determining whether a backend supports unsigned pd. + */ +#define QNN_PROPERTY_CUSTOM_HTP_UNSIGNED_PD_SUPPORT QNN_PROPERTY_GROUP_CUSTOM + 1 + +#ifdef __cplusplus +} +#endif + +#endif // QNN_HTP_PROPERTY_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h new file mode 100755 index 0000000000000..dcfedcb3f6450 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h @@ -0,0 +1,119 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All rights reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief QNN HTP component System Context API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnSystemContext.h for HTP backend + */ + +#ifndef QNN_HTP_SYSTEM_CONTEXT_H +#define QNN_HTP_SYSTEM_CONTEXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= +typedef enum { + // Following version with hwInfoBlobVersion as: + // - Major 0, Minor: 0, Patch: 1 + QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1 = 0x01, + // Unused, present to ensure 32 bits. 
+  QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_UNDEFINED = 0x7FFFFFFF
+} QnnHtpSystemContext_HwInfoBlobVersion_t;
+
+// This struct gets populated within a binary blob as part of hwInfoBlob in
+// QnnSystemContext_BinaryInfoV#_t struct in QnnSystemContext.h
+typedef struct QnnHtpSystemContext_HwBlobInfoV1 {
+  // This value represents the index of the list of graphs registered
+  // to this context as specified in QnnSystemContext_GraphInfo_t*
+  uint32_t graphListIndex;
+  // Stores the spill-fill buffer size used by each of the graphs
+  uint64_t spillFillBufferSize;
+} QnnHtpSystemContext_HwBlobInfoV1_t;
+
+typedef struct {
+  QnnHtpSystemContext_HwInfoBlobVersion_t version;
+  union UNNAMED {
+    QnnHtpSystemContext_HwBlobInfoV1_t contextBinaryHwInfoBlobV1_t;
+  };
+} QnnHtpSystemContext_HwBlobInfo_t;
+
+typedef enum {
+  // Following version with GraphInfoBlobVersion as:
+  // - Major 0, Minor: 0, Patch: 1
+  QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_VERSION_V1 = 0x01,
+  // Unused, present to ensure 32 bits.
+  QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_UNDEFINED = 0x7FFFFFFF
+} QnnHtpSystemContext_GraphInfoBlobVersion_t;
+
+// This struct gets populated within a binary blob as part of GraphInfoBlob in
+// QnnSystemContext_BinaryInfoV#_t struct in QnnSystemContext.h
+typedef struct {
+  // Stores the spill-fill buffer size used by each of the graphs
+  uint64_t spillFillBufferSize;
+  // HTP vtcm size (MB)
+  uint32_t vtcmSize;
+  // Optimization level
+  uint32_t optimizationLevel;
+  // Htp Dlbc
+  uint8_t htpDlbc;
+  // Number of HVX threads to reserve
+  uint64_t numHvxThreads;
+} QnnHtpSystemContext_GraphBlobInfoV1_t;
+
+typedef struct {
+  QnnHtpSystemContext_GraphInfoBlobVersion_t version;
+  union UNNAMED {
+    QnnHtpSystemContext_GraphBlobInfoV1_t contextBinaryGraphBlobInfoV1;
+  };
+} QnnHtpSystemContext_GraphBlobInfo_t;
+
+typedef enum {
+  // Following version with ContextInfoBlobVersion as:
+  // - Major 0, Minor: 0, Patch: 1
+  QNN_SYSTEM_CONTEXT_HTP_CONTEXT_INFO_BLOB_VERSION_V1 = 0x01,
+  // Unused, present to ensure 32 bits.
+ QNN_SYSTEM_CONTEXT_HTP_CONTEXT_INFO_BLOB_UNDEFINED = 0x7FFFFFFF +} QnnHtpSystemContext_ContextInfoBlobVersion_t; + +typedef struct{ + /// An integer representation of SocUtility::DspArch + uint32_t dspArch; +} QnnHtpSystemContext_ContextBlobInfoV1_t; + +typedef struct { + QnnHtpSystemContext_ContextInfoBlobVersion_t version; + union UNNAMED { + QnnHtpSystemContext_ContextBlobInfoV1_t contextBinaryContextBlobInfoV1; + }; +} QnnHtpSystemContext_ContextBlobInfo_t; + +//============================================================================= +// Data Types +//============================================================================= + +//============================================================================= +// Public Functions +//============================================================================= + +//============================================================================= +// Implementation Definition +//============================================================================= + +// clang-format on +#ifdef __cplusplus +} // extern "C" +#endif + +#endif \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h new file mode 100755 index 0000000000000..28b5685f29750 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h @@ -0,0 +1,338 @@ +//============================================================================== +// +// Copyright (c) 2018, 2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef AFUNCS_H +#define AFUNCS_H 1 + +#include +#include +#include "dtype.h" +#ifndef __hexagon__ +#include // for memcpy etc +#endif +// #include "asm_define.h" +#include "builtin_intrinsics.h" +#include "macros_attribute.h" + +struct tile_data { + uint8_t **addr; + uint32_t offset_t_col; + uint32_t offset_t_row; + uint32_t width; + uint32_t height; + uint32_t depth; +}; + +// Define order: .addr, .offset_t_col, .offset_t_row, .width, .height, .depth +#define TILEDATA(adrtab, next_tab_col, next_tab_row, h, w, d) \ + { \ + (uint8_t **)(adrtab), static_cast(next_tab_col), static_cast(next_tab_row), \ + static_cast(w), static_cast(h), static_cast(d) \ + } + +/*=======================================*/ +/* Auxiliary functions */ +/*=======================================*/ +#if defined(__hexagon__) +inline int32_t max_i32(int32_t a, int32_t b) +{ + return Q6_R_max_RR(a, b); +} +inline int32_t min_i32(int32_t a, int32_t b) +{ + return Q6_R_min_RR(a, b); +} +inline uint32_t max_u32(uint32_t a, uint32_t b) +{ + return Q6_R_maxu_RR(a, b); +} +inline uint32_t min_u32(uint32_t a, uint32_t b) +{ + return Q6_R_minu_RR(a, b); +} +#else +inline int32_t max_i32(int32_t a, int32_t b) +{ + return (a < b) ? b : a; +} +inline int32_t min_i32(int32_t a, int32_t b) +{ + return (a < b) ? a : b; +} +inline uint32_t max_u32(uint32_t a, uint32_t b) +{ + return (a < b) ? b : a; +} +inline uint32_t min_u32(uint32_t a, uint32_t b) +{ + return (a < b) ? a : b; +} +#endif + +[[maybe_unused]] inline ALWAYSINLINE int64_t roundf_i64(float val) +{ + // add 0.5 (with same sign as val) and then conversion to int truncates toward 0. + // values exactly halfway will round away from 0 (like roundf). 
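+    // Illustrative values (editor's addition, not part of the original SDK header):
+    //   roundf_i64( 2.5f) -> (int64_t)( 2.5f + 0.5f) ==  3   (half rounds away from zero)
+    //   roundf_i64(-2.5f) -> (int64_t)(-2.5f - 0.5f) == -3
+    //   roundf_i64( 2.4f) -> (int64_t)( 2.9f)        ==  2   (truncation toward zero)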
+
+    return (int64_t)(val + copysignf(0.5f, val));
+}
+
+[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T roundf_i32(float val)
+{
+    // add 0.5 (with same sign as val) and then conversion to int truncates toward 0.
+    // values exactly halfway will round away from 0 (like roundf).
+
+    return (int)(val + copysignf(0.5f, val));
+}
+// same thing for rounding to unsigned range; -ve inputs will give 0.
+//
+[[maybe_unused]] inline ALWAYSINLINE uint32_t roundf_u32(float val)
+{
+    // add 0.5f and then convert to uint (trunc towards 0; -ve values are clipped to 0).
+#ifdef __hexagon__
+    // use intrinsic since conv of -ve float to unsigned is 'undefined behaviour' in C.
+    return Q6_R_convert_sf2uw_R_chop(val + 0.5f);
+#else
+    return (val < 0.5f) ? 0 : (uint32_t)(val + 0.5f);
+#endif
+}
+
+[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T roundd_i32(double val)
+{
+    // add 0.5 (with same sign as val) and then conversion to int truncates toward 0.
+    // values exactly halfway will round away from 0 (like round).
+
+    return (int)(val + copysign(0.5, val));
+}
+
+[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T saturate_u8(NN_INT32_T val)
+{
+#ifdef __hexagon__
+    return Q6_R_satub_R(val);
+#else
+    return (val < 0) ? 0 : ((val > 255) ? 255 : val);
+#endif
+}
+
+[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T saturate_u16(NN_INT32_T val)
+{
+#ifdef __hexagon__
+    return Q6_R_satuh_R(val);
+#else
+    return (val < 0) ? 0 : ((val > 65535) ? 65535 : val);
+#endif
+}
+
+[[maybe_unused]] static inline ALWAYSINLINE NN_INT32_T saturate_i16(NN_INT32_T val)
+{
+#ifdef __hexagon__
+    return Q6_R_sath_R(val);
+#else
+    return (val < -32768) ? -32768 : ((val > 32767) ? 32767 : val);
+#endif
+}
+
+/**
+ * @brief low-cost frexpf (but only the exponent result);
+ * Generates only a few instructions on hexagon.
+ *
+ * Input must not be inf, nan, zero, or denormal.
+ *
+ * returns:
+ *   -1 if abs(x) is in range 0.25 ... 0.49999
+ *    0 if abs(x) is in range 0.5 ... 0.99999
+ *    1 if abs(x) is in range 1.0 ... 1.9999
+ *   etc.
+ *
+ * If the value -126 is returned, x is a zero or denormal;
+ * 129 is returned for inf or NaN. For other cases the value is the same
+ * as what frexpf (in math.h) generates for the exponent.
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr int flt_getexp(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } const uu = {x};
+    return ((uu.u32 >> 23u) & 0xFFu) - 126;
+}
+/**
+ * @brief low-cost frexpf (but only the 'fraction' result);
+ * Generates only a few instructions on hexagon.
+ *
+ * Input must not be inf, nan, zero, or denormal.
+ *
+ * returns a value in the range [0.5, 1.0) (or in (-1.0,-0.5] when x < 0)
+ * such that x = flt_getmant(x) * powf(2.0, flt_getexp(x))
+ *
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_getmant(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } uu = {x};
+    uu.u32 = (uu.u32 & 0x807fffffu) | (uint32_t(126) << 23u); // force exponent = 126
+    return uu.f;
+}
+
+/**
+ * @brief returns the mantissa of x, as a 24-bit number
+ * in the range 0x800000 .. 0xFFFFFF
+ *
+ * Input must not be inf, nan, zero, or denormal.
+ *
+ * Sign is discarded. Same as powf(2,24) * flt_getmant(fabsf(x)).
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr int32_t flt_getfrac(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } const uu = {x};
+    int32_t const m = (uu.u32 & 0x007fffffu) | (uint32_t(1) << 23u);
+    return m;
+}
+
+//
+// This 'normalizes' a float to 0.5 .. 0.9999 (sign is retained)
+// Same result as the return value from frexpf, without using a function call.
+// Results are not valid if x is 0, denormal, or inf/nan
+//
+[[maybe_unused]] inline ALWAYSINLINE float flt_getfrac_norm(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } uu = {x};
+    uu.u32 = (uu.u32 & 0x807fffffu) | (uint32_t(126) << 23u); // force exponent = 126
+    return uu.f;
+}
+/**
+ * @brief low-cost 2.0^n for integer n.
+ * Same as powf(2.0f, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -126..127
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_power2(uint32_t const iexpo)
+{
+    uint32_t const a = (iexpo + 127) & 0xFFu;
+    union {
+        uint32_t u32;
+        float f;
+    } const uu = {a << 23u};
+    return uu.f;
+}
+/**
+ * @brief low-cost ldexpf
+ * Same as ldexpf(val, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -126..127
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_ldexp(float val, int iexpo)
+{
+    return val * flt_power2(iexpo);
+}
+/**
+ * @brief low-cost 2.0^n for integer n.
+ * Same as pow(2.0, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -1022..1023
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr double double_power2(uint32_t const iexpo)
+{
+    uint64_t const a = (iexpo + 1023) & 0x7FFu;
+    union {
+        uint64_t u64;
+        double d;
+    } const uu = {a << 52u};
+    return uu.d;
+}
+/**
+ * @brief low-cost ldexp
+ * Same as ldexp(val, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -1022..1023
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr double double_ldexp(double val, int iexpo)
+{
+    return val * double_power2(iexpo);
+}
+
+/**
+ * @brief returns the exponent and mantissa of x, as an n-bit number
+ *
+ * Constraint: the exponent of x must be in range -126..127
+ * Input must not be negative, inf, nan, zero, or denormal.
+ */
+template <unsigned MBITS> inline constexpr std::pair<int32_t, uint32_t> get_scalefactor(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } const uu = {x};
+
+    uint32_t inval = uu.u32;
+    uint32_t const mask = hnnx::safe_lshift(1, MBITS) - 1;
+    inval = hnnx::safe_rshift(inval + hnnx::safe_lshift(1, (24 - MBITS - 1)),
+                              (24 - MBITS)); // possibly overflows into exponent, but that's OK.
+    uint32_t const m = ((inval & mask) | hnnx::safe_lshift(1u, (MBITS - 1)));
+    int32_t const e = int32_t(hnnx::safe_rshift(inval, (MBITS - 1)) & 0xFFu) - 126;
+    return {e, m};
+}
+
+/**
+ * @brief returns the parameters for scaling.
+ *   bit 31-24: left shift amount
+ *   bit 23-16: right shift amount
+ *   bit 15- 0: scale factor
+ *
+ * Input must not be inf, nan, zero, negative or denormal.
+ *
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr uint32_t get_scaling_params(float x, int max_sl, int max_sr)
+{
+    auto [e, m] = get_scalefactor<15>(x);
+    // Set a sl or sr amount to perform a multiply of 2^exponent by mantissa.
+    int sl = (e > 0) ? e : 0;
+    int sr = (e > 0) ? 0 : -e;
+    // The max_sl allows the addition of extra left shifts when working with small numbers having negative exponents.
+    // For every extra left shift, there is an offsetting right shift added so that the net right shift amount
+    // required from the exponent stays the same. The max_sr parameter provides a ceiling to the required offsetting
+    // right shifts, preventing the total right shift requirement from being large enough to erase data through shifting.
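+    // Worked example (editor's addition; the max_sl/max_sr values are hypothetical):
+    //   x = 0.375f -> get_scalefactor<15>(x) = {e = -1, m = 24576}  (0.75 in Q15)
+    //   so initially sl = 0, sr = 1; with max_sl = 4, max_sr = 3 the branch below
+    //   gives sl = min(4, max(3 - 1, 0)) = 2 and sr = 1 + 2 = 3, packed as 0x02036000,
+    //   i.e. multiply by 0.75 in Q15, shift left 2, shift right 3 -> net x * 0.375.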
+ if (sl == 0 && sr > 0) { + sl = min_i32(max_sl, max_i32(max_sr - sr, 0)); + sr = sr + sl; + } + return ((uint32_t(sl) & 0x0FFu) << 24u) | ((uint32_t(sr) & 0x0FFu) << 16u) | uint32_t(m); +} + +/** + * @brief given a scale in float and a recip shift amount + * return a quantized scale multiplier and change recip shamt inplace + * + */ +inline uint32_t get_quantized_multipiler(const float scale_f, int &recip_shamt) +{ + recip_shamt = (scale_f <= 1.0f) ? 0 : flt_getexp(scale_f); + uint32_t scale = static_cast(roundf(flt_ldexp(scale_f, (31 - recip_shamt)))); + scale = (scale < 0x7fffffffu) ? scale : 0x7FFFFFFFu; + return scale; +} + +/** + * @brief given a scale in float and a recip shift amount + * return a quantized scale multiplier and change recip shamt inplace + * + */ +//Now with corrected spelling +inline uint32_t get_quantized_multiplier(const float scale_f, int &recip_shamt) +{ + return get_quantized_multipiler(scale_f, recip_shamt); +} +#endif /*AFUNCS_H*/ diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h new file mode 100755 index 0000000000000..844bcf4c7ec50 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h @@ -0,0 +1,236 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef ALLOCATOR_H +#define ALLOCATOR_H 1 + +#include +#include +#include +#include "dtype_enum.h" +#include "weak_linkage.h" +#include "macros_attribute.h" +#include "forward_classes.h" +#include "hexagon_nn_types.h" + +enum class MemoryClass { + Plain, + TCM, + UnCached, // for spill/fill DDR + XXX_LAST_MEMORY_TYPE, + Default = Plain +}; + +PUSH_VISIBILITY(default) + +extern bool TrackedAllocError; + +class Graph; +class HexagonNNEnv; +namespace fa { +struct PoolDesc; +struct BigBuff; +struct RuntimeAllocator; +} // namespace fa +namespace hnnx { + +class Serializer; +class Deserializer; + +// some options flags (powers of 2) for calls to Tensor::allocate +enum AllocOptions { + uncached_int8 = 0x1, // override MemoryClass to UnCached. + uncached_int16 = 0x2, + uncached_fp16 = 0x4 +}; + +/* + * Maybe FIXME: It seems like FancyAllocator has just about all the same interfaces as Allocator, + * is all this pimpl stuff needed, or could we just inherit Allocator and have a unique_ptr + * in our graph? + */ + +class Allocator { + public: + // MIN_ALIGN, MAX_ALIGN: + // - both must be powers of 2 + // - 8 <= MIN_ALIGN <= MAX_ALIGN + // All allocations will be aligned to at least MIN_ALIGN, both start and end of each region. + // This includes sub-allocations in memory pools. + // Alignment requests > MAX_ALIGN may be treated as MAX_ALIGN if allocated in DDR. + // + static constexpr unsigned MIN_ALIGN = 256; + static constexpr unsigned MAX_ALIGN = 256; + + // The alignment used by TCM allocation; >= MIN_ALIGN + static constexpr unsigned TCM_ALLOC_ALIGN = 2048; + + static void *vacant() { return (void *)2; } // special value for 'vacant' slot. 
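+    // Editor's illustration of the constants above (not original SDK text): with
+    // MIN_ALIGN == MAX_ALIGN == 256, every DDR allocation is 256-byte aligned at both
+    // start and end, a request for e.g. 4096-byte alignment may be treated as 256,
+    // and TCM allocations are aligned to TCM_ALLOC_ALIGN (2048).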
+    enum Mode { AllocVirtual, AllocPhysical, AllocTemp, AllocTempEnd, AllocComplete, LastMode = AllocComplete };
+
+    // AllocTemp/AllocTempEnd are used in Virtual mode, to set a 'Temp Physical' mode
+    // where allocation is done to physical memory, but into memory blocks which
+    // are discarded when we return via AllocTempEnd (so AllocTempEnd is not possible as an actual
+    // current mode).
+    // This is intended to support nesting (multiple levels of AllocTemp, where each
+    // AllocTempEnd discards all allocs since the matching AllocTemp), but
+    // currently nesting is not supported, so AllocTemp must be followed by AllocTempEnd,
+    // which actually takes you back to AllocVirtual.
+    // AllocComplete allows no further allocations. A deserialized allocator
+    // is in this state.
+
+    API_EXPORT Allocator(Mode mode_in, Graph &graph_in) : graph(graph_in), mode(mode_in){};
+    API_EXPORT virtual ~Allocator() = 0;
+
+    Graph &graph;
+
+    // Either allocates enough, or dips into a buffer (and changes the buffer pointer and size parameter accordingly).
+    // al is an alignment parameter; it must be a power of 2 or the code below won't work.
+    API_EXPORT void *tracked_aligned_alloc(size_t al, size_t bytes, fa::BigBuff *const bb = nullptr);
+    API_EXPORT void tracked_free(void *aligned_ptr) noexcept;
+
+    API_EXPORT virtual void allocate_n(void **arrp, size_t n, size_t block_size, size_t alignment, MemoryClass memclass,
+                                       unsigned options, DType dtype);
+
+    // options for allocate_persistent_blocks.
+    // if 'allnew' is *not* present, it is assumed that all of the pointers
+    // are either null, or point to existing persistent blocks. The 'null' ones
+    // are replaced with new allocations, and the ref counts are increased in both cases.
+    // with 'allnew': pointers are assumed to contain garbage. Equivalent to zeroing the
+    // pointer table first.
+    //
+    // zoneB: with this, ref counts are updated in the 'B' zone instead of 'A'.
+    //
+    // incref: overrides 'allnew'; all of the existing pointers are required to be valid persistent
+    //    blocks; the ref counts are increased by 1.
+    // decref: overrides 'incref' and 'allnew'; all of the pointers are required to be valid persistent
+    //    blocks; the ref counts are reduced by 1. If total refs are zero, the block is freed.
+    //    the pointer table is not updated.
+    //
+    // infinite: newly alloc'd blocks get refcount set to a huge number, instead of 1.
+    //    Currently this is used when deserializing, since we can't free things immediately when in Crate.
+    //
+    enum persistent_options {
+        allnew = 1u, // assume existing pointers are garbage, allocate them all.
+        zoneB = 2u, // reference count in zone B instead of A.
+        incref = 4u, // enforce that all existing are persistent; incref them.
+        decref = 8u,
+        infinite = 16u, // refcounts on new blocks, set to a huge # instead of 1.
+    };
+
+    // allocate n 'persistent' blocks of the given size/alignment, and update the table.
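+    // Editor's sketch of a call sequence (hypothetical sizes; 'alloc' and 'tbl' are illustrative):
+    //   void *tbl[4] = {};                                              // all null
+    //   alloc.allocate_persistent_blocks(tbl, 4, 1024, 256, 0);         // alloc, refcount 1
+    //   alloc.allocate_persistent_blocks(tbl, 4, 1024, 256, incref);    // refcount -> 2
+    //   alloc.allocate_persistent_blocks(tbl, 4, 1024, 256, decref);    // refcount -> 1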
+    API_EXPORT virtual void allocate_persistent_blocks(void **table, size_t nblocks, size_t block_size,
+                                                       size_t alignment, unsigned options);
+
+    API_EXPORT inline void *allocate(const void *oldval, size_t block_size, size_t alignment, MemoryClass memclass,
+                                     unsigned options, DType dtype)
+    {
+        PUSH_WARNING()
+        DISABLE_WARNING("-Wcast-qual", MSVC_NO_EQUIV)
+        void *tmp = const_cast<void *>(oldval);
+        POP_WARNING()
+        allocate_n(&tmp, 1, block_size, alignment, memclass, options, dtype);
+        return tmp;
+    }
+
+    API_EXPORT Mode get_mode() const { return mode; }
+    API_EXPORT virtual void set_mode(Mode new_mode);
+
+    API_EXPORT virtual void set_tcm_pool(void *base, size_t size);
+
+    API_EXPORT virtual void set_largest_memory_alloc_size(size_t size);
+
+    /*
+     * Serialize all the internal data for the allocator.
+     * Memory regions / pools, etc.
+     */
+    API_EXPORT virtual void serialize(Serializer &) const;
+    /*
+     * Deserialize the allocator, restore internal data from buffer.
+     */
+    API_EXPORT virtual void deserialize(HexagonNNEnv &env, Deserializer &dctx,
+                                        hexagon_nn_wide_address_const_t params_weights = 0U,
+                                        const size_t params_weights_length = 0,
+                                        hexagon_nn_wide_iovec_t const &weights = NULL_IOVEC);
+
+    API_EXPORT virtual int find_replaceable_mempool(unsigned const replaceable_pool_seq,
+                                                    fa::PoolDesc &found_pool) const;
+
+    // LCOV_EXCL_START [SAFTYSWCCB-1542]
+    API_EXPORT static inline constexpr size_t fixup_alignment(size_t align)
+    {
+        static_assert(MIN_ALIGN >= 8 && (MIN_ALIGN & (MIN_ALIGN - 1)) == 0, "bad MIN_ALIGN");
+        static_assert(MAX_ALIGN >= MIN_ALIGN && (MAX_ALIGN & (MAX_ALIGN - 1)) == 0, "bad MAX_ALIGN");
+        if (MIN_ALIGN < MAX_ALIGN) {
+            return std::max(MIN_ALIGN, std::min(MAX_ALIGN, align));
+        } else {
+            return MIN_ALIGN;
+        }
+    }
+    // LCOV_EXCL_STOP
+
+    API_EXPORT static inline constexpr size_t round_up_align(size_t n, size_t align)
+    {
+        return (n + (align - 1)) & ~(align - 1);
+    }
+    template <typename T> API_EXPORT static inline T *round_up_align(T *p, size_t align)
+    {
+        return (T *)round_up_align((size_t)p, align);
+    }
+
+  protected:
+    Mode mode = AllocVirtual;
+};
+
+//
+// this is a 'shim' class to help in making dummy allocators. It defines overrides
+// for all of the pure-virtual methods, so you don't need to.
+//
+class FakeAllocator : public Allocator {
+  public:
+    API_EXPORT FakeAllocator(Allocator::Mode mode_in, Graph &graph_in) : Allocator(mode_in, graph_in){};
+    API_EXPORT virtual ~FakeAllocator();
+};
+
+// this is an accessor which is used by the Dma 'Fill' operation
+// to get a source pointer for reading const, based on (pool_id, offset).
+// It also holds the base pointer for ddr spill area.
+// Maybe other things could be added later.
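+// Editor's sketch of intended use (local variable names are hypothetical):
+//   MemPoolRunTimeAccessor acc(spill_base, pool_table, n_pools);
+//   auto [base, is_weights] = acc.get_persistent_pool_base_iswts(pool_id); // 1 <= pool_id <= acc.num_pools()
+//   // 'base + offset' then serves as the DMA 'Fill' source pointer for (pool_id, offset).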
+ +class MemPoolRunTimeAccessor { + hexagon_nn_wide_address_t spill_area; + fa::PoolDesc const *pool_table; // pool_table[0] is for poolid=1 + unsigned max_pool_id; + + public: + API_EXPORT MemPoolRunTimeAccessor(hexagon_nn_wide_address_const_t spill_area_in, fa::PoolDesc const *const pt, + unsigned const pt_size) + : spill_area(spill_area_in), pool_table(pt), max_pool_id(pt_size) + { + } + API_EXPORT MemPoolRunTimeAccessor() : spill_area(0), pool_table(nullptr), max_pool_id(0) {} + API_EXPORT MemPoolRunTimeAccessor(MemPoolRunTimeAccessor const &) = default; + API_EXPORT MemPoolRunTimeAccessor &operator=(MemPoolRunTimeAccessor const &) = default; + + // pool ids are >= 1, <= num_pools + API_EXPORT constexpr unsigned num_pools() const { return max_pool_id; } //LCOV_EXCL_LINE [SAFTYSWCCB-1542] + // map pool_id to base address of the data, for persistent pool; also get 'is_weights' flag. + // implementation in runtime_alloc.h + std::pair get_persistent_pool_base_iswts(unsigned pool_id) const; + API_EXPORT hexagon_nn_wide_address_t get_spill_area() const { return spill_area; } + + // used to construct the ConstExtentDescriptor during prep + // implementation in fa_alloc.h + API_EXPORT fa::PoolDesc const *get_descriptor(unsigned pool_id) const; +}; + +} // namespace hnnx + +POP_VISIBILITY() + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h new file mode 100755 index 0000000000000..11d01bcb31b95 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h @@ -0,0 +1,244 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef BAKE_DEFS +#define BAKE_DEFS 1 +#include +#include +#include +#include + +#include "executable.h" + +// Contains defs for host-side and target side, so try not +// to add too many 'host only' things. + +#ifdef __hexagon__ +#define HNNX_ARCH_CAN_RUN_BAKED 1 +#endif + +namespace hnnx { + +namespace bake { + +using tgt_ptr_word = unsigned; +using tgt_sizet_word = unsigned; +static constexpr unsigned tgt_ptr_bytes = sizeof(tgt_ptr_word); +static constexpr unsigned tgt_sizet_bytes = sizeof(tgt_sizet_word); +static constexpr bool op_has_graphp = false; +static constexpr unsigned tensor_uptr_ptrs = 2; +static constexpr unsigned max_opaquet_align = 1024; // must be power of 2 + +// This should be OK as a first approx: includes hexagon and x86-32 +static constexpr bool host_can_run_baked = sizeof(void *) == tgt_ptr_bytes; + +inline unsigned constexpr round_up(unsigned x, unsigned m) +{ + return ((x + (m - 1)) / m) * m; +} + +// functions to calculate size, align of various things. They +// are included in target build so we can static_assert that sizes are what we think they are. +// (all must be constexpr). + +// {size, alignment} of typical_op +inline constexpr std::pair typical_op_tgt_size_align(unsigned n_in, unsigned n_out) +{ + // 1 pointer per input, plus tensor_uptr_ptrs per output; but if n_in = n_out == 0, it's 1 pointer. + // (for a 'fill' byte). + unsigned num_io_ptrs = n_in + n_out * tensor_uptr_ptrs; + if (num_io_ptrs == 0) num_io_ptrs = 1; // n_in = n_out = 0 case + return {tgt_ptr_bytes * ((op_has_graphp ? 
2 : 1) // vptr, and maybe Graph * + + num_io_ptrs), // inputs and outputs + tgt_ptr_bytes}; // align +} + +// 'tensor_op_tgt_size_align is used for crate accounting of ShapeWrapperOp, ConstWrapperOp, DummyOp +// In a proper 'baked graph' we don't need to insert these, just the tensors... + +inline constexpr std::pair tensor_op_tgt_size_align(unsigned n_out) +{ + // happens to be the same as TypicalOp with no inputs... + return typical_op_tgt_size_align(0, n_out); +} + +// {size, alignment, extra} of typical_op_with_compiler +// extra_len is the len of the extra data +// extra_align is its alignment. +// The 3rd return value is the offset of the 'extra' within the image. +// +inline constexpr std::tuple +typical_op_extra_tgt_size_align(unsigned n_in, unsigned n_out, unsigned extra_len, unsigned extra_align) +{ + std::pair base_size = typical_op_tgt_size_align(n_in, n_out); + unsigned extra_offs = base_size.first; + if (extra_len > 0) { + extra_align = std::max(extra_align, base_size.second); + extra_len = round_up(extra_len, extra_align); + extra_offs = round_up(extra_offs, extra_align); + base_size.first = extra_offs + extra_len; + base_size.second = extra_align; + } + return {base_size.first, base_size.second, extra_offs}; +} + +// {size, alignment} of variadic op (without the in, out array contents)! +constexpr std::pair variadic_op_tgt_size_align(unsigned n_in, unsigned n_out) +{ + const unsigned cratevec_words = 2; + return {tgt_ptr_bytes * (1 // vptr + + (op_has_graphp ? 1 : 0) // Graph * + + 2 * cratevec_words), // two cratevecs + tgt_ptr_bytes}; // align +} +// {size, alignment} of simple_op_wrapper (without the in, out array contents)! +constexpr std::pair simplewrap_op_tgt_size_align(unsigned n_in, unsigned n_out) +{ + // this is just one more pointer than a variadic op... + const auto var_result = variadic_op_tgt_size_align(n_in, n_out); + return {var_result.first + tgt_ptr_bytes, var_result.second}; +} + +// {size, alignment} of a ChunkPreloadOp +constexpr std::pair chunk_preload_op_tgt_size_align() +{ + return {tgt_ptr_bytes * (1 // vptr + + (op_has_graphp ? 1 : 0) // Graph * + + 2), // ptr, len; + tgt_ptr_bytes}; // align +} + +// +// {size_align} of Shape object +// +constexpr std::pair shape_tgt_size_align(unsigned rank) +{ + // tgt_sizet_bytes * (1 + 1 + 2 * rank) = + // vtable ptr + // shapeflag flags + padding[] + // std::array dims + // std::array max_dims + // + rank = std::array pad + return {round_up(tgt_sizet_bytes * (1 + 1 + 1 + 2 * rank) + rank, tgt_sizet_bytes), tgt_sizet_bytes}; +} + +// +// {size_align} of DynamicShape object +// +constexpr std::pair dynamic_shape_tgt_size_align(const unsigned rank) +{ + // std::array dims == tgt_sizet_bytes * rank + // (shapeflag flags + padding[]) + vtable ptr + dynamic_state = (3 * tgt_sizet_bytes) + return {round_up(tgt_sizet_bytes * rank + (4 * tgt_sizet_bytes), tgt_sizet_bytes), tgt_sizet_bytes}; +} + +// +// {size_align} of interface object (may or may not be quantized) +// +constexpr std::pair interface_tgt_size_align(bool is_quantized) +{ + return {tgt_sizet_bytes + (is_quantized ? round_up(3 * 4, tgt_sizet_bytes) : 0), tgt_sizet_bytes}; +} + +// {size_align} of Tensors, of three different forms: +// +// 'general' tensor +// +constexpr std::pair tensor_general_tgt_size_align() +{ + return {tgt_sizet_bytes * 4 + 2 * tgt_ptr_bytes, tgt_sizet_bytes}; +} + +// 'shape' tensor, of given rank. +// +constexpr std::pair tensor_shape_tgt_size_align(unsigned rank) +{ + return {tgt_sizet_bytes * ((rank == 0 ? 
1 : rank) + 1), tgt_sizet_bytes}; +} + +// 'scalar' tensor, need to know if the interface is 'quantized' or not +// Note, this assumes all value are <= size_t bytes. +// +constexpr std::pair tensor_scalar_tgt_size_align(bool is_quantized) +{ + const unsigned ifc_size = interface_tgt_size_align(is_quantized).first; + return {tgt_sizet_bytes * 2 + ifc_size, tgt_sizet_bytes}; +} +// sizeof OpExtraInfo on target: {long long, 2 * unsigned, char *, 4 * padbyte} +constexpr std::pair OpExtraInfo_size_align = {24, 8}; + +// The size of a SliceDispatchOp for the given number of slices. +// Currently it's always the same regardless of 'nslices'; We may introduce 'right-sized' +// value, in which case 'exact=true' will get the 'real' size; but exact = false will always +// give the full size. +constexpr std::pair slice_dispatch_op_size_align(unsigned const nslices, bool const exact = false) +{ + return {tgt_sizet_bytes * ((op_has_graphp ? 5 : 4) + 3 * Executable::MAX_OP_SLICES), tgt_sizet_bytes}; +} + +// The size of a Predicated Op +constexpr std::pair pred_op_size_align() +{ + return {tgt_sizet_bytes * ((op_has_graphp ? 5 : 4) + 3), tgt_sizet_bytes}; +} + +// this is used in e.g. +// if constexpr(host_can_run_baked) static_assert(size_align_matches(N_IN, N_OUT)); + +template constexpr bool size_align_matches(SZAL sz) +{ + return sizeof(T) == std::get<0>(sz) && alignof(T) == std::get<1>(sz); +} + +// This is a utility to check that a type T has a given size and aligment, using static_assert; +// Just need to include a call to 'do-nothing' bake::check_size_align::template check(); +// The static assert is *disabled* unless compiling on hexagon (or compatible host). +// +// It's more complex than it needs to be, since it's designed to make sure the type and +// numbers wind up in the error message, e.g. you could end up with +// error: static_assert failed due to requirement 'claimed(40) == actual(48)' "size not as claimed" +// static_assert(claimed(CLAIMED_SIZE) == actual(ACTUAL_SIZE), "size not as claimed"); +// ... note: in instantiation of function template specialization 'check_szal::check_size_align<..., ...>' +// +template struct check_size_align { + static constexpr int claimed(int K) { return K; } + static constexpr int actual(int K) { return K; } + template static constexpr bool check_size() + { + static_assert(claimed(CLAIMED_SIZE) == actual(ACTUAL_SIZE), "size not as claimed"); + return CLAIMED_SIZE == ACTUAL_SIZE; + } + template static constexpr bool check_align() + { + static_assert(claimed(CLAIMED_ALIGN) == actual(ACTUAL_ALIGN), "align not as claimed"); + return CLAIMED_ALIGN == ACTUAL_ALIGN; + } + + template static constexpr bool check() + { + bool result = true; + if constexpr (host_can_run_baked) { + result = check_size() && check_align(); + } + return result; + } +}; + +} // namespace bake + +// +// op_opaque_tgt_info must be specialized for each OpaqueT used in TypicalOpWithCompiler +// +template struct op_opaque_tgt_info { + // static constexpr unsigned length = ..; // length of the struct on target CPU + // static constexpr unsigned alignment = ... 
// aligbment on target CPU +}; + +} // namespace hnnx + +#endif // BAKE_DEFS diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h new file mode 100755 index 0000000000000..3496b792f25aa --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h @@ -0,0 +1,247 @@ +//============================================================================== +// +// Copyright (c) 2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +// Compiler builtin intrinsic functions should be specified in this file + +#ifndef BUILTIN_INTRINSICS_H_ +#define BUILTIN_INTRINSICS_H_ + +#include +#include +#include +#include + +// Branch prediction +#if defined(__clang__) + +#define HEX_LIKELY(x) __builtin_expect(!!(x), 1) +#define HEX_UNLIKELY(x) __builtin_expect(!!(x), 0) + +#define HEX_ASSUME __builtin_assume +#define HEX_UNREACHABLE __builtin_unreachable + +#elif defined(_MSC_VER) + +#define HEX_LIKELY(x) (x) +#define HEX_UNLIKELY(x) (x) + +#define HEX_ASSUME __assume +#define HEX_UNREACHABLE() __assume(0) + +#elif defined(__GNUC__) +//No equivalent __builtin_assume in GNUC. Hence leaving empty. +#define HEX_ASSUME(cond) + +#define HEX_LIKELY(x) __builtin_expect(!!(x), 1) +#define HEX_UNLIKELY(x) __builtin_expect(!!(x), 0) +#define HEX_UNREACHABLE __builtin_unreachable + +#endif // defined(__clang__) + +// Overflow detection +#if defined(__clang__) || defined(__GNUC__) + +#define HEX_ADD_OVERFLOW __builtin_add_overflow +#define HEX_MUL_OVERFLOW __builtin_mul_overflow + +#elif defined(_MSC_VER) + +#include + +template static inline bool HEX_ADD_OVERFLOW(_T a, _T b, _T *out) +{ + *out = a + b; + return ((b > 0) && (a > std::numeric_limits<_T>::max() - b)) || + ((b < 0) && (a < std::numeric_limits<_T>::min() - b)); +} + +template static inline bool HEX_MUL_OVERFLOW(_T a, _T b, _T *out) +{ + *out = a * b; + return ((b > 0) && (a > std::numeric_limits<_T>::max() / b || a < std::numeric_limits<_T>::min() / b)) || + ((b < 0) && (a > std::numeric_limits<_T>::min() / b || a < std::numeric_limits<_T>::max() / b)); +} + +#endif // __clang__ + +// Count bits + +#include + +template static inline int HEX_COUNT_ONE_BIT(_T x) +{ + return std::bitset(x).count(); +} + +#define HEX_COUNT_ONE_BIT_ULL HEX_COUNT_ONE_BIT +#define HEX_COUNT_ONE_BIT_UL HEX_COUNT_ONE_BIT + +#if defined(__clang__) || defined(__GNUC__) + +#define HEX_COUNT_LEADING_ZERO __builtin_clz +#define HEX_COUNT_LEADING_ZERO_UL __builtin_clzl +#define HEX_COUNT_LEADING_ZERO_ULL __builtin_clzll + +#define HEX_COUNT_TRAILING_ZERO __builtin_ctz +#define HEX_COUNT_TRAILING_ZERO_UL __builtin_ctzl +#define HEX_COUNT_TRAILING_ZERO_ULL __builtin_ctzll + +#elif defined(_MSC_VER) + +#include + +// Returns the number of leading 0-bits in x, starting at the most significant +// bit position. If x is 0, the result is undefined. +static inline int HEX_COUNT_LEADING_ZERO_ULL(unsigned long long x) +{ + unsigned long where; + if (_BitScanReverse64(&where, x)) return static_cast(63 - where); + return 64; // Undefined behavior +} + +static inline int HEX_COUNT_LEADING_ZERO(unsigned int x) +{ + unsigned long where; + if (_BitScanReverse(&where, x)) return static_cast(31 - where); + return 32; // Undefined Behavior. 
+} + +static inline int HEX_COUNT_LEADING_ZERO_UL(unsigned long x) +{ + return sizeof(x) == 8 ? HEX_COUNT_LEADING_ZERO_ULL(x) : HEX_COUNT_LEADING_ZERO(static_cast(x)); +} + +// Returns the number of trailing 0-bits in x, starting at the least significant +// bit position. If x is 0, the result is undefined. +static inline int HEX_COUNT_TRAILING_ZERO_ULL(unsigned long long x) +{ + unsigned long where; + if (_BitScanForward64(&where, x)) return static_cast(where); + return 64; // Undefined Behavior. +} + +static inline int HEX_COUNT_TRAILING_ZERO(unsigned int x) +{ + unsigned long where; + if (_BitScanForward(&where, x)) return static_cast(where); + return 32; // Undefined Behavior. +} + +static inline int HEX_COUNT_TRAILING_ZERO_UL(unsigned long x) +{ + return sizeof(x) == 8 ? HEX_COUNT_TRAILING_ZERO_ULL(x) : HEX_COUNT_TRAILING_ZERO(static_cast(x)); +} + +#endif // defined(__clang__) + +// Atomic operation + +#if defined(__clang__) || defined(__GNUC__) + +#define HEX_ATOMIC_FETCH_AND_ADD __sync_fetch_and_add + +#define HEX_ATOMIC_FETCH_AND_AND __sync_fetch_and_and +#define HEX_ATOMIC_FETCH_AND_OR __sync_fetch_and_or + +#define HEX_ATOMIC_VAL_COMPARE_AND_SWAP __sync_val_compare_and_swap +#define HEX_ATOMIC_BOOL_COMPARE_AND_SWAP __sync_bool_compare_and_swap + +#elif defined(_MSC_VER) + +#include + +#define HEX_ATOMIC_FETCH_AND_ADD(_p, _v) \ + (sizeof *(_p) == sizeof(__int64) ? _InterlockedExchangeAdd64((__int64 *)(_p), (__int64)(_v)) \ + : _InterlockedExchangeAdd((long *)(_p), (long)(_v))) + +template static inline _T HEX_ATOMIC_FETCH_AND_AND(_T volatile *_p, _T _v) +{ + _InterlockedAnd((long *)_p, (long)_v); + return static_cast<_T>(*_p); +} + +template static inline _T HEX_ATOMIC_FETCH_AND_OR(_T volatile *_p, _T _v) +{ + _InterlockedOr((long *)_p, (long)_v); + return static_cast<_T>(*_p); +} + +#define HEX_ATOMIC_VAL_COMPARE_AND_SWAP(_p, _old, _new) \ + (sizeof *(_p) == sizeof(__int64) \ + ? _InterlockedCompareExchange64((__int64 *)(_p), (__int64)(_new), (__int64)(_old)) \ + : _InterlockedCompareExchange((long *)(_p), (long)(_new), (long)(_old))) + +#define HEX_ATOMIC_BOOL_COMPARE_AND_SWAP(_p, _old, _new) (HEX_ATOMIC_VAL_COMPARE_AND_SWAP(_p, _old, _new) == (_old)) + +#endif // defined(__clang__) + +namespace hnnx { + +/** + * @brief promote_shift_operand reflects the integral promotions for small integer types. + * safe_lshift/safe_rshift must be aware of these promotions, since the C++ standard only + * defines the behavior for shift operations where the RHS is between 0 and + * 1 less than the bit-width of the *promoted* type of the LHS. + */ +template struct promote_shift_operand { + typedef T type; +}; + +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; + +template using promote_shift_operand_t = typename promote_shift_operand::type; + +// The following portable template functions are replacements for the +// built-in shift operations, << and >>, that provide the following guarantees: +// +// 1. Both the left and right operands of the shift will be treated as unsigned. +// This, by construction, prevents any undefined or implementation-defined +// behavior that may arise when shifting negative-valued expressions. +// 2. 
The right operand will be bit-masked in a way that guarantees +// that its value is in the range [0, bitwidth(promoted_left_operand) - 1] + +template constexpr unsigned get_safe_shift_mask() +{ + return unsigned(CHAR_BIT * sizeof(promote_shift_operand_t>>) - 1); +} + +template ()> +constexpr auto safe_lshift(T const value, S const shift_amount) +{ + static_assert(std::is_integral::value && std::is_integral::value, + "safe_lshift only makes sense for integral parameters"); + assert((static_cast(shift_amount) & ~mask) == 0 && "shift_amount is out of range"); + return value << shift_amount; +} + +template ()> +constexpr auto safe_rshift(T const value, S const shift_amount) +{ + static_assert(std::is_integral::value && std::is_integral::value, + "safe_rshift only makes sense for integral parameters"); + assert((static_cast(shift_amount) & ~mask) == 0 && "shift_amount is out of range"); + return value >> shift_amount; +} + +} // namespace hnnx + +#endif /* BUILTIN_INTRINSICS_H_ */ diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h new file mode 100755 index 0000000000000..0531625039312 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h @@ -0,0 +1,21 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef C_TRICKS_H +#define C_TRICKS_H 1 + +#define CTRICKS_PASTER2(A, B) A##B +#define CTRICKS_PASTER(A, B) CTRICKS_PASTER2(A, B) + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + +#define PROBABLY(x) __builtin_expect(!(!(x)), 1) +#define YEAHRIGHT(x) __builtin_expect(!(!(x)), 1) + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h new file mode 100755 index 0000000000000..c4363d8cb3e6f --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h @@ -0,0 +1,26 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CC_PP_H +#define CC_PP_H 1 + +/* + * C++ Preprocessor Definitions + */ + +#ifdef __cplusplus +#define EXTERN_C_BEGIN extern "C" { +#define EXTERN_C_END \ + } \ + ; +#else +#define EXTERN_C_BEGIN /* NOTHING */ +#define EXTERN_C_END /* NOTHING */ +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h new file mode 100755 index 0000000000000..bd12354b0a314 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h @@ -0,0 +1,35 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// +//============================================================================== + +#include "cc_pp.h" +#include "macros_attribute.h" +#include "weak_linkage.h" + +#ifndef CHECK_HVX_H +#define CHECK_HVX_H 1 + +// +// This makes sure that we have an HVX context (or not). Does nothing on H2 or +// QuRT, but on x86, makes use of a TLS variable to do the check. +// + +#ifdef __hexagon__ + +static inline void check_hvx() {} +static inline void check_not_hvx() {} + +#else + +PUSH_VISIBILITY(default) +API_EXPORT void check_hvx(); +API_EXPORT void check_not_hvx(); +POP_VISIBILITY() + +#endif + +#endif // CHECK_HVX_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h new file mode 100755 index 0000000000000..a7f50569eb471 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h @@ -0,0 +1,207 @@ +//============================================================================== +// +// Copyright (c) 2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONST_EXTENT_DESCRIPTOR_H +#define CONST_EXTENT_DESCRIPTOR_H 1 + +#include +#include +#include +#include +#include "forward_classes.h" +#include "serialize_defs.h" +#include "pickle_header_tags.h" +#include "const_extent_shared.h" + +namespace hnnx { + +// This class is used, on both encoder and decoder, to contain a 'const extent descriptor' in its raw form, (just an array of uint32) +// and provide higher-level access to the contents. + +class ConstExtentDesc { + protected: + using table_t = std::vector; + // The 'table' may or may not contain the 'padding' section at the end; this is not accessed, + // and the serialize method will always generate the required padding. + table_t table; + // some values broken out from the header... + unsigned extab_n = 0, extab_idx = 0; // number of extents, and word index where they start + unsigned mptab_n = 0, mptab_idx = 0; // number of memory pools, and word index where they start. + unsigned desc_len = 0; // length of the entire descriptor in bytes (0 if invalid descriptor) + + bool scan_table(); // sanity check, and unpacks the above; returns true if OK. + + public: + static uint8_t constexpr EXTENT_FLAGS_BITFIELD_LSB = 8; + static uint8_t constexpr EXTENT_FLAGS_BITFIELD_WIDTH = 8; + + /// + /// @brief Values for 8b flags in extent record + /// + static uint8_t constexpr EXTENT_FLAG_RESERVED_0 = (1 << 0); + static uint8_t constexpr EXTENT_FLAG_RESERVED_1 = (1 << 1); + static uint8_t constexpr EXTENT_FLAG_RESERVED_2 = (1 << 2); + static uint8_t constexpr EXTENT_FLAG_RESERVED_3 = (1 << 3); + static uint8_t constexpr EXTENT_FLAG_IS_FAR_HINT = (1 << 4); ///< Contents maybe far + static uint8_t constexpr EXTENT_FLAG_RESERVED_5 = (1 << 5); + static uint8_t constexpr EXTENT_FLAG_RESERVED_6 = (1 << 6); + static uint8_t constexpr EXTENT_FLAG_RESERVED_7 = (1 << 7); + + // Return from 'extent_info'. + struct extab_entry { + uint32_t extent_flags; + uint32_t align; // a power of 2, >= 64 + uint64_t offset; // offset, in bytes, from the start of the descriptor, to where the data is. + uint64_t length; // length of the data in bytes. + }; + // Return from 'mempool_info'. 
+ // Note: if 'adjust_offset' is true, the 'offset' field from the containing extent will be added to offset, + // so that the offset is from the start of the descriptor, instead of the start of the containing extent. + struct mempool_entry { + uint32_t mempool_id; // a mempool id >=2 indicating a const mempool + uint32_t extent_id; // an extent_id, >=1 + uint64_t offset; // offset in bytes of the data from the start of the extent (see note above) + uint64_t length; // length in bytes of the data + }; + // optional name of the const_extent this descriptor corresponds to. Used for matching in weight_sharing. + std::string name = std::string{}; + + ConstExtentDesc() {} + ConstExtentDesc(table_t &&table_in); + void serialize(Serializer &) const; + inline bool load_table(table_t &&table_in) + { + table = std::move(table_in); + return scan_table(); + } + + constexpr bool is_valid() const { return desc_len != 0; } + + constexpr unsigned descriptor_length() const { return desc_len; } + + constexpr unsigned num_extents() const { return extab_n; } + constexpr unsigned num_mempools() const { return mptab_n; } + + // unpack a row of the extent table + // NOTE: extent_id is 1-based, must be 1 .. num_extents() + extab_entry extent_info(unsigned extent_id) const; + + // unpack a row of the mempool table. + // note: idx is not a mempool idx, it is a 1-based row in range 1...num_mempools(); + // if adjust_offset, the offset of the containing extent is added to the offset + // of the mempool in the returned value. + mempool_entry mempool_info(unsigned idx, bool adjust_offset = false) const; + + // The ordering of the data and the descriptors is such that: + // + // (1) extent_info(1).offset >= descriptor_length() + // mempool_info(1,true).offset >= descriptor_length() + // (2) for i >=2, + // extent_info(i).offset >= extent_info(i+1).offset + extent_info(i+1).length + // mempool_info(i,true).offset >= mempool_info(1-1,true).offset + mempool_info(1-1).length + // + +#if !defined(PREPARE_DISABLED) + /// + /// @brief Memory pool record iterator + /// @details Use to iterator over records in memory pool table in constant + /// extent descriptor + /// + class mempool_iterator { + public: + using iterator_category = std::input_iterator_tag; + using value_type = ConstExtentDesc::mempool_entry; + using difference_type = std::ptrdiff_t; + using pointer = value_type *; + using reference = value_type &; + + /// + /// @brief Constructor + /// @param [in] cedesc A valid constant extent descriptor instance + /// @param [in] index Record index (zero-based!) + /// + explicit mempool_iterator(ConstExtentDesc const &cedesc, uint32_t index) : _cedesc(cedesc), _index(index) {} + + /// + /// @brief Increment record + /// @return Iterator + /// + mempool_iterator &operator++() + { + // Increment IFF valid constant extent descriptor and mempool record + // index within range + _index += (_cedesc.is_valid() && (_index < _cedesc.mptab_n)) ? 
1 : 0; + return *this; + } + + /// + /// @brief Equality operator + /// @return true if iterators are equal + /// + bool operator==(mempool_iterator const &other) const { return _index == other._index; } + + /// + /// @brief Inequality operator + /// @return true if iterators are not equal + /// + bool operator!=(mempool_iterator const &other) const { return !(*this == other); } + + /// + /// @brief Dereference iterator + /// + reference operator*(); + + private: + /// + /// @brief Reference to a constant extent descriptor instance + /// @details It contains the blob representing constant extent segment + /// + ConstExtentDesc const &_cedesc; + + /// + /// @brief Current index + /// + uint32_t _index; + + /// + /// @brief Mempool record entry + /// @details It is assigned when on iterator dereference + /// + value_type _entry; + }; + + /// + /// @brief Return mempool iterator initialized to the first record + /// @return Mempool iterator + /// + mempool_iterator begin() { return mempool_iterator(*this, 0); } + + /// + /// @brief Return mempool iterator beyond the last record + /// @warning Intended to be used as a sentinel + /// @return Mempool iterator + /// + mempool_iterator end() { return mempool_iterator(*this, mptab_n); } +#endif +}; +#ifndef PREPARE_DISABLED +// Called at the end of serializing a graph, if 'const extent' mode is enabled. +// See comment in const_extent_descriptor.cc for full details. +// LCOV_EXCL_START [SAFTYSWCCB-1542] +size_t write_aligned_const_info(Graph const &gr, Serializer &sctx, unsigned buried_aux_n_words = 0); +#else +inline constexpr size_t write_aligned_const_info(Graph const &gr, Serializer const &sctx, unsigned = 0) +{ + return 0; +} +// LCOV_EXCL_STOP +#endif + +} // namespace hnnx + +#endif // CONST_EXTENT_DESCRIPTOR_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h new file mode 100755 index 0000000000000..39c95e26ed561 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h @@ -0,0 +1,81 @@ +//============================================================================== +// +// Copyright (c) 2024 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONST_EXTENT_SHARED_H_ +#define CONST_EXTENT_SHARED_H_ + +namespace hnnx { +// definitions pertaining to the 'const extent descriptor'. + +constexpr unsigned CONST_EXTENT_DESC_MAGIC = 0x71c43c9b; +// if a const extent descriptor has a 'cbname' in it, the last 32-bit slot +// is this value. The 0x3e, 0x00 is the ">\0" at the end of the cbname +constexpr unsigned CONST_EXTENT_CBNAME_TAG = 0xebbe003e; + +// This must be a power of 2, and >= 64. +// This is effectively a 'quiet' minimum on options.serialize_const_alignment, which sets +// the actual alignment. +// It is not necessary for the decoder to know what value of alignment was used in the encoder. 
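+// Editor's illustration (not original SDK text): a serialize_const_alignment of 64
+// is quietly raised to CONST_EXTENT_MIN_ALIGN (256), while a value above
+// CONST_EXTENT_MAX_ALIGN (1 MiB) is an error, per the 'non-quiet' maximum below.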
+constexpr unsigned CONST_EXTENT_MIN_ALIGN = 256; +// +// this is a (non-quiet) maximum on options.serialize_const_alignment +constexpr unsigned CONST_EXTENT_MAX_ALIGN = 1024 * 1024; + +/// +/// @brief Size of const extent descriptor header +/// +constexpr unsigned CONST_EXTENT_HEADER_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_HEADER_SIZE_BYTES = CONST_EXTENT_HEADER_SIZE_WORDS * 4u; + +/// +/// @brief Size of an extent record +/// @details Const extent descriptor contains a table of such records +/// +constexpr unsigned CONST_EXTENT_RECORD_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_RECORD_SIZE_BYTES = CONST_EXTENT_RECORD_SIZE_WORDS * 4u; + +/// +/// @brief Offset of extent record table relative to const extent descriptor +/// @details Both byte and words offsets are listed +/// +constexpr unsigned CONST_EXTENT_RECORD_TAB_OFFSET_WORDS = 4u; +constexpr unsigned CONST_EXTENT_RECORD_TAB_OFFSET_BYTES = CONST_EXTENT_RECORD_TAB_OFFSET_WORDS * 4u; + +/// +/// @brief Size of mempool record in a const extent descriptor +/// @details Both byte and word sizes are provided +/// +constexpr unsigned CONST_EXTENT_MEMPOOL_RECORD_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_MEMPOOL_RECORD_SIZE_BYTES = CONST_EXTENT_MEMPOOL_RECORD_SIZE_WORDS * 4u; + +// This function is used by deserializer to help it extract the extent-desc table (as a vector) from some +// arbitrary point down the pickle. Parameter is a pointer to the first 4 words; the return value is +// 0 if the first two words do not look like CEDesc header; +// n otherwise (where 'n' is the number of 32-bit words to extract). +// +inline unsigned const_extent_hdr_check(uint32_t const *const hdrp) +{ + if (hdrp[0] != CONST_EXTENT_DESC_MAGIC) return 0; + const unsigned word0 = hdrp[1]; + const unsigned hdr_len16 = word0 >> 24u; // units of 16 bytes + const unsigned desc_len64 = word0 & 0xFFFFFFu; // units of 64 bytes + const unsigned n_extent = hdrp[2] & 0xFFFFFFu; + const unsigned n_mempool = hdrp[3] & 0xFFFFFFu; + // no. of words actually needed + const unsigned desc_words = 4 * (hdr_len16 + n_extent + n_mempool); + + // note, n_extent == n_mempool == 0 is allowed. + if (hdr_len16 == 0 || desc_len64 == 0 || n_extent > n_mempool || desc_words > desc_len64 * 16) { + return -1; + } + return desc_words; +} + +} // namespace hnnx + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h new file mode 100755 index 0000000000000..b30f7b8f5c871 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h @@ -0,0 +1,121 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONSTRAINTS_H +#define CONSTRAINTS_H + +#include "interface_defs.h" +#include "op_def.h" + +#include +#include + +namespace constraint_lib { + +/** \defgroup OptConstraint Constraint Expressions for Optimization Rules + * \ingroup OptimizationFuncs + * + * @{ + */ +//! Find the chunksize of a given tensor type in a given dimension (a constant). +/// For instance, LAYOUT_CHUNKSIZE(QUint8CroutonTensor,3) gives size_t(32) +/// +#define LAYOUT_CHUNKSIZE(TYPENAME, IDX) (TYPENAME::layout.ChunkSizes[(IDX)]) + +// some convenience wrappers... + +//! 
IS_FLOAT16("operand") -> bool (true if operand has Float16 output) +#define IS_FLOAT16(X) EQ(DTYPE_OF(X), DType::Float16) + +//! IS_FLOAT32("operand") -> bool (true if operand has float output) +#define IS_FLOAT32(X) EQ(DTYPE_OF(X), DType::Float32) + +//! IS_FLOAT("operand") -> bool (alias of IS_FLOAT32) +#define IS_FLOAT(X) IS_FLOAT32(X) + +//! IS_QUINT8("operand") -> bool (true if operand has 'QUInt8' output) +#define IS_QUINT8(X) EQ(DTYPE_OF(X), DType::QUInt8) + +//! IS_QINT8("operand") -> bool (true if operand has 'QInt8' output) +#define IS_QINT8(X) EQ(DTYPE_OF(X), DType::QInt8) + +//! IS_QINT16("operand") -> bool (true if operand has 'QInt16' output) +#define IS_QINT16(X) EQ(DTYPE_OF(X), DType::QInt16) + +//! IS_QUINT16("operand") -> bool (true if operand has 'QUInt16' output) +#define IS_QUINT16(X) EQ(DTYPE_OF(X), DType::QUInt16) + +//! IS_QINT32("operand") -> bool (true if operand has 'QInt32' output) +#define IS_QINT32(X) EQ(DTYPE_OF(X), DType::QInt32) +//! IS_INT32("operand") -> bool (true if operand has 'Int32' output) +#define IS_INT32(X) EQ(DTYPE_OF(X), DType::Int32) + +//! IS_INT64("operand") -> bool (true if operand has 'Int64' output) +#define IS_INT64(X) EQ(DTYPE_OF(X), DType::Int64) + +//! IS_QUANT_TYPE("operand") -> bool (true if operand has 'Quantized' output) +#define IS_QUANT_TYPE(X) OR(IS_QUINT8(X), IS_QINT8(X), IS_QINT16(X), IS_QUINT16(X), IS_QINT32(X)) +//! IS_QUANT_SIGNED("operand") -> bool (true if operand has 'Signed Quantized' output) +#define IS_QUANT_SIGNED(X) OR(IS_QINT32(X), IS_QINT16(X), IS_QINT8(X)) +//! IS_SIGNED_SYMM("operand") -> bool (true if operand has 'Signed Quantized' output with offset == 0) +#define IS_SIGNED_SYMM(X) AND(IS_QUANT_SIGNED(X), EQ(ZERO_OFFSET_OF(X), 0)) + +// The problem with IS_SIGNED_SYMM is that it tends to get used as +// AND( IS_QINT8(X), IS_SIGNED_SYMM(X)) +// which expands to X.dtype==qint8 && ( (X.dtype ==qint32 || X.dtype == .. ) && X.zero_offs == 0) +// So, use IS_QINT8_SYMM(X) etc instead. + +//! IS_QINT8_SYMM("operand") -> bool (true if operand has QINT8 output with offset == 0) +#define IS_QINT8_SYMM(X) AND(IS_QINT8(X), EQ(ZERO_OFFSET_OF(X), 0)) +//! IS_QINT16_SYMM("operand") -> bool (true if operand has QINT16 output with offset == 0) +#define IS_QINT16_SYMM(X) AND(IS_QINT16(X), EQ(ZERO_OFFSET_OF(X), 0)) +//! IS_QINT32_SYMM("operand") -> bool (true if operand has QINT32 output with offset == 0) +#define IS_QINT32_SYMM(X) AND(IS_QINT32(X), EQ(ZERO_OFFSET_OF(X), 0)) + +//! IS_FULLY_CONNECT_WEIGHT("operand") -> bool (true if operand is QUInt8 or (QInt8 and symmetrically quantized)) +#define IS_FULLY_CONNECT_WEIGHT(X) OR(IS_QUINT8(X), IS_QINT8_SYMM(X)) + +//! IS_FLOAT16_BOTH("operand", "operand") -> bool (true if both operands are FP16 type) +#define IS_FLOAT16_BOTH(X, Y) AND(IS_FLOAT16(X), IS_FLOAT16(Y)) +//! IS_FLOAT16_ALL("operand", ...) -> bool (true if all operands are FP16 type) +#define IS_FLOAT16_ALL(...) IS_DTYPE_ALL(DType::Float16, __VA_ARGS__) +//! IS_FLOAT32_ALL("operand", ...) -> bool (true if all operands are FP32 type) +#define IS_FLOAT32_ALL(...) IS_DTYPE_ALL(DType::Float32, __VA_ARGS__) + +//! DIM_CHANNEL("operand") -> unsigned (extract depth dimension, #4) +#define DIM_CHANNEL(X) DIM_OF(X, 4) +//! DIM_DEPTH("operand") -> unsigned (extract depth dimension, #3) +#define DIM_DEPTH(X) DIM_OF(X, 3) +//! DIM_WIDTH("operand") -> unsigned (extract width dimension, #2) +#define DIM_WIDTH(X) DIM_OF(X, 2) +//! DIM_HEIGHT("operand") -> unsigned (extract height dimension, #1) +#define DIM_HEIGHT(X) DIM_OF(X, 1) +//! 
DIM_BATCHES("operand") -> unsigned (extract batches dimension, #0) +#define DIM_BATCHES(X) DIM_OF(X, 0) + +//! DIM_NFILTS("operand") -> unsigned (extract 'output depth' dimension from filter weights, #3) +#define DIM_NFILTS(X) DIM_OF(X, 3) +//! DIM_FILTDEPTH("operand") -> unsigned (extract 'input depth' dimension from filter weights, #2) +#define DIM_FILTDEPTH(X) DIM_OF(X, 2) +//! DIM_FILTWIDTH("operand") -> unsigned (extract 'filter width' dimension from filter weights, #1) +#define DIM_FILTWIDTH(X) DIM_OF(X, 1) +//! DIM_FILTHEIGHT("operand") -> unsigned (extract 'filter height' dimension from filter weights, #0) +#define DIM_FILTHEIGHT(X) DIM_OF(X, 0) + +#define MAX_SPARSE_ELEMENTS(X) DIM_OF(X, (MAX_DIMENSIONS - 1)) + +//! IS_EMPTY_DIM("operand", dim) -> bool (true if size of dim is 0) +#define IS_EMPTY_DIM(X, DIM) EQ(DIM_OF(X, DIM), 0) + +//! IS_EMPTY("operand") -> bool (true if size of all dims is 0) +#define IS_EMPTY(X) AND(IS_EMPTY_DIM(X, 0), IS_EMPTY_DIM(X, 1), IS_EMPTY_DIM(X, 2), IS_EMPTY_DIM(X, 3)) + +} // namespace constraint_lib +/** @} */ + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h new file mode 100755 index 0000000000000..4cb348c637953 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h @@ -0,0 +1,609 @@ +//============================================================================== +// +// Copyright (c) 2018 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONVERSIONS_H +#define CONVERSIONS_H + +#include +#include +#include +#include +#include + +#include "builtin_intrinsics.h" + +#ifdef __hexagon__ +#include "hexagon_protos.h" +#endif + +#include "float16.h" + +#if defined(__clang__) +#define ATTR_NO_SANITIZE(CATEGORY) __attribute__((no_sanitize(CATEGORY))) +#else +#define ATTR_NO_SANITIZE(CATEGORY) /*empty */ +#endif + +namespace hnnx { + +namespace scast { + +// for a given floating type F, and a integer type TI, +// intrange_within_float::max() +// generates the largest value representable in type F which will fit into TI without overflow. +// in many cases this is F(std::numeric_limits::max()), +// but there are exceptions when the mantissa of F is narrower than TI; in those cases we +// want the representable value which is smaller than the integer's max value, not the nearest: +// F TI +// Float16 int16 32752.0 (0x7ff0) +// Float15 uint16 65504.0 (0xffe0) +// float int32 2147483520.0 (0x7fffff80) +// float uint32 4294967040.0 (0xFFFFFF00) +// float int64 9.223371487e18 (0x7fff_ff80_0000_0000) +// float uint64 1.844674297e+19 (0xFFFF_FF00__0000_0000) +// double int64 9223372036854774784.0 (0x7FFF_FFFF_FFFF_FC00) +// double uint64 18446744073709549568.0 (0xFFFF_FFFF_FFFF_F800) +// +// All of the 'min' limits are zero or powers of 2, so those can be converted +// directly from std::numeric_limits::min() +// +// +template struct intrange_within_float { +}; + +// LCOV_EXCL_START [SAFTYSWCCB-1736] constexprs resolved during compile time +template struct intrange_within_float { + static_assert(std::numeric_limits::is_integer); + static inline constexpr Float16 max() + { + if constexpr (sizeof(TI) < 2) { + return Float16(std::numeric_limits::max()); + } else if constexpr (sizeof(TI) == 2) { + return std::numeric_limits::is_signed ? 
                                                      Float16(32752.0f) : Float16(65504.0f);
+        } else {
+            return Float16(65504.0f);
+        }
+    }
+    // 'min' value of integer range is always exactly representable
+    static inline constexpr Float16 min() { return Float16(std::numeric_limits<TI>::min()); }
+};
+
+template <typename TI> struct intrange_within_float<float, TI> {
+    static_assert(std::numeric_limits<TI>::is_integer);
+    static inline constexpr float max()
+    {
+        if constexpr (sizeof(TI) < 4) {
+            return float(std::numeric_limits<TI>::max());
+        } else if constexpr (sizeof(TI) == 4) {
+            return std::numeric_limits<TI>::is_signed ? 2147483520.0f : 4294967040.0f;
+        } else {
+            static_assert(sizeof(TI) == 8);
+            return std::numeric_limits<TI>::is_signed ? 9.223371487e18f : 1.844674297e+19f;
+        }
+    }
+    // 'min' value of integer range is always exactly representable
+    static inline constexpr float min() { return float(std::numeric_limits<TI>::min()); }
+};
+
+template <typename TI> struct intrange_within_float<double, TI> {
+    static_assert(std::numeric_limits<TI>::is_integer);
+    static inline constexpr double max()
+    {
+        if constexpr (sizeof(TI) < 8) {
+            return double(std::numeric_limits<TI>::max());
+        } else {
+            static_assert(sizeof(TI) == 8);
+            return std::numeric_limits<TI>::is_signed ? 9223372036854774784.0 : 18446744073709549568.0;
+        }
+    }
+    // 'min' value of integer range is always exactly representable
+    static inline constexpr double min() { return double(std::numeric_limits<TI>::min()); }
+};
+// LCOV_EXCL_STOP
+
+template <typename TOUT, typename TIN> struct satcast_helper {
+    static_assert(std::numeric_limits<TOUT>::is_specialized && std::numeric_limits<TIN>::is_specialized);
+    static inline TOUT constexpr op(TIN val)
+    {
+        if constexpr (!std::numeric_limits<TOUT>::is_integer) { // convert to a float
+            return TOUT(val);
+        } else {
+            constexpr bool OUTS = std::numeric_limits<TOUT>::is_signed;
+            if constexpr (std::numeric_limits<TIN>::is_integer) {
+                // integer to integer.
+                // widening? or same width, same signedness?
+                constexpr bool INS = std::numeric_limits<TIN>::is_signed;
+                if (sizeof(TOUT) > sizeof(TIN) || (sizeof(TOUT) == sizeof(TIN) && OUTS == INS)) {
+                    // if the output is unsigned and the input < 0, return 0
+                    // otherwise it's a normal cast.
+                    return (!OUTS && INS && val < 0) ? TOUT(0) : TOUT(val);
+                } else if (sizeof(TOUT) == sizeof(TIN)) {
+                    if (!OUTS) { // same size, different signs
+                        return (val < 0) ? (TOUT)0 : (TOUT)val; // signed->unsigned
+                    } else {
+                        constexpr TIN lim = std::numeric_limits<TOUT>::max();
+                        return (val > lim) ? (TOUT)lim : (TOUT)val;
+                    }
+                } else {
+                    // narrowing conversion
+                    if (!OUTS) {
+                        constexpr TIN m = std::numeric_limits<TOUT>::max();
+                        return (val < 0) ? TOUT(0) : (val > m) ? TOUT(m) : TOUT(val);
+                    } else {
+                        constexpr TIN mn = INS ? std::numeric_limits<TOUT>::min() : 0;
+                        constexpr TIN mx = std::numeric_limits<TOUT>::max();
+                        return (val < mn) ? TOUT(mn) : (val > mx) ?
                                                                  TOUT(mx) : TOUT(val);
+                    }
+                }
+            } else { // float to integer
+                if constexpr (sizeof(TOUT) <= sizeof(int32_t)) {
+                    if constexpr (OUTS) {
+                        constexpr TIN loval = intrange_within_float<TIN, int32_t>::min();
+                        constexpr TIN hival = intrange_within_float<TIN, int32_t>::max();
+                        int32_t const tmp = (int32_t)std::max(loval, std::min(hival, val));
+                        return satcast_helper<TOUT, int32_t>::op(tmp);
+                    } else {
+                        constexpr TIN loval = 0.0;
+                        constexpr TIN hival = intrange_within_float<TIN, uint32_t>::max();
+                        uint32_t const tmp = (uint32_t)std::max(loval, std::min(hival, val));
+                        return satcast_helper<TOUT, uint32_t>::op(tmp);
+                    }
+                } else { // 64-bit output assumed
+                    constexpr TIN loval = intrange_within_float<TIN, TOUT>::min();
+                    constexpr TIN hival = intrange_within_float<TIN, TOUT>::max();
+                    return (TOUT)std::max(loval, std::min(hival, val));
+                }
+            }
+        }
+    }
+};
+// specialize for conversion to same
+template <typename TT> struct satcast_helper<TT, TT> {
+    static_assert(std::numeric_limits<TT>::is_specialized);
+    static inline TT constexpr op(TT val) { return val; }
+};
+
+#ifdef __hexagon__
+
+// saturate to types <= int.
+template <typename T> struct q6_sat_int {
+};
+template <> struct q6_sat_int<int8_t> {
+    static inline int op(int x) { return Q6_R_satb_R(x); }
+};
+template <> struct q6_sat_int<uint8_t> {
+    static inline int op(int x) { return Q6_R_satub_R(x); }
+};
+template <> struct q6_sat_int<int16_t> {
+    static inline int op(int x) { return Q6_R_sath_R(x); }
+};
+template <> struct q6_sat_int<uint16_t> {
+    static inline int op(int x) { return Q6_R_satuh_R(x); }
+};
+
+// TODO: these should be done again for 'long' if long is also 32 bits.
+#if 0 // NOTE: we can't really do this unless intrinsics are constexpr
+template <> struct satcast_helper<uint8_t, int> {
+    static inline uint8_t /*constexpr*/ op(int val)
+    {
+        return Q6_R_satub_R(val);
+    }
+};
+template <> struct satcast_helper<int8_t, int> {
+    static inline int8_t /*constexpr*/ op(int val) { return Q6_R_satb_R(val); }
+};
+template <> struct satcast_helper<uint16_t, int> {
+    static inline uint16_t /*constexpr*/ op(int val)
+    {
+        return Q6_R_satuh_R(val);
+    }
+};
+template <> struct satcast_helper<int16_t, int> {
+    static inline int16_t /*constexpr*/ op(int val) { return Q6_R_sath_R(val); }
+};
+#endif
+
+#endif
+} // end namespace scast
+
+} // namespace hnnx
+
+/**
+ * @brief saturate_cast<TOUT>( TIN val ) will work on any two numeric types;
+ * if the input is outside the numeric range of the output type, it
+ * will be range-limited.
+ *
+ * it works as follows:
+ *  * if TOUT is a floating type, the operation is the same as the C++ cast.
+ *  * if TOUT is integer and TIN is float, the input is first converted
+ *    to one of int32, uint32, int64, uint64 ensuring that out-of-range values
+ *    are clipped; and then converted to the output type as below (if it is smaller
+ *    than 32 bits) (The 2-step conversion is intended to work well when things
+ *    are specialized to support native hexagon ops).
+ *  * Otherwise they are both integers.
+ *     - If the output width is larger than the input (or if they are the same size
+ *       and of the same signedness):
+ *        * if the output is unsigned, and the input is < 0, the result is zero
+ *        * otherwise the result is the same as a C++ cast (all values representable)
+ *     - Otherwise, it is a saturating cast; values are limited to the range of TOUT.
+ */
+template <typename TOUT, typename TIN> inline constexpr TOUT saturate_cast(TIN val)
+{
+    return hnnx::scast::satcast_helper<TOUT, TIN>::op(val);
+}
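// Example (annotation, not part of the patch): compile-time checks of the
// saturate_cast rules documented above, assuming this header is reachable as
// "conversions.h" and that the constexpr qualifiers hold on the host build.
// The expected values follow directly from the rules in the comment block.
#include <cstdint>
#include "conversions.h"

static_assert(saturate_cast<int8_t>(300) == 127);             // narrowing int saturates high
static_assert(saturate_cast<uint8_t>(-5) == 0);               // unsigned output, negative input -> 0
static_assert(saturate_cast<int16_t>(70000u) == 32767);       // unsigned -> signed, clipped
static_assert(saturate_cast<int32_t>(3.0e12f) == 2147483520); // float clipped to 2147483520.0f first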
+/**
+ * @brief T saturate_round<T>( float val )
+ * round val to nearest int, and saturate to range of T.
+ *
+ * T must be an integer type, at most 64 bits.
+ */
+// For general C platform, we need to clip the range before converting to int;
+// for hexagon the conversions saturate.
+//
+#ifndef __hexagon__
+template <typename TOUT> inline TOUT saturate_round(float val)
+{
+    static_assert(sizeof(TOUT) <= 8 && std::numeric_limits<TOUT>::is_integer);
+    return saturate_cast<TOUT>(std::nearbyintf(val));
+}
+
+#else
+template <typename TOUT> inline TOUT saturate_round(float val)
+{
+    static_assert(sizeof(TOUT) <= 8 && std::numeric_limits<TOUT>::is_integer);
+    if constexpr ((sizeof(TOUT) == 8) && !std::numeric_limits<TOUT>::is_signed) {
+        // convert to unsigned u64, rounding, saturating
+        return Q6_P_convert_sf2ud_R(val);
+    } else if constexpr ((sizeof(TOUT) == 8) && std::numeric_limits<TOUT>::is_signed) {
+        // convert to int64, rounding
+        return Q6_P_convert_sf2d_R(val);
+    } else if constexpr ((sizeof(TOUT) == 4) && !std::numeric_limits<TOUT>::is_signed) {
+        // convert to unsigned u32, rounding, saturating
+        return Q6_R_convert_sf2uw_R(val);
+    } else {
+        // convert to int32, rounding;
+        int const r = Q6_R_convert_sf2w_R(val);
+        if constexpr (sizeof(TOUT) < 4) return static_cast<TOUT>(hnnx::scast::q6_sat_int<TOUT>::op(r));
+        return static_cast<TOUT>(r); // LCOV_EXCL_LINE [SAFTYSWCCB-1736]
+    }
+}
+#endif
+
+namespace hnnx {
+
+/**
+ * @brief 'proper' compare of any two integer types
+ *    proper_gt( a, b) => a > b;
+ * E.g. if a is unsigned and b is signed, the operation checks to see if b is < 0;
+ * if so, the result is true; otherwise an unsigned compare is done: a > (unsigned)b
+ *
+ */
+namespace prpercmp {
+
+/**
+ * @brief if both A and B are either *int*, or smaller than int,
+ * then promote them both to int and compare them.
+ *
+ * otherwise, if TA is wider than TB, (or the same, with TA unsigned):
+ *    promote b to TA, and then compare them.
+ *    Exception: if TA is unsigned and TB is signed and b < 0, then a > b.
+ * otherwise the symmetric rule applies, promoting a to TB.
+ */
+template <typename TA, typename TB> struct proper_cmp_helper {
+    static_assert(std::numeric_limits<TA>::is_integer && std::numeric_limits<TB>::is_integer);
+    static const bool ASIGNED = std::numeric_limits<TA>::is_signed;
+    static const bool BSIGNED = std::numeric_limits<TB>::is_signed;
+
+    // compare by promoting both to int, when...
+    static const bool CMP_AS_INT = (sizeof(TA) < sizeof(int) || (sizeof(TA) == sizeof(int) && ASIGNED)) &&
+                                   (sizeof(TB) < sizeof(int) || (sizeof(TB) == sizeof(int) && BSIGNED));
+    // otherwise, compare by promoting B to A when ...
+    static const bool B_TO_A = sizeof(TA) > sizeof(TB) || (sizeof(TA) == sizeof(TB) && !ASIGNED);
+    // otherwise, compare by promoting A to B
+
+    static inline bool constexpr eq(TA a, TB b)
+    {
+        if (CMP_AS_INT) {
+            return (int)a == (int)b;
+        } else if (B_TO_A) {
+            if (!ASIGNED && BSIGNED && b < 0) return false;
+            return a == (TA)b;
+        } else {
+            if (!BSIGNED && ASIGNED && a < 0) return false;
+            return (TB)a == b;
+        }
+    }
+    static inline bool constexpr lt(TA a, TB b)
+    {
+        if (CMP_AS_INT) {
+            return (int)a < (int)b;
+        } else if (B_TO_A) {
+            if (!ASIGNED && BSIGNED && b < 0) return false; // a < b always false if b<0
+            return a < (TA)b;
+        } else {
+            if (!BSIGNED && ASIGNED && a < 0) return true; // a < b always true if a<0
+            return (TB)a < b;
+        }
+    }
+};
+/**
+ * @brief specialize for comparison to same type
+ */
+template <typename T> struct proper_cmp_helper<T, T> {
+    static_assert(std::numeric_limits<T>::is_integer);
+    static inline bool constexpr eq(T a, T b) { return a == b; }
+    static inline bool constexpr lt(T a, T b) { return a < b; }
+};
+
+} // end namespace prpercmp
+
+} // namespace hnnx
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value.
+ *    proper_eq(a,b) => a == b;
+ *
+ * E.g. if a is signed and <0, and b is unsigned, result will always be false.
+ *
+ */
+template <typename TA, typename TB> inline bool constexpr proper_eq(TA a, TB b)
+{
+    return hnnx::prpercmp::proper_cmp_helper<TA, TB>::eq(a, b);
+}
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value
+ *    proper_ne(a,b) => !proper_eq(a,b);
+ */
+template <typename TA, typename TB> inline bool constexpr proper_ne(TA a, TB b)
+{
+    return !hnnx::prpercmp::proper_cmp_helper<TA, TB>::eq(a, b);
+}
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value
+ *    proper_lt(a,b) => a < b;
+ */
+template <typename TA, typename TB> inline bool constexpr proper_lt(TA a, TB b)
+{
+    return hnnx::prpercmp::proper_cmp_helper<TA, TB>::lt(a, b);
+}
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value
+ *    proper_ge(a,b) => a>=b;
+ */
+template <typename TA, typename TB> inline bool constexpr proper_ge(TA a, TB b)
+{
+    return !hnnx::prpercmp::proper_cmp_helper<TA, TB>::lt(a, b);
+}
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value
+ *    proper_gt(a,b) => a>b;
+ */
+template <typename TA, typename TB> inline bool constexpr proper_gt(TA a, TB b)
+{
+    return hnnx::prpercmp::proper_cmp_helper<TB, TA>::lt(b, a);
+}
+/**
+ * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value
+ *    proper_le(a,b) => a<=b;
+ */
+template <typename TA, typename TB> inline bool constexpr proper_le(TA a, TB b)
+{
+    return !hnnx::prpercmp::proper_cmp_helper<TB, TA>::lt(b, a);
+}
+/**
+ * @brief x >= lo && x < limit, using proper compares
+ */
+template <typename TA, typename TB, typename TC> inline bool constexpr proper_inrange(TA x, TB lo, TC limit)
+{
+    return proper_ge(x, lo) && proper_lt(x, limit);
+}
+
+/**
+ * @brief x >= lo && x <= hi, using proper compares
+ */
+template <typename TA, typename TB, typename TC> inline bool constexpr proper_inrange_closed(TA x, TB lo, TC hi)
+{
+    return proper_ge(x, lo) && proper_le(x, hi);
+}
+
+/**
+ * @brief find the 'width' of an unsigned value (# of bits needed to contain it)
+ * this is floor( log2(x))+1
+ * (and 0 when x = 0)
+ *
+ */
+inline int constexpr binary_bitwidth(unsigned x)
+{
+    return (x == 0) ? 0 : (sizeof(unsigned) * 8 - HEX_COUNT_LEADING_ZERO(x));
+}
+/**
+ * @brief find the 'width' of an unsigned long value (# of bits needed to contain it)
+ * this is floor( log2(x))+1
+ * (and 0 when x = 0)
+ *
+ */
+inline int constexpr binary_bitwidth(unsigned long x)
+{
+    return (x == 0) ? 0 : (sizeof(unsigned long) * 8 - HEX_COUNT_LEADING_ZERO_UL(x));
+}
+/**
+ * @brief find the 'width' of an unsigned long long value (# of bits needed to contain it)
+ * this is floor( log2(x))+1
+ * (and 0 when x = 0)
+ *
+ */
+inline int constexpr binary_bitwidth(unsigned long long x)
+{
+    return (x == 0) ? 0 : (sizeof(unsigned long long) * 8 - HEX_COUNT_LEADING_ZERO_ULL(x));
+}
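// Example (annotation, not part of the patch): what the 'proper' compares fix.
// A built-in mixed signed/unsigned compare converts the signed side to
// unsigned and gets the numerically wrong answer; proper_lt/proper_eq respect
// the actual values. Assumes this header is reachable as "conversions.h".
#include "conversions.h"

static_assert((-1 < 1u) == false);                  // built-in: -1 converts to UINT_MAX
static_assert(proper_lt(-1, 1u) == true);           // proper: -1 really is smaller
static_assert(proper_eq(0xFFFFFFFFu, -1) == false); // distinct values, distinct results
static_assert(proper_inrange(-1, 0u, 10u) == false);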
+/**
+ * @brief saturating u32+u32 add
+ */
+inline uint32_t /*constexpr*/ addu32_sat(uint32_t a, uint32_t b)
+{
+    uint64_t const sum = (uint64_t)a + b;
+    return saturate_cast<uint32_t>(sum);
+}
+
+/**
+ * @brief saturating i32+i32 add
+ */
+inline int32_t /*constexpr*/ addi32_sat(int32_t a, int32_t b)
+{
+#ifdef __hexagon__
+    return Q6_R_add_RR_sat(a, b);
+#else
+    int64_t const sum = (int64_t)a + b;
+    return saturate_cast<int32_t>(sum);
+#endif
+}
+
+/**
+ * @brief saturating u32xu32 multiply
+ */
+inline uint32_t constexpr mulu32_sat(uint32_t a, uint32_t b)
+{
+    uint64_t const prod = (uint64_t)a * b;
+    return saturate_cast<uint32_t>(prod);
+}
+
+/**
+ * @brief saturating i32xi32 multiply
+ */
+inline int32_t constexpr muli32_sat(int32_t a, int32_t b)
+{
+    int64_t const prod = (int64_t)a * b;
+    return saturate_cast<int32_t>(prod);
+}
+
+/**
+ * @brief saturating u64xu64 multiply
+ */
+inline uint64_t /*constexpr*/ mulu64_sat(uint64_t a, uint64_t b)
+{
+    uint64_t prod = 0;
+    if (HEX_MUL_OVERFLOW(a, b, &prod)) {
+        prod = std::numeric_limits<uint64_t>::max();
+    }
+    return prod;
+}
+
+/**
+ * @brief saturating i64xi64 multiply
+ */
+inline int64_t /*constexpr*/ muli64_sat(int64_t a, int64_t b)
+{
+    int64_t prod = 0;
+    if (HEX_MUL_OVERFLOW(a, b, &prod)) {
+        prod = (int64_t(uint64_t(a) ^ uint64_t(b)) >= 0) ? std::numeric_limits<int64_t>::max()
+                                                         : std::numeric_limits<int64_t>::min();
+    }
+    return prod;
+}
+/**
+ * @brief add unsigned+unsigned->unsigned, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline unsigned constexpr addu32_modular(unsigned a, unsigned b)
+{
+    return a + b;
+}
+/**
+ * @brief subtract unsigned-unsigned->unsigned, escaping 'unsigned overflow' checks
+ * For '-unsigned_var', use subu32_modular(0,unsigned_var)
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline unsigned constexpr subu32_modular(unsigned a, unsigned b)
+{
+    return a - b;
+}
+/**
+ * @brief multiply unsigned*unsigned->unsigned, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline unsigned constexpr mulu32_modular(unsigned a, unsigned b)
+{
+    return a * b;
+}
+/**
+ * @brief mul-add u32*u32+u32->u32, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline unsigned constexpr muladdu32_modular(unsigned a, unsigned b, unsigned c)
+{
+    return a * b + c;
+}
+
+/**
+ * @brief add u64+u64->u64, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline uint64_t constexpr addu64_modular(uint64_t a, uint64_t b)
+{
+    return a + b;
+}
+
+/**
+ * @brief subtract u64-u64->u64, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline uint64_t constexpr subu64_modular(uint64_t a, uint64_t b)
+{
+    return a - b;
+}
+/**
+ * @brief mul u64*u64->u64, escaping 'unsigned overflow' checks
+ */
+ATTR_NO_SANITIZE("unsigned-integer-overflow")
+inline uint64_t constexpr mulu64_modular(uint64_t a, uint64_t b)
+{
+    return a * b;
+}
+
+/**
+ * @brief 'image' conversion from TIN to TOUT (which must be the same size)
+ * e.g. image_convert<uint32_t>( 1.25f) -> 0x3fa00000
+ */
+
+template <typename TOUT, typename TIN> inline constexpr TOUT image_convert(TIN x)
+{
+    static_assert(sizeof(TOUT) == sizeof(TIN));
+    static_assert(std::is_trivially_copyable_v<TIN>);
+    static_assert(std::is_trivially_copyable_v<TOUT>);
+    static_assert(std::is_trivially_constructible_v<TOUT>);
+    TOUT out;
+    std::memcpy(&out, &x, sizeof(TOUT));
+    return out;
+}
+
+// round up A to a multiple of B.
+// b is expected to be > 0 even if signed.
+
+template <typename TD> inline constexpr size_t round_up(size_t a, TD b)
+{
+    static_assert(std::is_integral_v<TD>, "round_up can only apply to integer types");
+    // for b being a power of 2, this should compile as (a+(b-1)) &~(b-1)
+    return b * ((a + (b - 1)) / b);
+}
+// for int, b is expected to be > 0;
+// this will work for negative a, e.g. round_up(-53,10) -> -50
+template <typename TD> inline constexpr int round_up(int a, TD b)
+{
+    static_assert(std::is_integral_v<TD>, "round_up can only apply to integer types");
+    int const bi = b;
+    int const tmp = a + ((a > 0) ? (bi - 1) : 0);
+    return bi * (tmp / bi);
+}
+
+#endif /*CONVERSIONS_H*/
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h
new file mode 100755
index 0000000000000..8f0b21ccb86e5
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h
@@ -0,0 +1,38 @@
+//==============================================================================
+//
+// Copyright (c) 2020 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef COST_H
+#define COST_H 1
+
+// NOTE: WHATCOST may be something like SNAIL/128
+#define COST_OF(FUNC, WHATCOST) COST_OF_OP(typename DerivedType<(FUNC)>::type, WHATCOST)
+#define COST_OF_F(FUNC, WHATCOSTFN) COST_OF_OP_F(typename DerivedType<(FUNC)>::type, WHATCOSTFN)
+
+#ifdef PREPARE_DISABLED
+#define COST_OF_OP(OP, WHATCOST)
+#define COST_OF_OP_F(OP, WHATCOSTFN)
+#else
+#define COST_OF_OP(OP, WHATCOST)                                                               \
+    template <> [[maybe_unused]] constexpr hnnx::cost_function_t hnnx::get_costf<OP>()         \
+    {                                                                                          \
+        return hnnx::cost_function_t(float(StandardCosts::WHATCOST));                          \
+    }
+
+#define COST_OF_OP_F(OP, WHATCOSTFN)                                                                               \
+    template <>                                                                                                    \
+    float hnnx::cost_function_t::cfunc<OP>(hnnx::cost_function_t const &, const Graph &graph_in, const Op *op)     \
+    {                                                                                                              \
+        return WHATCOSTFN(graph_in, op);                                                                           \
+    }                                                                                                              \
+    template <> [[maybe_unused]] constexpr hnnx::cost_function_t hnnx::get_costf<OP>()                             \
+    {                                                                                                              \
+        return hnnx::cost_function_t(hnnx::cost_function_t::cfunc<OP>, 1.0);                                       \
+    }
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h
new file mode 100755
index 0000000000000..286945b9b34b8
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h
@@ -0,0 +1,56 @@
+//=============================================================================
+//
+// Copyright (c) 2020 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//============================================================================
+
+#ifndef COST_FUNCS_H
+#define COST_FUNCS_H
+#include <string_view>
+#include <utility>
+#include "weak_linkage.h"
+#include "macros_attribute.h"
+PUSH_VISIBILITY(default)
+
+class Graph;
+class Op;
+
+namespace hnnx {
+
+class API_EXPORT cost_function_t {
+    using inner_func_t = float (*)(cost_function_t const &, const Graph &, Op const *);
+    inner_func_t funcp;
+    float val;
+
+  public:
+    cost_function_t(cost_function_t const &) = default;
+    cost_function_t &operator=(cost_function_t const &) = default;
+    constexpr explicit cost_function_t(float val_in) : funcp(simple_cost_function), val(val_in) {}
+    constexpr cost_function_t(inner_func_t f, float val_in) : funcp(f), val(val_in) {}
+    constexpr cost_function_t() noexcept : funcp(simple_cost_function), val(0.0f) {}
+
+    inline float operator()(const Graph &graph_in, Op const *op) const { return (*funcp)(*this, graph_in, op); }
+    static float simple_cost_function(cost_function_t const &self, const Graph &, Op const *)
+    {
+        return self.val;
+    } // just returns val;
+
+    float get_val() const { return val; }
+
+    // unreliable compare for two cost funcs: returns -1,0,1 if this cost
+    // is <,=,> than rhs cost, with the second result being true; or <0,false>
+    // if it can't tell.
+    std::pair<int, bool> compare(cost_function_t const &rhs) const;
+
+    template <typename OP> static float cfunc(cost_function_t const &, const Graph &, Op const *);
+};
+
+API_EXPORT cost_function_t cost_func_from_str(std::string_view);
+
+} // namespace hnnx
+
+POP_VISIBILITY()
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h
new file mode 100755
index 0000000000000..494f51e40fa0f
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h
@@ -0,0 +1,471 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/*
+ * crate.h
+ *
+ *  Created on: Aug 1, 2019
+ *      Author: smithg
+ */
+
+#ifndef CRATE_H_
+#define CRATE_H_
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <stdexcept>
+#include <vector>
+
+#include "is_detected.h"
+#include "forward_classes.h"
+#include "macros_attribute.h"
+#include "weak_linkage.h"
+#include "size_align_code.h"
+
+PUSH_VISIBILITY(default)
+
+class Graph;
+class Tensor;
+
+/// @brief A 'Crate' allows construction of some number of different data types,
+/// contiguously packed into a few large memory blocks.
+///
+/// Example:
+///
+///    Crate crt;
+///    Thing * tp = crt.emplace<Thing>( ... ctor parms for Thing ... );
+///    AnotherThing * tp2 = crt.emplace<AnotherThing>( ... ctor parms for AnotherThing ... );
+///
+/// When the crate is destroyed, all of the contained objects are destroyed in the reverse
+/// order. You cannot 'remove' a single entry:
+///
+///    crt.erase has been deprecated
+///
+/// (When erase was supported, it did not free any memory; it just called the dtor of the
+/// object, and made sure it didn't get called later, when the Crate was cleared or destroyed.)
+///
+/// You can also emplace variable-sized arrays of trivially-destructible objects.
+///
+/// alloc_array does not initialize:
+///
+///    float * farr = crt.alloc_array<float>(n);
+///
+/// alloc_array_zero does zero-initializing:
+///
+///    int * iarr = crt.alloc_array_zero<int>(n);
+///
+/// If an allocation needs space larger than CHUNKBYTES, it will get its own chunk.
+///
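// Example (annotation, not part of the patch): minimal use of the Crate API
// documented above, assuming "crate.h" is on the include path and the Crate
// implementation is linked in. 'Widget' is a hypothetical payload type.
#include <string>
#include "crate.h"

namespace {
struct Widget {
    std::string name; // non-trivial dtor: Crate registers it and runs it on clear()
    explicit Widget(const char *n) : name(n) {}
};

void crate_usage_sketch()
{
    hnnx::Crate crt;
    Widget *const w = crt.emplace<Widget>("conv0"); // placement-new into a chunk
    float *const buf = crt.alloc_array<float>(16);  // uninitialized POD array
    (void)w;
    (void)buf;
    crt.clear(nullptr); // dtors run in reverse order; ~Crate() does not do this
}
} // namespace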
+// Each record containing an object has a non-null 'dtor' field; if the object is trivially destructible,
+// this will be (dtor_funcp)1, and the object is not on the linked-list.
+//
+// note:
+// A constructor may emplace additional records in the crate recursively. Likewise,
+// it's OK if the dtors call erase() on other objects. If this happens during a 'clear',
+// the erase calls are ignored since the other objects are going to get dtor'd anyhow (if they have not
+// been already).
+// Important: if object A's constructor places B into the crate, then B will very likely get destroyed
+// first when the crate is cleared. Thus, A's destructor can't look at B (it can erase B, which is ignored
+// as described above).
+
+//
+// new 'raw' mode:
+//  - when the crate is in 'raw' mode, no destructors are registered. inserting an object
+//    increases 'alloc_count' in the chunk header, but does not increment 'nrec', nor
+//    does it increase Crate::m_records.
+//  - raw mode is entered by enable_raw_mode(size_needed):
+//    which does this in addition to enabling raw mode:
+//      - if there is no current chunk, or if the current chunk doesn't have room for 'size_needed' bytes,
+//        a new chunk is added which does.
+//      - enable_raw_mode(size_needed) returns a chunk handle.
+//
+// Internally, raw_mode causes add_record_slot() to do the same thing, but it only moves alloc_count, it does
+// not assign a slot index, and 'idx' is -1 in the returned struct.
+// All callers of add_record_slot() *must* check for raw mode (can be done by checking idx < 0), and then avoid
+// adding a dtor or doing '++m_records'.
+//
+// it's also possible to call .enable_raw_mode(), disable_raw_mode()
+// but .enable_raw_mode() does nothing if there isn't at least one chunk allocated.
+//
+
+namespace hnnx {
+
+//
+// This is used to statically determine whether a type T has a clear(Graph&)
+// method. This is used as an additional destructor which takes a Graph
+// reference.
+//
+
+template <typename T> using clear_t = decltype(std::declval<T>().clear(std::declval<Graph *>()));
+
+template <typename T> constexpr bool has_clear = is_detected_v<clear_t, T>;
+
+class Deserz;
+class DCrate;
+
+class Crate {
+    API_EXPORT static constexpr size_t CHUNKBYTES = (1 << 16);
+    static_assert(CHUNKBYTES % 8 == 0 && CHUNKBYTES >= 128);
+    typedef void (*dtor_funcp)(Graph *graph_in, void *);
+    API_EXPORT static dtor_funcp DTOR_TRIVIAL() { return (dtor_funcp)1; }
+    API_EXPORT static dtor_funcp DTOR_IN_PROCESS() { return (dtor_funcp)2; }
+
+    //! A record in the index of a chunk
+    struct index_rec {
+        unsigned loc; ///< offset in bytes to the object
+        dtor_funcp
+                dtor; ///< pointer to dtor function (null if empty record; DTOR_TRIVIAL if the object is trivial dtor)
+    };
+    //! A chunk record in the crate.
+    ///
+    /// Each chunk is created as an array of uint64_t, via make_unique
+    /// The memory in a chunk has a chunkhdr, which is followed by:
+    ///
+    ///    [Objects][Objects][Objects]-->   free space   <--[Index records]
+    ///
+    /// 'alloc_count' is the next offset available to be allocated.
+    /// index records are entered in reverse order from the end. So, the last nrec*sizeof(index_rec)
+    /// bytes of the area, are the index.
+    ///
+    typedef std::unique_ptr<uint64_t[]> uptr_chunk_t;
+    struct chunkhdr;
+    API_EXPORT static chunkhdr *hdr_of(uptr_chunk_t &p) { return reinterpret_cast<chunkhdr *>(p.get()); }
+    API_EXPORT static chunkhdr const *hdr_of(uptr_chunk_t const &p)
+    {
+        return reinterpret_cast<chunkhdr const *>(p.get());
+    }
+    /// The chunkhdr is the first portion of the chunk, and is immediately followed
+    /// by data_len bytes, which is a multiple of 8.
+    struct API_EXPORT alignas(8) chunkhdr {
+        unsigned data_len;    ///< length of the data area following header, bytes (>=CHUNKBYTES).
+        unsigned nrec;        ///< records in use (including deleted ones)
+        unsigned alloc_count; ///< offset of first byte in 'free space'
+        // init to a given length (header not included)
+        void init(unsigned length)
+        {
+            data_len = length;
+            nrec = 0;
+            alloc_count = 0;
+        }
+        // reset (preserve data_len)
+        void init()
+        {
+            nrec = 0;
+            alloc_count = 0;
+        }
+        // pointer to 'offs' within data area
+        inline uint8_t *get_ptr(unsigned offs) { return (uint8_t *)(this + 1) + offs; }
+        // pointer to end of the allocation
+        inline uint8_t *get_end_ptr() { return (uint8_t *)(this + 1) + data_len; }
+        // amount of space remaining
+        inline size_t space_avail() const { return data_len - alloc_count - nrec * sizeof(index_rec); }
+        // get pointer to an index record.
+        // record 0 is the last (oldest) one.
+        index_rec *index_p(int idx) { return (index_rec *)get_end_ptr() - (idx + 1); }
+        static uptr_chunk_t allocate(unsigned len);
+    };
+    std::vector<uptr_chunk_t> m_chunks; ///< chunks with data
+    std::vector<uptr_chunk_t> m_free;   ///< chunks without
+    typedef std::vector<uptr_chunk_t>::iterator chunk_iter;
+
+    bool m_rawmode = false;
+    bool m_clearing = false; ///< set while clearing.
+    size_t m_allrecords = 0; ///< includes removed and 'padding' records
+    size_t m_records = 0;    ///< only actual, non-erased records.
+
+    //! Returned from add_record_slot (which is used to create a new record)
+    struct recposn {
+        chunkhdr *chunkp; ///< the chunk in which it was found
+        void *objp;       ///< pointer to the object
+        int idx;          ///< index within the chunk (= -1 if insert was done in raw mode)
+    };
+    API_EXPORT recposn add_record_slot(size_t bytes, size_t align);
+    API_EXPORT void recover_ctor_throw(recposn const &) noexcept;
+    API_EXPORT void install_dtor(recposn const &, dtor_funcp dtor_func);
+    API_EXPORT void move_to_free(chunk_iter chunk_to_free);
+
+  public:
+    class ChunkHandle {
+        friend class Crate;
+        chunkhdr *chunkp;
+
+      protected:
+        ChunkHandle(chunkhdr *cp) : chunkp(cp){};
+
+      public:
+        ChunkHandle() : chunkp(nullptr) {} // null handle may only be assigned-to
+        ChunkHandle(ChunkHandle const &) = default;
+        ChunkHandle &operator=(ChunkHandle const &) = default;
+        friend inline bool operator==(ChunkHandle const &a, ChunkHandle const &b) { return a.chunkp == b.chunkp; }
+        std::pair<void *, size_t> get_memory_extent() const
+        {
+            size_t const len = chunkp->get_ptr(chunkp->alloc_count) - (uint8_t *)chunkp;
+            return {chunkp, len};
+        }
+    };
+
+    API_EXPORT Crate(); ///< Construct a new Crate
+    Crate(Crate const &) = delete;
+    Crate &operator=(Crate const &) = delete;
+
+    // get the preload handle for the first chunk
+    ChunkHandle first_chunk_handle() const
+    {
+        return ChunkHandle(m_chunks.empty() ? nullptr : hdr_of(const_cast<Crate &>(*this).m_chunks.front()));
+    }
+    // get the preload handle for the most recent chunk
+    ChunkHandle last_chunk_handle() const
+    {
+        return ChunkHandle(m_chunks.empty() ?
                                            nullptr : hdr_of(const_cast<Crate &>(*this).m_chunks.back()));
+    }
+    // 'raw mode'
+    ChunkHandle enable_raw_mode(unsigned bytes_needed);
+    API_EXPORT void enable_raw_mode();
+    void disable_raw_mode() { m_rawmode = false; }
+    bool raw_mode() const { return m_rawmode; }
+
+    // Note that the destructor doesn't do anything. You have to call clear() manually.
+    API_EXPORT ~Crate();
+    //! The number of objects in the crate.
+    size_t size() const { return m_records; }
+    //! The number of chunks in use
+    size_t chunk_count() const { return m_chunks.size(); }
+    //! The amount of space left in the current chunk, approximately.
+    /// DO NOT CALL unless chunk_count() > 0
+    size_t current_chunk_space_remain() const { return hdr_of(this->m_chunks.back())->space_avail(); }
+    //! Delete all objects. Does not necessarily free all storage to the
+    /// system; but all retained storage is available for re-use in the crate.
+    /// Note that this is no longer called by the destructor - it must be called explicitly.
+    API_EXPORT void clear(Graph *graph_in);
+    // Special entry for deserializing in segments.
+    // If it is possible to allocate, in current raw-mode chunk, everything from offset 'start'
+    // up to but not including 'limit', this is done, and the base address of that region is returned.
+    // otherwise does nothing and returns null.
+    API_EXPORT void *allocate_bulk(size_t start, size_t limit);
+
+    //! Construct an object of type T into the crate, using the
+    /// parameters of any constructor of T. It is acceptable for the
+    /// constructor to call the emplace method to add other objects to
+    /// the crate.
+    template <typename T, typename... Args> API_HIDDEN T *emplace(Args &&...args)
+    {
+        recposn const pos = add_record_slot(sizeof(T), alignof(T));
+        // construct the object
+        if constexpr (std::is_nothrow_constructible<T, Args...>::value) {
+            new (pos.objp) T(std::forward<Args>(args)...);
+        } else {
+            try {
+                new (pos.objp) T(std::forward<Args>(args)...);
+            } catch (const std::exception &e) {
+                recover_ctor_throw(pos);
+                throw;
+            }
+        }
+        if (pos.idx >= 0) {
+            // register destructor
+            if constexpr (!std::is_trivially_destructible<T>::value) {
+                // Obtain a callable '~T()' function.
+                // this typically generates a jump, or a small inline; lambda can
+                // be cast to a function pointer since it has no state.
+                auto dtor_func = [](Graph *graph_in, void *obj) {
+                    if constexpr (has_clear<T>) {
+                        static_cast<T *>(obj)->clear(graph_in);
+                    }
+                    static_cast<T *>(obj)->~T();
+                };
+                install_dtor(pos, (dtor_funcp)dtor_func);
+            } else {
+                ++m_records; // note, install_dtor does this too.
+            }
+        }
+        return static_cast<T *>(pos.objp);
+    }
+
+    using deserialize_op_func = void *(*)(void *, Deserz &);
+    using deserialize_dtor_func = void (*)(Graph *, void *);
+
+    // Alternate interface to cut down on template instantiations:
+    // init_func is used to initialize the memory, and dtor_func
+    // is used to register the destructor. It's up to the user
+    // to provide the correct size and alignment.
+
+    API_EXPORT void *emplace_explicit(Deserz &dctx, deserialize_op_func init_func, deserialize_dtor_func dtor_func,
+                                      size_align_code_t size_al);
+
+    //! Allocate 'n' of type T in the crate.
+    /// Will initially be garbage; T must be trivially destructible (unless waived)
+    template <typename T, bool DTOR_OK = false> T *alloc_array(size_t n)
+    {
+        static_assert(DTOR_OK || std::is_trivially_destructible<T>::value);
+        if (n == 0) return nullptr;
+        recposn const pos = add_record_slot(sizeof(T) * n, alignof(T));
+        if (pos.idx >= 0) m_records++;
+        return static_cast<T *>(pos.objp);
+    }
+    //! Allocate 'n' of type T in the crate.
+    /// Will be zero-filled; T must be trivially destructible.
+    template <typename T> T *alloc_array_zero(size_t n)
+    {
+        T *const res = alloc_array<T>(n);
+        if (n != 0) ::memset(res, 0, sizeof(T) * n);
+        return res;
+    }
+    //! Allocate 'n' of type T in the crate.
+    /// Will be "value constructed"; in case of things like int and pointer,
+    /// this means they will be zeroed.
+    ///
+    /// T must be trivially destructible.
+    template <typename T> T *alloc_array_value(size_t n)
+    {
+        T *res = alloc_array<T>(n);
+        if (n != 0) std::uninitialized_value_construct_n(res, n);
+        return res;
+    }
+};
+
+/*
+ * EJP: This seems silly, but I don't know how to get visibility into Graph into a templated Tensor because of include hell.
+ */
+
+API_EXPORT Crate *graph_crate(Graph &graph_in);
+
+//
+// replacement for std::vector<T>, for use in ops;
+//
+// limited options for constructor:
+//  (1) copy, or move, from vector - need Graph *;
+//  (2) create with a given size, null-initialized; - need Graph *;
+//  (3) create empty, and then fill in later
+//      using init( Graph* , std::vector<T> const &)
+//      or init( Graph* , std::vector<T> &&)
+//      or init( Graph *, size )
+//      or init( Graph *, T const *ptr, size );
+//      or init_move( Graph *, T *ptr, size );
+
+// With option 3, it is assumed that the 'init' is done during the constructor of
+// a host object - this is needed during deserialize, for instance.
+// the 'len' is 32 bits so this type occupies 2 pointers, vs. 3 for std::vector.
+//
+// If 'T' has a destructor, the cratevec's destructor will invoke that on
+// each element of the vector, in reverse order.
+// when the 'move-from' mechanisms to init from 'std::vector<T> &&' are used,
+// the supplied vector will not be cleared; but its elements will all be
+// 'moved-from'.
+
+template <typename T> class cratevec {
+    T *m_ptr;
+    unsigned m_len;
+    using vec_t = std::vector<T>;
+    static constexpr bool need_dtor = !std::is_trivially_destructible<T>::value;
+
+  public:
+    using iterator = T *;
+    using const_iterator = T const *;
+    using value_type = T;
+    using size_type = size_t;
+    using difference_type = ptrdiff_t;
+    using reference = T &;
+    using const_reference = T const &;
+
+    cratevec() : m_ptr(nullptr), m_len(0) {}
+    cratevec(Graph *g, vec_t const &v) : cratevec()
+    {
+        if (!v.empty()) init(g, v.data(), v.size());
+    }
+    cratevec(Graph *g, vec_t &&v) : cratevec()
+    {
+        if (!v.empty()) init_move(g, v.data(), v.size());
+    }
+    cratevec(Graph *g, size_t n) : cratevec() { init(g, n); }
+    cratevec(cratevec const &) = delete;
+    cratevec(cratevec &&) = delete;
+    ~cratevec()
+    {
+        if constexpr (need_dtor) {
+            if (m_len > 0) {
+                T *const ptr0 = m_ptr;
+                T *ptr = ptr0 + m_len;
+                do {
+                    ptr--;
+                    ptr->~T();
+                } while (ptr > ptr0);
+            }
+        }
+    }
+
+    cratevec &operator=(cratevec const &) = delete;
+    cratevec &operator=(cratevec &&) = delete;
+
+    void init(Graph *g, T const *data, size_t n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = graph_crate(*g)->alloc_array<T, true>(n);
+            std::uninitialized_copy_n(data, n, m_ptr);
+            m_len = n;
+        }
+    }
+    void init_move(Graph *g, T *data, size_t n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = graph_crate(*g)->alloc_array<T, true>(n);
+            std::uninitialized_move_n(data, n, m_ptr);
+            m_len = n;
+        }
+    }
+    // these methods get used during deserialize, so allow it to pass crate in directly.
+    void init(hnnx::Crate *const crate_p, size_t const n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = crate_p->alloc_array<T, true>(n);
+            std::uninitialized_value_construct_n(m_ptr, n);
+            m_len = n;
+        }
+    }
+    // The DCrate version is defined in dcrate_inlines.h
+    void init(hnnx::DCrate *crate_p, size_t n);
+
+    void init(Graph *const g, size_t const n) { init(graph_crate(*g), n); }
+    void init(Graph *const g, vec_t const &v) { init(g, v.data(), v.size()); }
+    void init(Graph *const g, vec_t &&v) { init_move(g, v.data(), v.size()); }
+
+    iterator begin() noexcept { return m_ptr; }
+    iterator end() noexcept { return m_ptr + m_len; }
+    const_iterator begin() const noexcept { return m_ptr; }
+    const_iterator end() const noexcept { return m_ptr + m_len; }
+    const_iterator cbegin() const noexcept { return m_ptr; }
+    const_iterator cend() const noexcept { return m_ptr + m_len; }
+    size_type size() const noexcept { return m_len; }
+    T *data() noexcept { return m_ptr; }
+    T const *data() const noexcept { return m_ptr; }
+    bool empty() const noexcept { return m_len == 0; }
+    reference operator[](size_type idx) { return m_ptr[idx]; }
+    const_reference operator[](size_type idx) const { return m_ptr[idx]; }
+    reference at(size_type idx)
+    {
+        if (idx >= m_len) throw std::range_error("cratevec");
+        return m_ptr[idx];
+    }
+    const_reference at(size_type idx) const { return const_cast<cratevec &>(*this).at(idx); }
+    reference front() { return m_ptr[0]; }
+    const_reference front() const { return m_ptr[0]; }
+    reference back() { return m_ptr[m_len - 1]; }
+    const_reference back() const { return m_ptr[m_len - 1]; }
+};
+
+} // namespace hnnx
+
+POP_VISIBILITY()
+
+#endif /* CRATE_H_ */
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h
new file mode 100755
index 0000000000000..a48e7bc909904
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h
@@ -0,0 +1,101 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DCRATE_INLINES_H
+#define DCRATE_INLINES_H 1
+
+#include <cassert>
+#include <memory>
+#include <new>
+
+#include "macros_attribute.h"
+#include "deser_concurrent.h"
+#include "crate.h"
+
+namespace hnnx {
+
+// alloc 'amount' bytes with given alignment.
+inline void *DCrate::do_alloc(const size_t align, const size_t amount)
+{
+    size_t basep = size_t(nextp);
+    if (align > 4) {
+        basep = (basep + (align - 1)) & ~(align - 1);
+    }
+    size_t const next_base = basep + amount;
+    if (next_base > (size_t)limitp) return nullptr;
+    nextp = (void *)next_base; // update 'nextp' ...
+    return (void *)basep;
+}
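// Example (annotation, not part of the patch): the align-up arithmetic used in
// do_alloc above, checked on plain integers. For a power-of-two alignment A,
// (p + (A - 1)) & ~(A - 1) rounds p up to the next multiple of A.
static_assert(((0x1003u + 7u) & ~7u) == 0x1008u);
static_assert(((0x1008u + 7u) & ~7u) == 0x1008u); // already aligned: unchanged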
+template <typename T> inline T *DCrate::alloc_array(const size_t n)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T) * n);
+        if (allocp) return (T *)allocp;
+    }
+    return cratep->alloc_array<T, true>(n);
+}
+
+template <typename T, typename... Args> inline T *DCrate::emplace(Args &&...args)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T));
+        if (allocp) {
+            new (allocp) T(std::forward<Args>(args)...);
+            return (T *)allocp;
+        }
+    }
+    return cratep->emplace<T>(std::forward<Args>(args)...);
+}
+
+template <>
+inline void *DCrate::emplace_explicit(Deserz &dctx, deserialize_op_func const init_func,
+                                      deserialize_dtor_func const dtor_func, size_align_code_t const size_al)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(size_al.align(), size_al.size());
+        if (allocp) {
+            init_func(allocp, dctx);
+            return allocp;
+        }
+    }
+    return cratep->emplace_explicit(dctx, init_func, dtor_func, size_al);
+}
+
+// this will be used in place of 'emplace' when the constructor parms
+// are just 'Deserz &'
+template <typename T> inline T *DCrate::emplace0(Deserz &dctx)
+{
+    deserialize_op_func const ctor = [](void *const ptr, Deserz &dctx) -> void * {
+        new (ptr) T(dctx);
+        return ptr;
+    };
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T));
+        if (allocp) {
+            (ctor)(allocp, dctx);
+            return (T *)allocp;
+        }
+    }
+    return (T *)cratep->emplace_explicit(dctx, ctor, nullptr, size_align_code_t::for_type<T>());
+}
+// init method of cratevec using 'DCrate' is defined here to avoid header inclusion madness.
+//
+template <typename T> inline void hnnx::cratevec<T>::init(hnnx::DCrate *crate_p, size_t n)
+{
+    assert(m_len == 0);
+    if (n) {
+        m_ptr = crate_p->alloc_array<T>(n);
+        std::uninitialized_value_construct_n(m_ptr, n);
+        m_len = n;
+    }
+}
+
+} // namespace hnnx
+
+#endif // DCRATE_INLINES_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h
new file mode 100755
index 0000000000000..16db21a082cf1
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h
@@ -0,0 +1,288 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESER_CONCURRENT_H
+#define DESER_CONCURRENT_H 1
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "deser_concurrent_defs.h"
+
+// this is intended to be included only in "deserialize.h"
+
+struct PreloadInfo;
+
+namespace hnnx {
+struct runlist_seg_descriptor;
+class Crate;
+class Deserz;
+class fixup_supplemental_recs;
+class InitTimeSchedule;
+
+// describes a 'span' of the deserialized data
+struct deser_segment_span {
+    void *base;
+    void *limit;
+};
+
+// This describes a partially-decoded segment; includes fixups.
+// This should stay small so we can place it inside Deserz, and std::move it
+// out (to keep the fixup list) when done with the segment.
+struct runlist_fixup_state {
+    unsigned segno = 0;
+    size_t *crate_begin = nullptr;              // where the data starts in the crate
+    runlist_seg_descriptor *seg_desc = nullptr; // Corresponding 'runlist_seg_descriptor' for reference.
+    // The next three are copied from the runlist_auxdata_seg_desc
+    uint32_t base_tensor_index = 0;     // first tensor index defined this segment
+    uint32_t base_blocktable_index = 0; // first blocktable index defined in this segment
+    uint32_t base_sharedobj_index = 0;  // first 'shared_object' index defined in this segment
+    // fixup data
+    size_t *fixup_list_head = nullptr;           // head of the 'fixup list', or null if none.
+    fixup_supplemental_recs *fixup_supplemental; // supplemental fixup list
+
+    runlist_fixup_state() = default;
+    ~runlist_fixup_state() = default;
+    runlist_fixup_state(runlist_fixup_state const &) = default;
+    // *Some* implementations of c++lib require this to have operator= (non-move)
+    // in order for std::vector containing it to be constructed via resize.
+    runlist_fixup_state &operator=(runlist_fixup_state const &) = default;
+    // the move-ctor and move-assign must leave the source with no fixup list,
+    // and segno = 0.
+    runlist_fixup_state(runlist_fixup_state &&from) { do_move_from(std::move(from)); }
+    runlist_fixup_state &operator=(runlist_fixup_state &&from)
+    {
+        do_move_from(std::move(from));
+        return *this;
+    }
+
+  private:
+    // this is used in move-constructor and move-assign; it will always leave 'from'
+    // with certain 'null' values to trap cases where we're using the wrong instance.
+    void do_move_from(runlist_fixup_state &&from)
+    {
+        segno = from.segno;
+        crate_begin = from.crate_begin;
+        seg_desc = from.seg_desc;
+        base_tensor_index = from.base_tensor_index;
+        base_blocktable_index = from.base_blocktable_index;
+        base_sharedobj_index = from.base_sharedobj_index;
+        fixup_list_head = from.fixup_list_head;
+        fixup_supplemental = from.fixup_supplemental;
+        from.segno = 0;
+        from.seg_desc = nullptr;
+        from.fixup_list_head = nullptr;
+    }
+};
+//
+// This contains 'supplemental' fixup records for a segment; there is one instance in each runlist_seg_descriptor,
+// and a pointer to it in the runlist_fixup_state. When the 'runlist_fixup_state' is moved in or out of the Deserz,
+// the pointer to this remains.
+// To avoid the overhead of vec_push_back, this has a static array into which values are recorded;
+// when this is full (or near full), all the records within are appended to the vector in a single operation.
+// At the end of the operation, any remaining records are appended to the vector, but only if the vector
+// is not empty (we can read the records out of the fixed array, if they all fit).
+//
+// The append() is not safe unless 'ensure_room_for' is checked first; you can e.g. do ensure_room_for(3)
+// ahead of doing up to 3 appends.
+// It is best to use a constant as parameter to ensure_room_for, i.e. ahead of code which may append
+// *up to* 4 values, use ensure_room_for(4); this simplifies the inline expansion of 'ensure_room_for',
+// and makes very little difference to performance compared to using the exact value.
+//
+class fixup_supplemental_recs {
+    static constexpr unsigned ARR_SIZE = 64;
+    unsigned num_in_arr = 0;
+    uint32_t fixed_arr[ARR_SIZE];
+    std::vector<uint32_t> var_arr;
+    unsigned n_vec = 0; // = var_arr.size()
+
+  public:
+    void clear();
+    unsigned constexpr size() const { return num_in_arr + n_vec; }
+    void reserve(unsigned const n) { var_arr.reserve(n); }
+    inline void ensure_room_for(unsigned const n)
+    {
+        assert(n <= ARR_SIZE);
+        if (num_in_arr > ARR_SIZE - n) flush_to_vec();
+    }
+    // append allowed only when preceded by 'ensure_room_for'
+    inline void append(uint32_t const val)
+    {
+        assert(num_in_arr < ARR_SIZE);
+        fixed_arr[num_in_arr++] = val;
+    }
+    // use instead of 'ensure_room_for(1); append(val)'
+    inline void push_back(uint32_t const val)
+    {
+        if (num_in_arr > ARR_SIZE - 1) flush_to_vec();
+        fixed_arr[num_in_arr++] = val;
+    }
+    // After all push_back() done, do a 'finish'
+    // and then get_limits() can be used to traverse the data.
+    void finish(); // flushes, but only if the vec is not empty.
+    std::pair<uint32_t const *, uint32_t const *> get_limits() const;
+
+  protected:
+    void flush_to_vec();
+};
+
+// An array of these (size N+1) is used to hold the
+// information used in deserializing each segment.
+// The [N+1] is partially used; some operations may use
+// e.g. arr[i+1].auxinfo.some_field to find out where something
+// ends for the current segment, using the start of the next segment;
+// so entry N-1 needs a next entry.
+
+struct runlist_seg_descriptor {
+    runlist_auxdata_seg_desc auxinfo;   // the data from the 'aux_data' record for this segment
+    runlist_fixup_state segfixup;       // the deserialization state (moved in and out of Deserz as needed)
+    fixup_supplemental_recs fixup_supp; // fixup supplemental recs.
+    deser_segment_span span_to_deser = {};
+    // These are used to configure the last preload in each segment, which preloads a region
+    // which is either partially, or entirely, in the next segment. So, the first two entries
+    // below are actually set at the end of deserialization of the previous segment; the end_preload
+    // is set by the current segment deserialize.
+    // The information stored in [N] is for configuring
+    // the last preload in the last segment, with end_preload set to 'end of crate'; in this case
+    // start_preload could be <= the end of the crate, and then we don't configure it.
+    // likewise the information in [0] is only 'end_preload', which can be used to configure
+    // 'Graph::m_initial_preload' (it should go from start-of-crate to seg[0].end_preload).
+    // In some cases (hopefully, only in testing) we may have segments with no preloads in them,
+    // in which case null pointers will appear in some of these; the ChunkPreload ops need to
+    // be configured by getting info from adjacent segments.
+    PreloadInfo *prev_seg_final_preload{}; // points to the prev segment's final PreloadInfo
+    char *start_preload{};                 // the preload start address for prev seg's final preload
+    char *end_preload{};                   // end address for prev seg's final preload
+};
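// Example (annotation, not part of the patch): two usage sketches for the
// types above, assuming "deser_concurrent.h" is on the include path. The
// begin/end-pointer shape of get_limits() follows the reconstruction above
// and is an assumption, as is the arr[i] / arr[i+1] span convention quoted
// from the comments.
#include <cstdint>
#include "deser_concurrent.h"

// Burst-append into the supplemental fixup records: reserve headroom once
// with a constant, then append; finish() before traversing via get_limits().
inline void fixup_burst_sketch(hnnx::fixup_supplemental_recs &recs, uint32_t a, uint32_t b, uint32_t c)
{
    recs.ensure_room_for(3); // constant headroom for the three appends below
    recs.append(a);
    recs.append(b);
    recs.append(c);
    recs.finish();
}

// Segment i's full crate span, using arr[i] (start) and arr[i+1] (end); the
// usable length may be smaller (auxinfo.crate_seg_len) because of padding.
inline uint32_t seg_crate_span_sketch(const hnnx::runlist_seg_descriptor *arr, unsigned i)
{
    return arr[i + 1].auxinfo.crate_offset - arr[i].auxinfo.crate_offset;
}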
+
+// One instance of this is in Deserializer, called segments.
+// It is created 'empty', and populated when we encounter the valid
+// Aux Data record.
+//
+class DeserSegDescs {
+    unsigned n_segs = 0;
+    // points to an array of n_segs + 1, if n_segs > 0
+    std::unique_ptr<runlist_seg_descriptor[]> seg_arr;
+
+  public:
+    DeserSegDescs() = default;
+    ~DeserSegDescs() = default;
+    DeserSegDescs(DeserSegDescs const &) = delete;
+    DeserSegDescs(DeserSegDescs &&) = default;
+    DeserSegDescs &operator=(DeserSegDescs const &) = delete;
+    DeserSegDescs &operator=(DeserSegDescs &&) = default;
+
+    // these two are used to create the array
+    void set_size(unsigned const n); // used to create sized, empty array
+    runlist_seg_descriptor *data() { return seg_arr.get(); }
+
+    constexpr unsigned num_segs() const { return n_segs; }
+    constexpr bool is_active() const { return n_segs != 0; }
+    // note: 'i' may be 0 .. num_segs(); only can use when 'is_active'.
+    runlist_seg_descriptor &operator[](unsigned const i) { return seg_arr[i]; }
+    runlist_seg_descriptor const &operator[](unsigned const i) const { return seg_arr[i]; }
+
+    // We can add other data in here, to manage the concurrent deserialization.
+    unsigned n_threads = 0;         // set when allocating the 'Deserz' array
+    std::vector<Deserz> deserz_arr; // sized as 'n_threads'.
+
+    // start-of-crate, rounded to a multiple of 32; calculated before any multi-thread
+    // operations. Used to configure Graph::m_initial_preload.
+    void *crate_preload_start_boundary;
+    // end-of-crate, rounded up to multiple of 32. Calculated before any multi-thread
+    // operations. No 'ChunkPreloadOp' will exceed this.
+    void *crate_preload_final_boundary;
+
+    InitTimeSchedule *initSchedule;
+};
+
+// A 'DCrate' is a proxy object stored within Deserz.
+// It has some of the same methods as Crate; but if nextp is not null,
+// it will allocate into the space at 'nextp', limited by 'limitp'.
+// Otherwise it will use the Crate.
+// Most methods are defined as inlines in dcrate_inlines.h
+//
+class DCrate {
+    // these are either both null, or both non-null and 4-aligned.
+    void *nextp = nullptr;
+    void *limitp = nullptr;
+    Crate *cratep = nullptr;
+
+  public:
+    DCrate() {}
+    ~DCrate() {}
+    DCrate(DCrate const &) = default;
+    DCrate(DCrate &&) = default;
+    DCrate &operator=(DCrate const &) = default;
+    DCrate &operator=(DCrate &&) = default;
+    explicit DCrate(Crate &c) : cratep(&c) {}
+    void set_crate(Crate &c) { cratep = &c; }
+    Crate *crate() { return cratep; }
+    bool is_active() const { return nextp != nullptr; }
+
+    constexpr size_t bytes_remaining() const { return (char *)limitp - (char *)nextp; }
+    char *next_loc() { return (char *)nextp; }
+    std::pair<char *, char *> range_remain() { return {(char *)nextp, (char *)limitp}; }
+
+    void set_memory_range(void *base, unsigned len)
+    {
+        nextp = base;
+        limitp = (void *)((char *)base + len);
+    }
+    void remove_memory_range()
+    {
+        nextp = nullptr;
+        limitp = nullptr;
+    }
+
+    // Methods of Crate we want to support (See crate.h for more detail).
+    // Note that the constructors invoked in 'emplace' and 'emplace_explicit'
+    // can and will recursively call 'emplace' to construct their sub-objects.
+    template <typename T, typename... Args> T *emplace(Args &&...args);
+    // variant of 'emplace' which can use the 'emplace_explicit' call to avoid
+    // instantiating the constructor twice
+    template <typename T> T *emplace0(Deserz &dctx);
+    // (this is defined with 'template' args, only so it can be declared here without
+    // forward refs. All are pass-by-value. Only one specialization will be defined).
+    template <typename FI, typename FD, typename SA> void *emplace_explicit(Deserz &dctx, FI, FD, SA);
+    // array allocation, used to make all arrays in crate during deserialize.
+    template <typename T> T *alloc_array(size_t n);
+
+  private:
+    // reserve the specified data in the range, and return pointer to start; or
+    // return null if not possible.
+    void *do_alloc(size_t align, size_t amount);
+};
+
+// defines the encoding in the upper 3 bits of the last word of a 'multi-word' supplemental record
+// all must be 4..7, since a 0 in the msb indicates a 'short' record.
+
+constexpr unsigned SUPPFIXUP_CAT_tensor = 4;
+constexpr unsigned SUPPFIXUP_CAT_sharedobj = 5;
+constexpr unsigned SUPPFIXUP_CAT_blocktable = 6;      // with indices packed in one word
+constexpr unsigned SUPPFIXUP_CAT_blocktable_full = 7; // .. in two words
+constexpr unsigned SUPPFIXUP_CAT_SHIFT = 29u;
+
+bool fixup_encode_for_blocktable(runlist_fixup_state &seginfo, uint32_t idx, uint32_t table_offs, void **ptrloc);
+
+// high-level operations in the 'deserialize by segments' code.
+
+GraphStatus do_multiseg_deser(Deserializer &dctx, size_t ref_deser_pos);
+GraphStatus segmentjob_deserialize_ops(Deserializer &dctx, unsigned segno, unsigned threadno);
+GraphStatus segmentjob_process_fixups(Deserializer &dctx, unsigned segno, unsigned threadno);
+GraphStatus segmentjob_compile_ops(Deserializer &dctx, unsigned segno, unsigned threadno);
+void resolve_chunk_preload_after_multiseg_deser(Deserializer &dctx);
+
+} // namespace hnnx
+
+#endif // DESER_CONCURRENT_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h
new file mode 100755
index 0000000000000..3d72ed7d2de71
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h
@@ -0,0 +1,97 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESER_CONCURRENT_DEFS_H
+#define DESER_CONCURRENT_DEFS_H 1
+
+#include <cstddef>
+#include <cstdint>
+
+namespace hnnx {
+
+// NOTE: this file contains defs for concurrent deserialize which are needed on both decode and prepare
+// side; mostly just the format of the Aux Data records.
+// Defs needed only on decode side are in 'deser_concurrent.h', which #includes this file.
+
+constexpr unsigned DesConcur_MIN_SEGMENTS = 8; // can't have less than this number.
+
+// This is the number of runlist slots in the runlist_auxdata_seg_desc format.
+// It must be >= the actual number. This number is coded into the start of the AuxData
+// payload. If the number gets bigger, the reader of the aux-data
+// record will need to be able to cope with the older, smaller value.
+
+constexpr unsigned DesConcur_MAX_RUNLISTS = 4;
+
+// The 'Aux Data' record describing the runlist partition has a payload formed of
+// a runlist_auxdata_header, followed immediately by N+1 of runlist_auxdata_seg_desc.
+// The number N is in the header; there may be additional words after, which can be
+// ignored
+//
+// Aux Data header record.
+// The 'record_version' is reserved to flag changes in the format, so that
+// if it changes, new skel can understand old records.
+// Currently, it has this format; most changes will expand one of the fields,
+// so following this may be adequate to capture version changes; if it is not,
+// add flags in the upper bits.
+//  bits 31..13 : reserved, 0
+//  bit 12: set if crate sizes are calculated based on 'dynamic tensor' sizes
+//  bits 11..8  length of the header in uint32's
+//  bits 7..3   length of 'segment' record, in uint32's
+//  bits 2..0   .. value of DesConcur_MAX_RUNLISTS
+//
+struct runlist_auxdata_header {
+    unsigned record_version; // see above
+    unsigned numsegs : 16;   // number of segments; >= 8, likely <= 64 but who knows
+    unsigned hdrflags : 16;  // reserved for flags
+    unsigned runlist_offset; // see below
+};
+
+// 'runlist_offset' is the offset, in u32's units, from the 'num_in_tensors' word
+// to the 'n_ops_total' word. This is needed by 'weight share' processing in order to
+// adjust the deser_offset values to accommodate changes in the encoding length of pointers.
+
+// The N segments are described by an array of N+1 of runlist_auxdata_seg_desc;
+// segment i is defined by arr[i] (start) and arr[i+1] (end).
+// An exception is 'crate_seg_len' - this may be less than arr[i+1].crate_offset - arr[i].crate_offset
+// due to padding.
+// In the final record arr[N]:
+//   - crate_seg_len is not used (0)
+//   - The *_list_posn records are the total length of the runlists
+//   - the four 'base_*_index' values are all 1 greater than any index used in the graph
+//
+struct runlist_auxdata_seg_desc {
+    uint32_t deser_offset;  // where the input (pickle) data begins - reference point is the start of 'Runlist' as
+                            // defined in docs/pickle_format.md, i.e. the location of 'n_ops_total' word
+    uint32_t crate_offset;  // offset in crate
+    uint32_t crate_seg_len; // crate length needed (not used in final entry)
+    uint32_t runlist_posn[DesConcur_MAX_RUNLISTS];  // where the segment starts in Op* runlist
+    uint32_t execlist_posn[DesConcur_MAX_RUNLISTS]; // where the segment starts in 'execlist'
+    uint32_t base_opseq_index;      // first 'op_sequence_marker' index used in the segment.
+    uint32_t base_tensor_index;     // first tensor index defined this segment
+    uint32_t base_blocktable_index; // first blocktable index defined in this segment
+    uint32_t base_sharedobj_index;  // first 'shared_object' index defined in this segment
+};
+
+// Bit in the header version indicating crate sizes allow for 'dynamic shapes'.
+// NOTE: if that gets backed out later, leave this here but remove it from DesConcur_AUXDATA_REC_VERSION
+//
+constexpr unsigned DesConcur_AUXDATA_REC_VERSION_DYNSHAPE_SIZES = 4096;
+
+constexpr unsigned DesConcur_AUXDATA_REC_VERSION = // composed of:
+        ((sizeof(runlist_auxdata_header) / sizeof(uint32_t)) * 256   // header size
+         + (sizeof(runlist_auxdata_seg_desc) / sizeof(uint32_t)) * 8 // seg desc len
+         + DesConcur_MAX_RUNLISTS) |
+        DesConcur_AUXDATA_REC_VERSION_DYNSHAPE_SIZES;
+
+// values to be used to 'grow' old crate estimate to compensate for 'dyn shape' mismatch
+constexpr unsigned DesConcur_CrateGrowPerTensor = 2; // number of words per 'tensor'
+constexpr unsigned DesConcur_CrateGrowPerShared = 2; // number of words per 'shared object'
+
+} // namespace hnnx
+
+#endif // DESER_CONCURRENT_DEFS_H
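// Example (annotation, not part of the patch): checking the record_version
// encoding described above against the struct sizes, assuming the header is
// reachable as "deser_concurrent_defs.h" and that 'unsigned' is 32-bit. With
// a 3-word header and a 15-word segment record this is (3*256 + 15*8 + 4) | 4096.
#include <cstdint>
#include "deser_concurrent_defs.h"

static_assert((hnnx::DesConcur_AUXDATA_REC_VERSION & 7u) == hnnx::DesConcur_MAX_RUNLISTS);
static_assert(((hnnx::DesConcur_AUXDATA_REC_VERSION >> 3) & 0x1Fu) ==
              sizeof(hnnx::runlist_auxdata_seg_desc) / sizeof(uint32_t));
static_assert(((hnnx::DesConcur_AUXDATA_REC_VERSION >> 8) & 0xFu) ==
              sizeof(hnnx::runlist_auxdata_header) / sizeof(uint32_t));
static_assert((hnnx::DesConcur_AUXDATA_REC_VERSION & hnnx::DesConcur_AUXDATA_REC_VERSION_DYNSHAPE_SIZES) != 0);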
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h
new file mode 100755
index 0000000000000..43f14039fd1ad
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h
@@ -0,0 +1,68 @@
+//==============================================================================
+//
+// Copyright (c) 2021-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESERIALIZE_TENSORS_H
+#define DESERIALIZE_TENSORS_H 1
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "limits.h"
+#include "log.h"
+
+#include "forward_classes.h"
+#include "serdes_tensors.h"
+
+namespace hnnx {
+
+// see the comment in serdes_tensors.h for an overview of how this works.
+
+class Deserializer;
+
+class DeserTensorConn : public SerTensorConnDefs {
+    typedef unsigned tensor_idx;
+    typedef Tensor const *ptr_type;
+
+    // this collects all of the tensor_defs we have seen. index is seq_index-1.
+    std::vector<ptr_type> defined_tensors;
+
+  public:
+    DeserTensorConn() {}
+    // process a tensor definition
+    void tensor_def(Deserz &, ptr_type);
+    // process n tensor refs.
+    void tensor_refs(Deserz &, ptr_type *ptrs, unsigned num);
+    // process a tensor ref
+    void tensor_ref(Deserz &dctx, ptr_type &ptr) { tensor_refs(dctx, &ptr, 1); }
+
+    // TODO: remove these two; we don't use them, and should not.
+    // read an identity (for use in a subsequent need_fixup)
+    tensor_idx read_identity(Deserz &);
+    // apply the identity to 'fix' a tensor pointer (usually now, sometimes later)
+    void need_fixup(tensor_idx ident, ptr_type *dst);
+
+    // 'reserve' the defined tensors to avoid allocation overhead...
+    inline void reserve_tensors(const size_t n) { defined_tensors.reserve(n); }
+    // resize the 'defined tensors' table to its full capacity (specified).
+    // Used only in multi-thread deserialize, prior to deserializing the runlist.
+    inline void resize_tensordef_table(const size_t n) { defined_tensors.resize(n); }
+
+    // this is for use by the 'reference fixup' code, in concurrent deserialize.
+    std::vector<ptr_type> const &get_defined_tensors() const { return defined_tensors; }
+
+  protected:
+    tensor_idx read_identity_inline(Deserz &);
+    void apply_fixup_inline(tensor_idx idx, ptr_type *dst);
+};
+
+} // namespace hnnx
+
+#endif // DESERIALIZE_TENSORS_H
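The def/ref protocol declared above reduces to a growing pointer table plus 1-based back-references: each `tensor_def` appends, and each ref deserializes an index and looks the pointer up. A toy model of just that bookkeeping (not the SDK implementation; `Tensor` here is a stand-in type):

    #include <cstdint>
    #include <vector>

    struct Tensor {}; // stand-in for illustration

    struct ToyTensorConn {
        std::vector<Tensor const *> defined; // slot i holds seq_index i+1
        void def(Tensor const *t) { defined.push_back(t); }
        Tensor const *ref(uint32_t seq_index) const { // seq_index >= 1
            return defined[seq_index - 1];
        }
    };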
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h
new file mode 100755
index 0000000000000..7312ae8bdd948
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h
@@ -0,0 +1,761 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESERIALIZER_H
+#define DESERIALIZER_H 1
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "limits.h"
+#include "dtype.h"
+#include "log.h"
+#include "allocator.h"
+#include "op_extra_info.h"
+
+#include "serialize_defs.h"
+#include "forward_classes.h"
+#include "deserialize_tensors.h"
+#include "macros_attribute.h"
+#include "const_extent_descriptor.h"
+#include "weak_linkage.h"
+#include "size_align_code.h"
+#include "deser_concurrent.h"
+#include "hexagon_nn_types.h"
+
+namespace hnnx {
+class DMA_Manager;
+class Crate;
+/**
+ * @brief \ref Serializer and \ref Deserializer are modules that provide
+ * a mechanism to flatten (serialize) and reconstruct (deserialize)
+ * primitive and user-defined data types. The initial objective
+ * was to create an in-memory representation of the optimized
+ * \ref Graph on x86 which can then be reconstructed and executed on
+ * a qdsp target; essentially, a means of Graph caching.
+ *
+ */
+using tensor_deserializer_fn = uptr_Tensor (*)(Deserz &);
+
+using deserialize_op_func = void *(*)(void *, Deserz &); // Allocation function
+using deserialize_dtor_func = void (*)(Graph *, void *); // Deallocation function
+class SimpleOpBase;
+using deserialize_make_unique = std::unique_ptr<SimpleOpBase> (*)();
+
+struct op_deserializer_fn {
+    op_deserializer_fn(deserialize_op_func init_func_in, const size_align_code_t sizeal_in)
+        : init_func(init_func_in), size_align_code(sizeal_in)
+    {
+    }
+    op_deserializer_fn(deserialize_op_func init_func_in, deserialize_dtor_func dtor_func_in,
+                       const size_align_code_t sizeal_in)
+        : dtor_func(dtor_func_in), init_func(init_func_in), size_align_code(sizeal_in){};
+    op_deserializer_fn(const op_deserializer_fn &) = default;
+    op_deserializer_fn(op_deserializer_fn &&) = default;
+    op_deserializer_fn &operator=(const op_deserializer_fn &) = delete;
+    deserialize_dtor_func dtor_func = nullptr;
+    deserialize_op_func init_func = nullptr;
+    const size_align_code_t size_align_code{};
+    inline constexpr size_t get_size() const { return size_align_code.size(); }
+    inline constexpr size_t get_align() const { return size_align_code.align(); }
+};
+
+// Here's a quick and dirty way to make these maps go faster: compare string_views starting with the length;
+// if the lengths are the same, then compare the middle character, and if that's also the same,
+// use memcmp. This avoids getting slowed down by a lot of long common prefixes in the type names,
+// and we don't care about the weird ordering it generates.
+//
+struct trick_stringview_lt {
+    bool operator()(std::string_view const &a, std::string_view const &b) const
+    {
+        unsigned const na = a.size();
+        unsigned const nb = b.size();
+        if (na != nb) return na < nb;
+        char const *const pa = a.data();
+        char const *const pb = b.data();
+        if (pa == pb || na == 0) return false; // pa==pb is a common case.
+        unsigned const char_a = pa[na >> 1];
+        unsigned const char_b = pb[na >> 1];
+        if (char_a != char_b) return char_a < char_b;
+        return ::memcmp(pa, pb, na) < 0;
+    }
+};
+
+using op_deserializer_map_t = std::map, trick_stringview_lt>;
+using tensor_deserializer_map_t = std::map;
+using cexdesc_deserializer_map = std::map;
+
+using const_extent_t = std::pair;
+using weight_buf_deserializer_map = std::map;
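+// [editor's note, not in the original header] The comparator above is still a
+// valid strict weak ordering (a consistent length / middle-character / memcmp
+// tie-break chain), which is all std::map requires; it is just not
+// lexicographic. For keys like long templated type names sharing big common
+// prefixes, most comparisons are settled by the length or the middle character
+// and never scan the prefix. Usage is the ordinary ordered-map pattern, e.g.:
+//     std::map<std::string_view, int, trick_stringview_lt> m; // any mapped type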
+/**
+ * @brief Deserializer class to reverse the serialization
+ * process and reconstruct the data for specific types
+ *
+ */
+class Deserz : public DeSerError {
+    friend class Deserializer; // weirdly, sometimes a derived class needs to be a friend.
+    friend class DeserTensorConn;
+
+  protected:
+    Deserz(Deserializer *full_deser, char const *p, size_t n, Graph *g = nullptr);
+
+  public:
+    // I want to make this protected, but can't.
+    // Even code which has access to a protected copy_ctor
+    // of foo can't invoke .resize(n, foo_inst) on a std::vector<foo>. This
+    // seems like a defect in C++. Applies to various 'emplace' methods too;
+    // the 'emplace' can only ever use public ctors.
+    Deserz(Deserz const &) = default;
+
+  public:
+    virtual ~Deserz(); // please keep this as the first virtual method declared.
+
+    // These three are ONLY TO BE USED when setting up a Deserz to start processing a segment.
+    void setup_source_span(deser_segment_span const &);
+    void setup_dcrate_out(void *base, size_t len);
+    void setup_next_tensor_index(unsigned const idx) { next_tensordef_index = idx; }
+
+    typedef uint32_t object_identity_type;
+
+    // Note, various accessor methods are defined as inlines below 'class Deserializer'.
+    // true if this Deserz is really an instance of Deserializer.
+    constexpr bool is_base_deser() const;
+
+    using op_deserialize_fn_list_t = std::vector;
+    using tensor_deserialize_fn_list_t = std::vector;
+
+    op_deserialize_fn_list_t &get_op_deserialize_fn_list();
+    tensor_deserialize_fn_list_t &get_tensor_deserialize_fn_list();
+    std::vector &get_blocktable_link_table();
+    // when deserializing an op:
+    //  - call deserialize_tensor_ref (or _refs) on all the input tensor pointers
+    //  - pass all output tensor addresses to deserialize_tensor_def
+    // The sequence must match serialization; note that the deserialize-ctor of Tensor
+    // calls deserialize_tensor_def on itself, so there is no need to call it elsewhere,
+    // except for specialized types which are constructed otherwise during depickle (e.g.,
+    // types embedded in the Op).
+    //
+    // Some ops have multiple copies of some input tensor pointers; for these, it's possible to
+    // serialize just one reference, and then deserialize it using
+    //     auto id = deserialize_object_identity()   // <- corresponds to serialize_tensor_ref
+    //     need_tensor_fixup( id, &first_tensor_pointer);
+    //     (other deserialize activity can happen here)
+    //     need_tensor_fixup( id, &second_tensor_pointer);
+
+    void deserialize_tensor_def(Tensor const *tensor_ptr);
+    void deserialize_tensor_ref(Tensor const *&where);
+    void deserialize_tensor_refs(Tensor const **ptrs, unsigned n);
+    template <typename T> void deserialize_tensor_ref(T const *&where);
+    template <typename T> void deserialize_tensor_refs(T const **ptrs, unsigned n);
+    object_identity_type deserialize_object_identity();
+    void need_tensor_fixup(object_identity_type oid, Tensor const **where);
+
+    Graph &graph() const { return *graph_ptr; }
+    Crate *crate() { return d_crate.crate(); }
+    DCrate *dcrate() { return &d_crate; }
+    DeserSegDescs const &get_segments() const; // gets a ref to the associated 'segments' object
+    op_deserializer_map_t const &get_op_deser_map() const { return *op_deserializer_map; }
+
+    bool is_aligned_const_format() const;
+    bool has_pending_tensor_updates();
+
+    bool is_shared_dynamic_tensor_shape_format() const;
+
+    fa::RuntimeAllocator *allocator;
+    DCrate d_crate; // contains a crate pointer
+
+  protected:
+    // hoist pointers to these maps into Deserializer to avoid static lock overhead
+    op_deserializer_map_t const *op_deserializer_map;
+    tensor_deserializer_map_t const *tensor_deserializer_map;
+    Graph *graph_ptr{};
+    Deserializer *full_deser;
+
+    char const *bufstart;  // start of current buffer
+    char const *bufend;    // first byte we can't read
+    char const *bufp;      // next to read
+    char const *buf_limit; // <= bufend; where 'fill_buffer' needs to be called.
+    size_t bytes_filled;   // bytes previously filled
+
+    uint32_t op_flags;
+    OpExtraInfo op_extra_info;
+
+    unsigned next_tensordef_index = 1; // belongs to 'tensorconn' but needs to be in Deserz.
+    // 'format version'. Currently the only values used are 0 = classic, 1 = July/2023.
+    // Only access through methods like .classic_format();
+    // This is changed to a non-zero value based on seeing certain Aux Data records
+    // (which must appear before the allocator).
+    int format_version = 0;
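+    // [editor's note, not in the original header] Invariant of the four
+    // buffer pointers above, for orientation:
+    //
+    //   bufstart ........ bufp ........ buf_limit ..... bufend
+    //   |<- consumed ->|  ^next read    ^refill point   ^hard end
+    //
+    // simple_deserialize() (below) reads at bufp and calls fill_buffer()
+    // once bufp reaches buf_limit; keeping buf_limit <= bufend leaves slack
+    // so a short read near the end cannot run past the buffer.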
+    // this is used in multi-thread decoding. It is important that
+    // it remains null-constructed if the object is really a base of Deserializer;
+    // it is only used in 'segment' Deserz instances.
+    runlist_fixup_state seg_fixup_state{};
+
+    /**
+     * @brief throws an error, since the deserializer detected
+     * deserialization on insufficient bytes, i.e. an underflow
+     *
+     */
+    API_EXPORT virtual char const *fill_buffer(); // called for underflow on a short operation
+
+    /**
+     * @brief Deserialize data of the specified length and write it into
+     * a buffer provided by the caller
+     *
+     * @param[out] p buffer to write to
+     * @param[in] len length of the \ref bufp to read from
+     * @param[in] align if true, skip input bytes to a boundary of 4
+     */
+    API_EXPORT virtual void deserialize_fread(void *p, size_t len, bool align);
+
+    /**
+     * @brief Get the current position of the buffer from which the next data will be read
+     *
+     * @return size_t offset from buffer start
+     */
+    size_t buffer_offset() const { return bufp - bufstart; }
+    /**
+     * @brief Available buffer size remaining for deserialization
+     *
+     * @return size_t remaining bytes
+     */
+    size_t buffer_remain() const { return bufend - bufp; }
+
+    /**
+     * @brief deserialize the buffer for type T
+     *
+     * @retval T returns the deserialized value of type T
+     *
+     * Note: This is the templated API called by the deserialize_T() functions
+     *
+     * Note: Cannot be used for more than 4 bytes; there is a specialized version to read u64.
+     */
+    template <typename T> T simple_deserialize()
+    {
+        static_assert(sizeof(T) <= 4, "can only read sizeof(T) <= 4");
+        constexpr size_t W = 4;
+        char const *curr_p = bufp;
+        if (curr_p >= buf_limit) {
+            curr_p = fill_buffer();
+        }
+        T const val = *(T const *)(curr_p);
+        bufp = curr_p + W;
+        return val;
+    }
+    // see the comment above deserialize_shared_obj.
+    API_EXPORT std::pair<void const *, void const **> deserialize_shared_obj_func(void const **ptrloc);
+    API_EXPORT uint64_t deser_u64_slowpath();
+    void initial_l2fetch(); // called only from the ctor
+
+  public:
+    inline constexpr bool classic_format() const { return format_version == 0; }
+    /**
+     * @brief deserialize data of a simple scalar type via simple_deserialize
+     *
+     * Note: the below are the only types supported for deserialize_type
+     */
+    API_EXPORT uint64_t deserialize_uint64(); // inline later
+    inline float deserialize_float() { return simple_deserialize<float>(); }
+    inline uint32_t deserialize_uint32() { return simple_deserialize<uint32_t>(); }
+    inline NN_INT32_T deserialize_int32() { return simple_deserialize<NN_INT32_T>(); }
+    inline int16_t deserialize_int16() { return simple_deserialize<int16_t>(); }
+    inline uint16_t deserialize_uint16() { return simple_deserialize<uint16_t>(); }
+    inline int8_t deserialize_int8() { return simple_deserialize<int8_t>(); }
+    inline uint8_t deserialize_uint8() { return simple_deserialize<uint8_t>(); }
+
+    inline uint64_t deserialize_namesig() { return deserialize_uint64(); }
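+    // [editor's note, not in the original header] All of the scalar wrappers
+    // above funnel into simple_deserialize<T>(), which advances the read
+    // position by a full 4-byte word (W == 4) regardless of sizeof(T), e.g.:
+    //     uint8_t a = dctx.deserialize_uint8(); // consumes 4 bytes
+    //     uint8_t b = dctx.deserialize_uint8(); // consumes 4 more
+    // Two byte-sized reads consume 8 bytes of pickle: the stream is
+    // word-granular by design, which keeps every read 4-byte aligned.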
+    // note, this is defined as an inline in deserializer.cc and is not available elsewhere
+    tensor_deserializer_fn deserialize_tensor_identification(unsigned tensor_class_index);
+
+    // deserialize a string
+    // **NOTE** will throw a runtime error if called on a Deserz which is not really a Deserializer.
+    API_EXPORT std::string_view deserialize_str();
+
+    uint32_t get_op_flags() const { return op_flags; };
+    void clear_op_flags() { op_flags = 0; };
+    void set_op_flags(uint32_t f) { op_flags = f; };
+
+    const OpExtraInfo &get_op_extra_info() const { return op_extra_info; };
+    void clear_extra_info() { op_extra_info.clear(); };
+    void set_op_extra_info(OpExtraInfo in_op_extra_info) { op_extra_info = in_op_extra_info; };
+
+    /**
+     * @brief deserialize a buffer of the specified size
+     *
+     * @param[in] alloc_size number of bytes to read from \ref bufp
+     * @param[out] ptr destination buffer for the read bytes
+     * @return size_t number of bytes actually read
+     */
+    API_EXPORT size_t deserialize_buf(size_t alloc_size, void *ptr);
+    /**
+     * @brief similar to deserialize_buf, but first deserializes a
+     * uint32_t byte count that should match the alloc_size
+     *
+     * @param[in] alloc_size number of bytes to read from \ref bufp
+     * @param[out] ptr destination buffer for the read bytes
+     * @return size_t number of bytes actually read
+     */
+    API_EXPORT size_t deserialize_buf_withlen(size_t alloc_size, void *ptr);
+    // deserialize a pointer as 64 bits
+    inline void *deserialize_ptr() { return (void *)size_t(deserialize_uint64()); }
+
+    template <typename T> T deserialize_type();
+
+    template <typename T, size_t N> std::array<T, N> deserialize_array();
+
+    /**
+     * @brief convenience wrappers for deserialize functions that
+     * return different numbers of values of uint32_t type
+     *
+     * @return std::tuple of the uint32_t data deserialized
+     */
+    // convenience wrappers (to reduce inlined code size w/o much loss of speed)
+    API_EXPORT std::tuple<uint32_t, uint32_t> deserialize_uint32_x2();
+    API_EXPORT std::tuple<uint32_t, uint32_t, uint32_t> deserialize_uint32_x3();
+    API_EXPORT std::tuple<uint32_t, uint32_t, uint32_t, uint32_t> deserialize_uint32_x4();
+
+    API_EXPORT void deserialize_uint32_arr(uint32_t *p, size_t N);
+
+    // to reduce code size in the templates, we can deserialize arrays of
+    // N uint32 to size_t
+    API_EXPORT void deserialize_uint32_arr_sizet(size_t *p, size_t N);
+
+    /**
+     * @brief deserialize an array of uint32_t data, widening to size_t
+     *
+     * @tparam N size of the array
+     * @return std::array containing the deserialized values
+     */
+    template <size_t N> std::array<size_t, N> deserialize_uint32_array_sizet()
+    {
+        std::array<size_t, N> res;
+        deserialize_uint32_arr_sizet(&res[0], N);
+        return res;
+    }
+
+    //
+    // This is used for shared objects like Shape and Interface.
+    // It deserializes the index, and decides if this is the first instance.
+    //  - you must always pass the address which needs to point to it, though it
+    //    will not be set by this function.
+    //  - if retval.second is null, then the object was previously deserialized,
+    //    and retval.first is the pointer to it.
+    //  - otherwise, the caller must deserialize the instance, and store the pointer
+    //    at *retval.second. retval.first will be null in this case.
+    // In scenarios where delayed resolution is used, the return may be {token, null},
+    // where 'token' is actually a delayed resolution token.
+    //
+    template <typename T>
+    std::pair<T const *, T const **> // see above
+    deserialize_shared_obj(T const **const loc)
+    {
+        auto const res = deserialize_shared_obj_func((void const **)loc);
+        return {(T const *)res.first, (T const **)res.second};
+    }
+
+    // Increment the current read position of the internal buffer without reading anything
+    void deserialize_skip_words(size_t nwords);
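+    // [editor's note, not in the original header] Usage sketch for
+    // deserialize_shared_obj, following the contract documented above
+    // ('Shape', 'dctx' and 'deserialize_new_shape' are illustrative names):
+    //     Shape const *shp = nullptr;
+    //     auto const r = dctx.deserialize_shared_obj<Shape>(&shp);
+    //     if (r.second == nullptr) {
+    //         shp = r.first;                     // seen before: reuse it
+    //     } else {
+    //         shp = deserialize_new_shape(dctx); // first instance: build it
+    //         *r.second = shp;                   // record it for later refs
+    //     }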
+    // Apply the 'pointer fixups' contained within seg_info. This can
+    // be called with 'this' being any Deserz or Deserializer associated
+    // with the operation (it is only used to access tables in Deserializer).
+    // This can only be done on a given segment when all previous segments have
+    // been deserialized; so if we have one Deserz per thread, we need
+    // to 'move' the seg_info object out of it after completing the segment,
+    // and use this later to do the fixups.
+    // Returns true if ok, false if failed.
+    // Will leave the fixup list empty on success.
+    bool apply_segment_fixups(runlist_fixup_state &seg_info) const;
+
+    // Methods to move the 'seg_fixup_state' object in or out.
+    void install_seg_fixup_state(runlist_fixup_state &&src) { seg_fixup_state = std::move(src); }
+    runlist_fixup_state extract_seg_fixup_state() { return std::move(seg_fixup_state); }
+    void extract_seg_fixup_state_to(runlist_fixup_state &dest) { dest = std::move(seg_fixup_state); }
+
+    // and a read-only accessor
+    runlist_fixup_state const &fixup_state() const { return seg_fixup_state; }
+
+    // for Tensor::deserialize_blocktable
+    inline bool fixup_encode_for_blocktable(uint32_t const idx, uint32_t const table_offs, void **const ptrloc)
+    {
+        return hnnx::fixup_encode_for_blocktable(seg_fixup_state, idx, table_offs, ptrloc);
+    }
+};
+
+/////////////////
+
+class Deserializer : public Deserz {
+    friend class Deserz;
+
+  public:
+    /**
+     * @brief Construct a new Deserializer object
+     *
+     * @param[in] p buffer that needs to be deserialized
+     * @param[in] n length of the buffer
+     * @param[in] g pointer to the Graph object to deserialize (usually null, since the object
+     *            is being passed to the Graph::Graph ctor to deserialize; that ctor
+     *            must immediately call dctx.set_graph(*this) )
+     */
+    API_EXPORT Deserializer(char const *p, size_t n, Graph *g = nullptr);
+    API_EXPORT virtual ~Deserializer(); // please keep this as the first virtual method declared.
+
+    void set_graph(Graph &g);
+
+    inline void deserialize_tensor_def(Tensor const *tensor_ptr) { tensorconn.tensor_def(*this, tensor_ptr); }
+    inline void deserialize_tensor_ref(Tensor const *&where) { tensorconn.tensor_ref(*this, where); }
+    inline void deserialize_tensor_refs(Tensor const **ptrs, unsigned n) { tensorconn.tensor_refs(*this, ptrs, n); }
+    inline void deserialize_pred_conditions(std::vector &pred_cond_list)
+    {
+        // get the number of items in the vector
+        uint32_t num_of_objects = deserialize_uint32();
+        assert(num_of_objects <= UINT32_MAX);
+        if (num_of_objects > 0) {
+            pred_cond_list.resize(num_of_objects);
+
+            // TODO: remove this once we know how to update it at runtime.
+            // Currently setting it to true
+            pred_cond_list.at(0) = 1;
+        }
+    }
+    template <typename T> inline void deserialize_tensor_ref(T const *&where)
+    {
+        static_assert(std::is_base_of<Tensor, T>::value);
+        tensorconn.tensor_ref(*this, *(Tensor const **)&where);
+    }
+    template <typename T> void deserialize_tensor_refs(T const **ptrs, unsigned n)
+    {
+        static_assert(std::is_base_of<Tensor, T>::value);
+        tensorconn.tensor_refs(*this, (Tensor const **)ptrs, n);
+    }
+    inline object_identity_type deserialize_object_identity() { return tensorconn.read_identity(*this); }
+
+    inline void need_tensor_fixup(object_identity_type oid, Tensor const **where) { tensorconn.need_fixup(oid, where); }
+    inline void resolve_fixups()
+    {
+        [[maybe_unused]] const object_identity_type newval = tensorconn.read_identity(*this);
+        assert(newval == 0);
+    }
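+    // [editor's note, not in the original header] Construction sketch per the
+    // ctor contract above; 'blob' and 'blob_len' are illustrative names, and
+    // the Graph entry point is hypothetical:
+    //     hnnx::Deserializer dctx(blob, blob_len); // g == nullptr here
+    //     // the Graph deserialize-ctor that receives dctx must immediately
+    //     // call dctx.set_graph(*this) before reading anything else.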
+    constexpr bool is_aligned_const_format() const { return aligned_const_format_flag; }
+    void set_aligned_const_format(const bool v = true) { aligned_const_format_flag = v; }
+
+    constexpr bool is_shared_dynamic_tensor_shape_format() const { return shared_dynamic_tensor_shape; }
+    void set_shared_dynamic_tensor_shape_format(const bool v = true) { shared_dynamic_tensor_shape = v; }
+
+    PUSH_WARNING()
+    DISABLE_WARNING("-Wcast-qual", MSVC_NO_EQUIV)
+    // valid when the entire pickle, in const_extent format, is loaded as a single, persistent dma buffer
+    inline unsigned char *get_weight_pointer() { return ((unsigned char *)bufstart) + (4 * pickle_len_words); };
+    POP_WARNING()
+    inline size_t get_weight_size() { return (bufend - bufstart) - (4 * pickle_len_words); };
+
+    inline op_deserialize_fn_list_t &get_op_deserialize_fn_list() { return op_deserialize_fn_list; }
+    inline tensor_deserialize_fn_list_t &get_tensor_deserialize_fn_list() { return tensor_deserialize_fn_list; }
+
+    // The next 4 methods are used to support 'deserialize_by_segments'.
+    // 'get_forward_span' returns a 'deser_segment_span' (pair of pointers) for a region of deserialized data
+    // from 'ref + start' up to 'ref + end', where start and end (0 <= start < end) are byte offsets
+    // relative to some position 'ref' in the deserialized data, and 'ref' is the value which bytes_consumed()
+    // returned at that reference point. All should be multiples of 4.
+    deser_segment_span get_forward_span(size_t ref, size_t start, size_t end);
+    // used to get a reference point for bytes_consumed
+    size_t bytes_consumed() const { return bufp - bufstart; }
+    // used to skip past the last 'get_forward_span' we did
+    void skip_to_after_span(deser_segment_span const &);
+    // resize tables: tensor, shared_obj, linktable, according to the info in final_segdesc
+    void resize_object_tables(runlist_auxdata_seg_desc const &final_desc);
+
+    uint32_t crate_size_according_to_segments() const;
+
+  protected:
+    std::vector objindex; // index of pointers to shape, etc.
+    // the state of the 'tensor connectivity' deserialize engine.
+    DeserTensorConn tensorconn;
+    bool aligned_const_format_flag = false;
+    bool shared_dynamic_tensor_shape = false;
+
+    // this is used in 'deserialize_str', so it ideally should be in Deserz; but
+    // it's pretty large, so we put it here and forbid calling deserialize_str
+    // on a Deserz which is not really a Deserializer. We only use it to decode
+    // 'classic' pickles, so this is ok.
+    char name_buf[4096]; // used for string view
+
+    // do the reference fixups on a segment. Return true if OK.
+    // See Deserz::apply_segment_fixups for the public API.
+    static bool do_segment_fixups(runlist_fixup_state &seginfo, Deserz const &dctx0);
+
+  public:
+    inline constexpr bool classic_format() const { return format_version == 0; }
+    inline void set_format_2307() { format_version = 1; }
+
+    // This is called when a 'class index' Aux Data is encountered.
+    // It must deserialize exactly the indicated number of payload words.
+    // is_tensor = false for "Co" (op class index), and true for "Ct" (tensor class index)
+    API_EXPORT void auxdata_class_index(unsigned payload_words, bool is_tensor);
+    //
+    // called when an 'Nt' Aux Data is encountered, which provides some array sizes for the
+    // deserialization.
+    // It must deserialize exactly the indicated number of payload words.
+    API_EXPORT void auxdata_temparr_sizes(unsigned payload_words);
+    // Called when an 'AuxTag_deserializeSegments' is encountered. If it likes
+    // the record, it will set up the 'segments' object.
+    API_EXPORT void auxdata_deserialize_segments(unsigned payload_words);
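+    // [editor's note, not in the original header] Sketch of how the segment
+    // machinery above fits together ('seg' and 'worker' are illustrative);
+    // offsets are byte offsets, multiples of 4, from one reference point:
+    //     size_t const ref = dctx.bytes_consumed(); // once, at the runlist
+    //     auto const span = dctx.get_forward_span(ref, seg.start, seg.end);
+    //     worker.setup_source_span(span);           // per-thread Deserz
+    //     dctx.skip_to_after_span(span);            // main cursor moves on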
+    // called when a 'KS' Aux Data is encountered, which provides a const_extent_descriptor.
+    // It must deserialize exactly the indicated number of payload words.
+    API_EXPORT int auxdata_read_const_extent_descriptor(const unsigned payload_words);
+    // helper for the above. payload_words is the length WITH PADDING
+    API_EXPORT int extract_const_extent_name(const unsigned payload_words, std::string &retVal);
+
+    // Extract a std::vector containing the 'const extent descriptor' table,
+    // from a given offset (in units of 32-bit words) relative to the start of the pickle,
+    // or from a separate pointer (if a separate buffer for the weights was passed in).
+    // This does not affect the current position.
+    // If there is a problem, it returns an empty vector; the caller *must* check and report.
+    // This uses hnnx::const_extent_hdr_check to understand how much it should read,
+    // and to do a basic check.
+    API_EXPORT std::vector extract_const_extent_table(size_t posn_in_words);
+    std::vector extract_const_extent_table(hexagon_nn_wide_address_const_t weight_data,
+                                           const size_t weight_size);
+    // given a destination char pointer, pre-filled with NULs, fills it in with the name of the const_extent.
+    // caller must provide a destination of sufficient length
+    std::string name_from_weight_data(hexagon_nn_wide_address_const_t weight_data, const uint32_t weight_length);
+    // helper func for the above. returns -1 if the name is not present.
+    std::string get_name(hexagon_nn_wide_address_const_t weight_data, const uint32_t weight_length);
+    // given a vector of weight_data buffers, stores them all in the appropriate map
+    void store_named_weight_bufs(const hexagon_nn_wide_address_const_t *const buffers, const uint64_t *const lengths,
+                                 const unsigned num_buffers);
+    //
+    // copy 'len' bytes of data at offset offs_bytes in the pickle into location dstp.
+    // returns true if it's possible. You can maybe pass a DMA_Manager to have it queued...
+    // offs_bytes is defined as uint64_t to support possible 'far' data on hexagon.
+    API_EXPORT bool extract_const_extent_data(uint64_t offs_bytes, size_t len, void *dstp, DMA_Manager *dma = nullptr);
+    // same, using an external const_extent
+    bool extract_const_extent_data(uint64_t offs_bytes, size_t len, void *dstp,
+                                   hexagon_nn_wide_address_const_t weight_data, const size_t weight_length);
+
+    // This extracts the 'objindex', when it is needed e.g. to 'patch' interfaces.
+    // Must be done only after deserializing, and can only be done once.
+    std::vector extract_objindex() { return std::move(objindex); }
+
+    DeserSegDescs segments; // array of runlist_seg_descriptor, empty if not doing multiseg.
+
+    // this is used to pass the offset of the const-extent-descriptor (recorded as pickle_len)
+    // to the alloc->deserialize.
+    size_t pickle_len_words;
+
+    // OPTIONAL maps from weight buffer names to the descriptors and the buffers, respectively
+    cexdesc_deserializer_map named_cexdescs;
+    weight_buf_deserializer_map named_weight_bufs;
+
+    void *uncached_ptr;
+    uint32_t uncached_len;
+
+    std::vector op_deserialize_fn_list;
+    std::vector tensor_deserialize_fn_list;
+
+    // used to 'link' shared blocktables during deser.
+    std::vector blocktable_link_table;
+};
+
+/////////////////
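+// [editor's note, not in the original header] Sketch of copying weight bytes
+// out of a const-extent pickle with the methods above ('offs' and 'len' are
+// illustrative; error handling elided):
+//     std::vector<unsigned char> dst(len);
+//     bool const ok = dctx.extract_const_extent_data(offs, len, dst.data());
+//     // pass a DMA_Manager as the 4th argument to have the copy queued
+//     // rather than performed inline.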
+// true if this Deserz is really an instance of Deserializer.
+inline constexpr bool Deserz::is_base_deser() const
+{
+    return static_cast<Deserz const *>(full_deser) == this;
+}
+
+inline bool Deserz::is_aligned_const_format() const
+{
+    return full_deser->aligned_const_format_flag;
+}
+inline bool Deserz::is_shared_dynamic_tensor_shape_format() const
+{
+    return full_deser->shared_dynamic_tensor_shape;
+}
+inline Deserz::op_deserialize_fn_list_t &Deserz::get_op_deserialize_fn_list()
+{
+    return full_deser->op_deserialize_fn_list;
+}
+inline Deserz::tensor_deserialize_fn_list_t &Deserz::get_tensor_deserialize_fn_list()
+{
+    return full_deser->tensor_deserialize_fn_list;
+}
+inline std::vector &Deserz::get_blocktable_link_table()
+{
+    return full_deser->blocktable_link_table;
+}
+// For these in Deserz, we must call the corresponding methods on the
+// tensorconn in 'full_deser', but must pass 'this' as the first parameter.
+inline void Deserz::deserialize_tensor_def(Tensor const *const tensor_ptr)
+{
+    full_deser->tensorconn.tensor_def(*this, tensor_ptr);
+}
+inline void Deserz::deserialize_tensor_ref(Tensor const *&where)
+{
+    full_deser->tensorconn.tensor_ref(*this, where);
+}
+inline void Deserz::deserialize_tensor_refs(Tensor const **const ptrs, const unsigned n)
+{
+    full_deser->tensorconn.tensor_refs(*this, ptrs, n);
+}
+inline DeserSegDescs const &Deserz::get_segments() const
+{
+    return full_deser->segments;
+}
+
+// unaligned read of 64 bits (two 32-bit aligned reads)
+template <> inline uint64_t Deserz::simple_deserialize<uint64_t>()
+{
+    char const *const curr_p = bufp;
+    if (curr_p + 8u > buf_limit) {
+        return deser_u64_slowpath();
+    }
+    uint32_t const *const p = (uint32_t const *)(curr_p);
+    bufp = curr_p + 8u;
+    return p[0] + ((uint64_t)p[1] << 32);
+}
+inline uint64_t Deserz::deserialize_uint64()
+{
+    return simple_deserialize<uint64_t>();
+}
+
+template <> inline uint64_t Deserz::deserialize_type<uint64_t>()
+{
+    return deserialize_uint64();
+}
+template <> inline float Deserz::deserialize_type<float>()
+{
+    return deserialize_float();
+}
+// sometimes uint32_t is unsigned long, sometimes it's unsigned;
+// sometimes unsigned long is uint64. Hopefully this covers it all.
+#if ULONG_MAX == UINT_MAX
+template <> inline unsigned long Deserz::deserialize_type<unsigned long>()
+{
+    return deserialize_uint32();
+}
+template <> inline long Deserz::deserialize_type<long>()
+{
+    return deserialize_int32();
+}
+#endif
+template <> inline unsigned Deserz::deserialize_type<unsigned>()
+{
+    return deserialize_uint32();
+}
+template <> inline int Deserz::deserialize_type<int>()
+{
+    return deserialize_int32();
+}
+template <> inline int16_t Deserz::deserialize_type<int16_t>()
+{
+    return deserialize_int16();
+}
+template <> inline uint16_t Deserz::deserialize_type<uint16_t>()
+{
+    return deserialize_uint16();
+}
+template <> inline int8_t Deserz::deserialize_type<int8_t>()
+{
+    return deserialize_int8();
+}
+template <> inline uint8_t Deserz::deserialize_type<uint8_t>()
+{
+    return deserialize_uint8();
+}
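+// [editor's note, not in the original header] The uint64_t specialization
+// above pins down the wire format: a u64 is stored as two 32-bit words, low
+// word first, e.g. 0x0000000100000002 is stored as {0x00000002, 0x00000001}
+// and reassembled as p[0] + ((uint64_t)p[1] << 32). Reads stay 4-byte
+// aligned even though the value spans 8 bytes (hence "unaligned read").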
+// assert( dctx.deserialize_uint32() == SOME_CONST );
+// is not safe, since if you turn off asserts, it will no longer read the 4 bytes.
+// This macro allows that pattern to work:
+#define DESERIALIZE_ASSERT_UINT32(DCTX, VAL)                                                                           \
+    do {                                                                                                               \
+        uint32_t const tmp [[gnu::unused]] = (DCTX).deserialize_uint32();                                              \
+        assert(tmp == (VAL));                                                                                          \
+    } while (0)
+
+#include "weak_linkage.h"
+PUSH_VISIBILITY(default)
+
+/**
+ * @brief register the deserialization function for each \ref Op.
+ * TypicalOp- and VariadicOp-derived classes are instantiated via
+ * templates, hence the need to create a map of deserialize functions
+ * for each Op when they are generated at library initialization
+ *
+ * @param[in] tinf Op type_info that is used to key the map
+ * @param[in] fn Deserialize function
+ */
+API_EXPORT void deserialize_op_register(std::type_info const *tinf, const std::string_view type_tag,
+                                        const op_deserializer_fn &fn, bool is_external = false);
+/**
+ * @brief register the deserialization function for each \ref Tensor.
+ * Since \ref Tensor derived classes are instantiated via templates, there
+ * is a need to create a map of deserialize functions for each Tensor at runtime
+ *
+ * @param[in] type_tag Tensor type tag that is used to key the map
+ * @param[in] fn Deserialize function
+ */
+API_FUNC_EXPORT void deserialize_tensor_register(std::type_info const &tinf, const char *type_tag,
+                                                 tensor_deserializer_fn fn);
+
+POP_VISIBILITY()
+
+// this is fully defined in serialize_register.h
+template struct deserialize_tensor_using_constructor;
+
+// this is fully defined in serialize_register.h
+template struct alloc_func_for_op;
+template struct dealloc_func_for_op;
+
+//////////////////////
+// Forward decls of things defined in template_help.h
+//
+// contains_type< std::tuple<a, b, c, ...>, x >::value: true if x is in a,b,c ...
+// no 'remove ref' etc. is done.
+template struct contains_type;
+template struct not_contains_type;
+template