diff --git a/CMakeLists.txt b/CMakeLists.txt
index a313206351677..4532d40bf632f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -107,6 +107,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
 llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
 llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
 llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
+llama_option_depr(WARNING LLAMA_QNN GGML_QNN)
 
 #
 # build the library
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 532534bcb97e3..bf562db79c988 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -153,6 +153,7 @@ option(GGML_SYCL "ggml: use SYCL"
 option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
 set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device")
+option(GGML_QNN "ggml: use QNN" OFF)
 
 # extra artifacts
 option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
@@ -165,7 +166,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 
-if (GGML_SYCL)
+if (GGML_SYCL OR GGML_QNN)
     set(CMAKE_CXX_STANDARD 17)
 else()
     set(CMAKE_CXX_STANDARD 11)
diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h
new file mode 100644
index 0000000000000..b8c7da8fbbf87
--- /dev/null
+++ b/ggml/include/ggml-qnn.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "ggml.h"
+
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_QNN_MAX_DEVICES 3
+
+enum QNNBackend {
+    QNN_BACKEND_CPU = 0,
+    QNN_BACKEND_GPU,
+    QNN_BACKEND_NPU,
+    QNN_BACKEND_GGML, // "fake" QNN backend, used to compare performance between
+                      // QNN and the original GGML
+};
+
+/**
+ *
+ * @param dev_num 0: QNN_BACKEND_CPU, 1: QNN_BACKEND_GPU, 2: QNN_BACKEND_NPU
+ * @param extend_lib_search_path extended library search path used to locate the QNN backend dynamic libs
+ * @return
+ */
+GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char *extend_lib_search_path);
+
+GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend);
+
+GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts);
+
+GGML_API int ggml_backend_qnn_get_device_count(void);
+
+GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size);
+
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index cd2dcd0660d3a..cd2f1936f13a8 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -898,6 +898,36 @@ if (GGML_CANN)
     endif()
 endif()
 
+if (GGML_QNN)
+    if (CMAKE_SYSTEM_NAME STREQUAL "Android")
+        find_library(LOG_LIB log)
+        find_library(ANDROID_LIB android)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${LOG_LIB} ${ANDROID_LIB})
+        set(GGML_QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
+    else()
+        message(FATAL_ERROR "QNN is currently only available on Android")
+    endif()
+
+    string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${GGML_QNN_DEFAULT_LIB_SEARCH_PATH}")
+    add_compile_definitions(GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${GGML_QNN_DEFAULT_LIB_SEARCH_PATH}/")
+    if (NOT DEFINED GGML_QNN_SDK_PATH)
+        # try to read it from the environment variable
+        if (DEFINED ENV{QNN_SDK_PATH})
+            set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH})
+        else()
+            message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined")
+        endif()
+    endif()
+
+    message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}")
+    file(GLOB GGML_SOURCES_QNN "ggml-qnn/*.cpp")
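+    # Illustrative only (paths here are hypothetical, not checked by this file): a typical
+    # Android cross-build that enables this backend looks roughly like
+    #   cmake .. -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+    #            -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-31 \
+    #            -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=/opt/qcom/qnn-sdk
+    # If GGML_QNN_SDK_PATH is not given, the QNN_SDK_PATH environment variable is used instead.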
+ list(APPEND GGML_SOURCES_QNN "ggml-qnn.cpp") + set(GGML_HEADERS_QNN ../include/ggml-qnn.h) + set(QNN_INC_PATH ${GGML_QNN_SDK_PATH}/include/QNN) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${QNN_INC_PATH} "ggml-qnn") + list(APPEND GGML_CDEF_PUBLIC GGML_USE_QNN) +endif() + function(get_flags CCID CCVER) set(C_FLAGS "") set(CXX_FLAGS "") @@ -1315,6 +1345,7 @@ add_library(ggml ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} ${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN} + ${GGML_SOURCES_QNN} ${GGML_HEADERS_QNN} ggml-aarch64.c ggml-aarch64.h ) diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index b5d9301a78762..4524820a5397d 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -449,6 +449,11 @@ GGML_CALL static void ggml_backend_registry_init(void) { extern GGML_CALL int ggml_backend_cann_reg_devices(void); ggml_backend_cann_reg_devices(); #endif + +#ifdef GGML_USE_QNN + extern GGML_CALL void ggml_backend_qnn_reg_devices(void); + ggml_backend_qnn_reg_devices(); +#endif } GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp new file mode 100644 index 0000000000000..6ed5ecb2e03f2 --- /dev/null +++ b/ggml/src/ggml-qnn.cpp @@ -0,0 +1,469 @@ +#include "ggml-qnn.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml-backend-impl.h" + +#include "ggml-qnn/backend-ops.hpp" +#include "ggml-qnn/backend.hpp" +#include "ggml-qnn/logger.hpp" +#include "ggml-qnn/tensor.hpp" +#include "ggml-qnn/utils.hpp" + +// ================================================================================================= +// +// self-defined macro / data structure +// +// ================================================================================================= +#ifdef NDEBUG +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#else +#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +#endif + +#define QNN_BACKEND_NAME "qnn" + +// according to the QNN SDK Reference Guide, +// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend +// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend +// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +// HTA - Choose a quantized model. 
Quantized models are required when running on the HTA backend +// +// only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently, +// CPU: Qualcomm Kryo CPU +// GPU: Qualcomm Adreno GPU +// NPU: Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + +// HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) + +static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { + ggml_backend_qnn_context(QNN_BACKEND_CPU, 1, "qnn-cpu", "libQnnCpu.so"), /* QNN_BACKEND_CPU */ + ggml_backend_qnn_context(QNN_BACKEND_GPU, 1, "qnn-gpu", "libQnnGpu.so"), /* QNN_BACKEND_GPU */ + ggml_backend_qnn_context(QNN_BACKEND_NPU, 1, "qnn-npu", "libQnnHtp.so"), /* QNN_BACKEND_NPU */ +}; + +class ggml_backend_qnn_buffer_context { +public: + ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) : + _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { + + size_t size_page = sysconf(_SC_PAGESIZE); + + // TODO: for qnn npu, a better way here is to reuse the buffer allocated by qnn rpc, will save an extra copy + _buffer = qnn::align_alloc(size_page, size); + + if (!_buffer) { + QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20))); + return; + } + + _buffer_size = size; + } + + ~ggml_backend_qnn_buffer_context() { + // the free will do nothing if the _buffer is nullptr + qnn::align_free(_buffer); + } + + bool is_valid() const { return _buffer != nullptr; } + + void *get_buffer() { return _buffer; } + size_t get_buffer_size() { return _buffer_size; } + +private: + std::shared_ptr _instance; + std::string _name; + void *_buffer = nullptr; + size_t _buffer_size = 0; +}; + +struct ggml_backend_qnn_buffer_type_context { + size_t device; + std::string name; +}; + +// ================================================================================================= +// +// implementation of QNN backend for GGML +// +// ================================================================================================= +static bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { + return qnn::ggml_qnn_forward(ctx, tensor); +} + +static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return "QNN"; +} + +GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; +} + +GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + + delete ctx; +} + +GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + + return ctx->get_buffer(); +} + +GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { + // Do nothing here, the qnn tensor will be create along with the graph. 
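+    // (The backing Qnn_Tensor_t objects are created and bound lazily when the op graph is
+    //  built and executed, see ggml_qnn_graph in ggml-qnn/graph.hpp and ggml-qnn/tensor.hpp,
+    //  so there is nothing to set up per tensor at allocation time.)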
+ GGML_UNUSED(buffer); + GGML_UNUSED(tensor); +} + +GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, + const void *data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + +GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, + void *data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + +GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor *src, + struct ggml_tensor *dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + +GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + + memset(ctx->get_buffer(), value, ctx->get_buffer_size()); +} + +static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { + /* .get_name = */ ggml_backend_qnn_buffer_get_name, + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + /* .reset = */ nullptr, +}; + +GGML_CALL static const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return "QNN"; +} + +GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + ggml_backend_qnn_buffer_type_context *buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; + ggml_backend_qnn_buffer_context *ctx = + new ggml_backend_qnn_buffer_context((QNNBackend)buft_ctx->device, g_qnn_mgr[buft_ctx->device].instance, size); + if (!ctx->is_valid()) { + return nullptr; + } + + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); +} + +GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return 32; +} + +// TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android +GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + + return (96 * 1024 * 1024); +} + +GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return true; +} + +GGML_CALL static const char *ggml_backend_qnn_name(ggml_backend_t backend) { + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; + return g_qnn_mgr[ctx->device].name; +} + +GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { + QNN_LOG_INFO("enter %s", __func__); + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; + QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + + auto instance = g_qnn_mgr[ctx->device].instance; + if (instance) { + ctx->qnn_graph_cache.clear(); + instance->qnn_finalize(); + g_qnn_mgr[ctx->device].instance.reset(); + } + + if 
(g_qnn_mgr[ctx->device].backend != nullptr) { + delete backend; + g_qnn_mgr[ctx->device].backend = nullptr; + } + QNN_LOG_INFO("leave %s", __func__); +} + +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; + + return ggml_backend_qnn_buffer_type(ctx->device); +} + +GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { + enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; + GGML_UNUSED(ctx); + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || + node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + bool ok = ggml_qnn_compute_forward(ctx, node); + if (!ok) { + QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op)); + } + } + + return result; +} + +GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { + GGML_UNUSED(backend); + return qnn::ggml_qnn_supports_op(op); +} + +GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { + GGML_UNUSED(backend); + + size_t dims = ggml_n_dims(op); + bool can_offload = false; + for (size_t i = 0; i < dims; i++) { + if (op->ne[i] > 1) { + can_offload = true; + break; + } + } + + return can_offload; +} + +static ggml_backend_i ggml_backend_qnn_interface = { + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_update = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute, + /* .supports_op = */ ggml_backend_qnn_supports_op, + /* .supports_buft = */ nullptr, + /* .offload_op = */ ggml_backend_qnn_offload_op, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, + /* .event_synchronize = */ nullptr, +}; + +static ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; + return &guid; +} + +static ggml_backend_t ggml_backend_qnn_reg_init(const char *extend_lib_search_path, void *user_data) { + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int)(intptr_t)user_data, extend_lib_search_path); + return qnn_backend; +} + +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} + +void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { + GGML_ASSERT(ggml_backend_is_qnn(backend)); + + auto *ctx = (ggml_backend_qnn_context *)backend->context; + ctx->threads = n_threads; +} + +int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } + +void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size) { + if (nullptr == description || 0 == description_size) { + QNN_LOG_WARN("invalid 
param"); + return; + } + + if (dev_num >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_WARN("invalid param"); + return; + } + + snprintf(description, description_size, "%s", g_qnn_mgr[dev_num].name); +} + +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { + if (device >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_DEBUG( + "ggml_backend_qnn_buffer_type error: device_index:%d is " + "out of range [0, %d]\n", + device, GGML_QNN_MAX_DEVICES - 1); + return nullptr; + } + + static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; + static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; + static bool ggml_backend_qnn_buffer_type_initialized = false; + if (!ggml_backend_qnn_buffer_type_initialized) { + for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + auto &context = ggml_backend_qnn_buffer_type_contexts[i]; + context = { i, std::string(QNN_BACKEND_NAME) + std::to_string(i) }; + ggml_backend_qnn_buffer_types[i] = { + /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_qnn_buffer_is_host }, + /* .context = */ &context, + }; + } + ggml_backend_qnn_buffer_type_initialized = true; + } + + return &ggml_backend_qnn_buffer_types[device]; +} + +ggml_backend_t ggml_backend_qnn_init(size_t device, const char *extend_lib_search_path) { + int result = 0; + + if (!extend_lib_search_path) { + extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; + QNN_LOG_WARN("extend_lib_search_path is nullptr, will use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); + } + + QNN_LOG_DEBUG("device %d", device); + QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); + if (device >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_ERROR("invalid device %d", device); + return nullptr; + } + + std::string path = extend_lib_search_path; + +// TODO: Fix this for other platforms +#if defined(__ANDROID__) || defined(ANDROID) + if (QNN_BACKEND_NPU == device) { + if (0 == setenv("LD_LIBRARY_PATH", + (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" + "dsp:/vendor/dsp/images") + .c_str(), + 1)) { + QNN_LOG_INFO("QNN NPU backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN NPU backend setenv failure"); + } + if (0 == setenv("ADSP_LIBRARY_PATH", + (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" + "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") + .c_str(), + 1)) { + QNN_LOG_INFO("QNN NPU backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN NPU backend setenv failure"); + } + } else { + if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { + QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device)); + } else { + QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); + } + } +#endif + + auto instance = std::make_shared(extend_lib_search_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); + if (result != 0) { + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); + return nullptr; + } + auto qnn_interface = instance->get_qnn_interface(); + if (!qnn_interface) { + QNN_LOG_WARN("qnn subsystem failure\n"); + return nullptr; + } + + std::string device_name = 
qnn::get_backend_name(device); + QNN_LOG_INFO("qnn device name %s", device_name.c_str()); + auto &qnn_device = g_qnn_mgr[device]; + qnn_device.instance = instance; + qnn_device.qnn_interface = qnn_interface; + qnn_device.socinfo = instance->get_soc_info(); + + ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device] }; + g_qnn_mgr[device].backend = qnn_backend; + + return qnn_backend; +} + +extern "C" GGML_CALL void ggml_backend_qnn_reg_devices(); + +GGML_CALL void ggml_backend_qnn_reg_devices() { + for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { + char name[GGML_MAX_NAME]; + ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); + ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), + (void *)(intptr_t)idx); + } +} diff --git a/ggml/src/ggml-qnn/.clang-format b/ggml/src/ggml-qnn/.clang-format new file mode 100644 index 0000000000000..3b933ff10db42 --- /dev/null +++ b/ggml/src/ggml-qnn/.clang-format @@ -0,0 +1,31 @@ +--- +BasedOnStyle: Google +IndentWidth: 4 +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignOperands: true +AlignTrailingComments: true +BinPackArguments: true +BinPackParameters: true +BreakBeforeBraces: Custom +BreakConstructorInitializers: AfterColon +ColumnLimit: 120 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '^"ggml\.h"' + Priority: 3 + - Regex: '^"ggml-.+\.h"' + Priority: 4 + - Regex: '.*' + Priority: 5 +KeepEmptyLinesAtTheStartOfBlocks: true +MaxEmptyLinesToKeep: 1 +PointerAlignment: Right +SortIncludes: true +SpacesBeforeTrailingComments: 1 +UseTab: Never \ No newline at end of file diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp new file mode 100644 index 0000000000000..5829e0fadbe92 --- /dev/null +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -0,0 +1,608 @@ + +#include "backend-ops.hpp" + +#include + +#include "graph.hpp" +#include "logger.hpp" +#include "op-config.hpp" +#include "tensor.hpp" +#include "utils.hpp" + +#ifndef NDEBUG + +namespace { + +bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { + if (!ctx || !src || !dst) { + QNN_LOG_WARN("invalid params\n"); + return false; + } + + auto instance = ctx->instance; + if (!instance) { + QNN_LOG_WARN("invalid instance\n"); + return false; + } + + return true; +} + +bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { + if (!ctx || !src0 || !src1 || !dst) { + QNN_LOG_WARN("invalid params\n"); + return false; + } + + auto instance = ctx->instance; + if (!instance) { + QNN_LOG_WARN("invalid instance\n"); + return false; + } + + return true; +} + +bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { + const auto dim_l = ggml_n_dims(l); + if (dim_l != ggml_n_dims(r)) { + return false; + } + + for (int i = 0; i < dim_l; i++) { + if (l->ne[i] != r->ne[i]) { + return false; + } + } + + return true; +} + +void print_ggml_tensor(const ggml_tensor *tensor) { + QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), + (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], + (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]); +} + +} // 
namespace + +#define CHECK_PARAMS(ctx, ...) \ + if (!qnn_is_valid_params((ctx), __VA_ARGS__)) { \ + return false; \ + } + +#else +#define CHECK_PARAMS(ctx, ...) +#endif + +namespace { + +typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst); +typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, + ggml_tensor *dst); + +typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; +typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; + +constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; + +template +qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array &array) { + return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size); +} + +template +bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, + const std::array &outputs) { + if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) { + QNN_LOG_WARN("execute failed\n"); + return false; + } + + return true; +} + +template +std::string get_graph_key(const std::string &op_name, const std::array &inputs, + const std::array &outputs) { + constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) { + char buffer[256] = {}; + snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3]); + key += buffer; + }; + + std::string graph_key(op_name); + for (auto &input : inputs) { + append_dimensions(graph_key, input); + } + for (auto &output : outputs) { + append_dimensions(graph_key, output); + } + + return graph_key; +} + +qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) { + if (op_name == QNN_OP_MAT_MUL) { + // For QNN_OP_MAT_MUL, we need to transpose the input tensor + return [](const std::string &name) { + auto config = std::make_unique(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL); + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_BOOL_8; + scalar.bool8Value = true; + config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar); + QNN_LOG_DEBUG("add scalar param %s\n", QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0); + return config; + }; + } + + return [op_name](const std::string &name) { + return std::make_unique(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name); + }; +} + +constexpr const char *kGgmlOpToQnnOp[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + QNN_OP_ELEMENT_WISE_SUBTRACT, // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_DIVIDE, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT + QNN_OP_ELEMENT_WISE_LOG, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // 
GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + QNN_OP_GELU, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP +}; + +static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kGgmlOpToQnnOp table"); +static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, + "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); + +template +qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, + const std::array &inputs, + const std::array &outputs) { + GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); + + auto &graph_cache = ctx->qnn_graph_cache; + const auto *op_name = + op < kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); + auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); + auto it = graph_cache.find(graph_key); + qnn::ggml_qnn_graph *graph_ptr = nullptr; + if (it != graph_cache.end()) { + QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); + graph_ptr = it->second.get(); + } else { + auto graph = std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance, + ctx->socinfo.vtcm_size_in_mb); + if (!graph->is_valid()) { + return nullptr; + } + + auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]); + if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), + to_ggml_tensor_array<_OutputSize>(outputs))) { + QNN_LOG_ERROR("build_graph failed\n"); + return nullptr; + } + + graph_ptr = graph.get(); + graph_cache[graph_key] = std::move(graph); + } + + return graph_ptr; +} + +template +bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { + static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); + + CHECK_PARAMS(ctx, src0, src1, dst); + + bool succeed = false; + auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst }); + if (graph_ptr) { + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); + } + +#ifndef NDEBUG + if (!succeed) { + print_ggml_tensor(src0); + print_ggml_tensor(src1); + print_ggml_tensor(dst); + } +#endif + + return succeed; +} + +template +bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst) { + static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); + + CHECK_PARAMS(ctx, src, dst); + + bool succeed = false; + auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst }); + if (graph_ptr) { + succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst }); + } + +#ifndef NDEBUG + if (!succeed) { + print_ggml_tensor(src); + print_ggml_tensor(dst); + } +#endif + + return succeed; +} +constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + nullptr, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + qnn_unary_op_impl, // GGML_OP_SQRT + qnn_unary_op_impl, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + nullptr, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // 
GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + qnn_unary_op_impl, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP +}; + +static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table"); + +static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_binary_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + qnn_binary_op_impl, // GGML_OP_SUB + qnn_binary_op_impl, // GGML_OP_MUL + qnn_binary_op_impl, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + qnn_binary_op_impl, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // 
GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK +}; + +static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT, + "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); + +} // namespace + +namespace qnn { + +bool ggml_qnn_supports_op(const ggml_tensor *op) { + if (op->op == GGML_OP_UNARY) { + if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { + QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); + return false; + } + + if (!op->src[0]) { + QNN_LOG_DEBUG("src0 is nullptr"); + return false; + } + } else if (op->op != GGML_OP_NONE) { + if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { + QNN_LOG_DEBUG("unsupported op %d", op->op); + return false; + } + + if (!op->src[0] || !op->src[1]) { + QNN_LOG_DEBUG("src0 or src1 is nullptr"); + return false; + } + +#ifndef NDEBUG + if (op->op == GGML_OP_ADD && !is_tensor_dimensions_equal(op->src[0], op->src[1])) { + QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + return false; + } +#endif + } + + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_I8: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + break; + default: + QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type); + return false; + } + + return true; +} + +bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { + size_t unary_op_idx = tensor->op; + if (tensor->op == GGML_OP_UNARY) { + unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; + if (unary_op) { + return unary_op(ctx, tensor->src[0], tensor); + } + + auto binary_op = kQnnBinaryOpsTable[tensor->op]; + if (binary_op) { + return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + } + + QNN_LOG_WARN("unsupported op %s", ggml_op_desc(tensor)); + return false; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp new file mode 100644 index 0000000000000..ed4ce994f787b --- /dev/null +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include "ggml.h" + +#include "backend.hpp" + +namespace qnn { + +bool ggml_qnn_supports_op(const ggml_tensor *op); +bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor); + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp new file mode 100644 index 0000000000000..b2f93a8f7a9e5 --- /dev/null +++ b/ggml/src/ggml-qnn/backend.hpp @@ -0,0 +1,34 @@ + +#pragma once + +#include +#include + +#include "ggml.h" + +#include "ggml-backend.h" + +#include "graph.hpp" +#include "qnn-lib.hpp" 
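+// The graph cache declared below is keyed by the op name plus the input/output shapes
+// (see get_graph_key() in backend-ops.cpp), so each unique op/shape combination builds and
+// finalizes its QNN graph once and reuses it on subsequent ggml graph evaluations.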
+ +namespace qnn { +typedef std::unordered_map> ggml_qnn_graph_cache_t; +} // namespace qnn + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + ggml_backend *backend = nullptr; + qnn::qcom_socinfo socinfo = {}; + std::shared_ptr instance; + std::shared_ptr qnn_interface; + qnn::ggml_qnn_graph_cache_t qnn_graph_cache; + + explicit ggml_backend_qnn_context(int device, int threads, const char *name, const char *lib) : + device(device), threads(threads) { + strncpy(this->name, name, GGML_MAX_NAME); + strncpy(this->lib, lib, GGML_MAX_NAME); + } +}; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp new file mode 100644 index 0000000000000..4b4b2daaa75b4 --- /dev/null +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -0,0 +1,55 @@ +#pragma once + +#include + +#include "logger.hpp" +#include "qnn-lib.hpp" + +namespace qnn { +class ggml_qnn_rpc_buffer { +public: + ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, size_t size, uint32_t rank, uint32_t *dimensions, + Qnn_DataType_t data_type) : + _qnn_instance(qnn_instance), _size(size) { + + _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); + _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); + if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) { + QNN_LOG_WARN("register rpc mem failure\n"); + // let the destructor free the buffer + return; + } + + QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", _qnn_rpc_buffer, (int)size); + } + ~ggml_qnn_rpc_buffer() { + if (_qnn_instance) { + if (_qnn_rpc_mem_handle) { + _qnn_instance->unregister_rpcmem(_qnn_rpc_mem_handle); + } + + if (_qnn_rpc_buffer) { + _qnn_instance->free_rpcmem(_qnn_rpc_buffer); + } + } + } + + bool is_valid() const { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } + + uint8_t *get_buffer() const { return _qnn_rpc_buffer; } + size_t get_size() const { return _size; } + Qnn_MemHandle_t get_mem_handle() const { return _qnn_rpc_mem_handle; } + +private: + std::shared_ptr _qnn_instance; + size_t _size = 0; + uint8_t *_qnn_rpc_buffer = nullptr; + Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; + + ggml_qnn_rpc_buffer(const ggml_qnn_rpc_buffer &) = delete; + void operator=(const ggml_qnn_rpc_buffer &) = delete; + ggml_qnn_rpc_buffer(ggml_qnn_rpc_buffer &&) = delete; + void operator=(ggml_qnn_rpc_buffer &&) = delete; +}; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp new file mode 100644 index 0000000000000..3f1a0ef163208 --- /dev/null +++ b/ggml/src/ggml-qnn/graph.hpp @@ -0,0 +1,233 @@ + +#pragma once + +#include +#include +#include +#include +#include + +#include "ggml-qnn.h" + +#include "logger.hpp" +#include "op-config.hpp" +#include "qnn-lib.hpp" +#include "tensor.hpp" + +namespace qnn { + +using ggml_tensor_array_t = std::vector; +using ggml_op_constructor_t = std::function(const std::string &)>; + +class ggml_qnn_graph { +public: + explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, + std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) : + _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + + auto qnn_interface = qnn_instance->get_qnn_interface(); + auto qnn_context = qnn_instance->get_qnn_context_handle(); + Qnn_ErrorHandle_t error = QNN_SUCCESS; + Qnn_GraphHandle_t graph_handle = nullptr; + if (device == QNN_BACKEND_NPU) { + // TODO: fix graph config here for NPU + 
QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t *graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr }; + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); + } else { + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); + } + + if (error != QNN_SUCCESS) { + QNN_LOG_INFO( + "can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); + return; + } + + QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); + _graph_handle = graph_handle; + _qnn_interface = qnn_interface; + } + + ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); } + + bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(op_constructor); + if (!is_valid()) { + QNN_LOG_ERROR("Invalid graph\n"); + return false; + } + + // get the max tensor rank + for (auto tensor : tensor_inputs) { + _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); + } + for (auto tensor : tensor_outputs) { + _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); + } + + QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str()); + _tensor_inputs.resize(tensor_inputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, GGML_MAX_NAME, "src%d", (int)i); + auto qnn_tensor = + std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); + auto *ggml_tensor = tensor_inputs[i]; + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _tensor_inputs[i] = qnn_tensor; + } + + _tensor_outputs.resize(tensor_outputs.size()); + for (size_t i = 0; i < tensor_outputs.size(); i++) { + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, GGML_MAX_NAME, "dst%d", (int)i); + auto qnn_tensor = + std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); + auto *ggml_tensor = 
tensor_outputs[i]; + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _tensor_outputs[i] = qnn_tensor; + } + + _op_config = op_constructor(_graph_name); + _op_config->set_input_tensors(_tensor_inputs); + _op_config->set_output_tensors(_tensor_outputs); + auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config()); + if (error != QNN_SUCCESS) { + auto *error_str = get_qnn_error_string(error); + if (error_str) { + QNN_LOG_ERROR("qnn_graph_add_node.error: %s\n", error_str); + } else { + QNN_LOG_ERROR("qnn_graph_add_node.error: %d\n", error); + } + return false; + } + + error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); + if (error != QNN_SUCCESS) { + auto *error_str = get_qnn_error_string(error); + if (error_str) { + QNN_LOG_ERROR("qnn_graph_finalize.error: %s\n", error_str); + } else { + QNN_LOG_ERROR("qnn_graph_finalize.error: %d\n", error); + } + return false; + } + + QNN_LOG_DEBUG("graph name %s, build_graph succeed", _graph_name.c_str()); + return true; + } + + bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); + GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + auto *ggml_tensor = tensor_inputs[i]; + if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + } + + for (size_t i = 0; i < tensor_outputs.size(); i++) { + auto *ggml_tensor = tensor_outputs[i]; + if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + } + + _op_config->set_input_tensors(_tensor_inputs); + _op_config->set_output_tensors(_tensor_outputs); + auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors(); + auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors(); + + auto error = + _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), + qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); + if (_device == QNN_BACKEND_NPU) { + if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); + } + } + + for (auto tensor : _tensor_inputs) { + tensor->unbind_ggml_tensor(); + } + + for (auto tensor : _tensor_outputs) { + tensor->unbind_ggml_tensor(); + } + + if (error != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", error); + return false; + } + + return true; + } + + bool is_valid() const { return _graph_handle != nullptr; } + + Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } + + const std::string &get_name() const { return _graph_name; } + +private: + const std::string _graph_name; + const QNNBackend _device; + Qnn_GraphHandle_t _graph_handle = nullptr; + std::shared_ptr _qnn_instance; + std::shared_ptr _qnn_interface; + std::vector> _tensor_inputs; + std::vector> _tensor_outputs; + std::unique_ptr _op_config; + std::vector _param_types; + int _tensor_rank = 0; + + DISABLE_COPY(ggml_qnn_graph); + DISABLE_MOVE(ggml_qnn_graph); +}; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp new file mode 100644 index 0000000000000..187e9088c779c --- /dev/null +++ b/ggml/src/ggml-qnn/logger.cpp @@ -0,0 +1,74 @@ + +#include "logger.hpp" + +#include + +#include + +#if defined(__ANDROID__) || defined(ANDROID) +#include +#endif + +void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char *func, int line, const char *format, + ...) { + static std::mutex qnn_internal_log_mutex; + static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(qnn_internal_log_mutex); + va_list args; + + va_start(args, format); + int len_prefix = snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (QNN_LOGBUF_LEN - len_prefix)) { +#if defined(__ANDROID__) || defined(ANDROID) + // for Android APK + __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); +#endif + // for Android command line application or WoA(Windows on ARM) + printf("%s\n", s_qnn_internal_log_buf); + } + va_end(args); + } +} + +#if ENABLE_QNNSDK_LOG +void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { + static std::mutex log_mutex; + static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; + + const char *log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = "ERROR"; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = "INFO"; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = "DEBUG"; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } + + double ms = (double)timestamp / 1000000.0; + { + std::lock_guard lock(log_mutex); + + memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); + QNN_LOG_INFO("%8.1fms [%-7s] %s", ms, log_level_desc, s_ggml_qnn_logbuf); + } +} +#else +void qnn::sdk_logcallback(const char *, QnnLog_Level_t, uint64_t, va_list) {} +#endif diff --git a/ggml/src/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/logger.hpp new file mode 100644 index 0000000000000..b4bab0c006691 --- /dev/null +++ b/ggml/src/ggml-qnn/logger.hpp @@ -0,0 +1,43 @@ +#pragma once + +#include + +#include "ggml.h" + +#include "QnnCommon.h" +#include "QnnInterface.h" +#include "QnnTypes.h" +#include "System/QnnSystemInterface.h" + +#define QNN_LOGBUF_LEN 4096 + +namespace 
qnn { +void internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...); + +void sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp); +} // namespace qnn + +// ================================================================================================= +// +// QNN backend internal log function +// +// ================================================================================================= +#define QNN_LOG_ERROR(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#define QNN_LOG_WARN(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#define QNN_LOG_INFO(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#ifdef NDEBUG +#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log +#else +#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log +#endif + +#if ENABLE_QNNBACKEND_DEBUG +#define QNN_LOG_DEBUG(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) +#endif diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp new file mode 100644 index 0000000000000..7852ee84dc12f --- /dev/null +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -0,0 +1,73 @@ +#pragma once + +#include +#include + +#include "ggml-qnn.h" + +#include "logger.hpp" +#include "qnn-lib.hpp" +#include "qnn-types.hpp" +#include "tensor.hpp" + +namespace qnn { +class ggml_qnn_op_config { +public: + explicit ggml_qnn_op_config(const std::string &name, const std::string &package_name, const std::string &op_type) : + _name(name), _package_name(package_name), _op_type(op_type) {} + + void set_input_tensors(const std::vector> &tensor_inputs) { + _qnn_tensor_inputs.resize(tensor_inputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + _qnn_tensor_inputs[i] = tensor_inputs[i]->get_qnn_tensor(); + } + } + + void set_output_tensors(const std::vector> &tensor_outputs) { + _qnn_tensor_outputs.resize(tensor_outputs.size()); + for (size_t i = 0; i < tensor_outputs.size(); i++) { + _qnn_tensor_outputs[i] = tensor_outputs[i]->get_qnn_tensor(); + } + } + + void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) { + _param_names.push_back(name); + Qnn_Param_t param = QNN_PARAM_INIT; + param.paramType = QNN_PARAMTYPE_SCALAR; + param.name = _param_names.back().c_str(); + param.scalarParam = scalar; + _parameters.push_back(param); + } + + std::vector &get_qnn_input_tensors() { return _qnn_tensor_inputs; } + std::vector &get_qnn_output_tensors() { return _qnn_tensor_outputs; } + + Qnn_OpConfig_t get_op_config() { + Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; + config.version = QNN_OPCONFIG_VERSION_1; + auto &op_config = config.v1; + op_config.name = _name.c_str(); + op_config.packageName = _package_name.c_str(); + op_config.typeName = _op_type.c_str(); + op_config.numOfParams = (uint32_t)_parameters.size(); + op_config.params = _parameters.data(); + op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); + op_config.outputTensors = _qnn_tensor_outputs.data(); + return config; + } + +private: + std::string _name; + std::string 
_package_name; + std::string _op_type; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + std::vector _parameters; + std::vector _param_names; + + DISABLE_COPY(ggml_qnn_op_config); + DISABLE_MOVE(ggml_qnn_op_config); +}; +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn-lib.cpp new file mode 100644 index 0000000000000..a7553c4ac2b75 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn-lib.cpp @@ -0,0 +1,35 @@ + +#include "qnn-lib.hpp" + +namespace qnn { + +qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle) : + _qnn_sys_interface(qnn_sys_interface), _lib_handle(lib_handle) { + qnn_system_context_create(&_qnn_system_handle); + if (_qnn_system_handle) { + QNN_LOG_INFO("initialize qnn system successfully\n"); + } else { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } +} + +qnn_system_interface::~qnn_system_interface() { + if (_qnn_system_handle) { + if (qnn_system_context_free(_qnn_system_handle) != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + } else { + QNN_LOG_WARN("system handle is null\n"); + } + + if (_lib_handle) { + int dlclose_error = dl_unload(_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dl_error()); + } + } else { + QNN_LOG_WARN("system lib handle is null\n"); + } +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp new file mode 100644 index 0000000000000..da986e2e4c4ff --- /dev/null +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -0,0 +1,906 @@ +#pragma once + +#include + +#include +#include +#include +#include + +// header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qnn-types.hpp" +#include "utils.hpp" + +namespace qnn { + +// TODO: those function should be moved to a separate file, and have separate implementation for each platform +typedef void *dl_handler_t; + +inline dl_handler_t dl_load(const std::string &lib_path) { return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); } + +inline void *dl_sym(dl_handler_t handle, const std::string &symbol) { return dlsym(handle, symbol.c_str()); } + +inline int dl_unload(dl_handler_t handle) { return dlclose(handle); } + +inline const char *dl_error() { return dlerror(); } + +template +Fn dl_sym_typed(dl_handler_t handle, const std::string &function_name) { + return reinterpret_cast(dl_sym(handle, function_name)); +} + +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ================================================================================================= + +// TODO: fix this for other compilers +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra-semi" + +class qnn_system_interface { + +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_sys_interface.QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ + } + +public: + qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle); + ~qnn_system_interface(); + bool is_valid() const { return _qnn_system_handle != nullptr; } + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + +private: + qnn_system_interface(const qnn_system_interface &) = delete; + void operator=(const qnn_system_interface &) = delete; + qnn_system_interface(qnn_system_interface &&) = delete; + void operator=(qnn_system_interface &&) = delete; + + const QnnSystemInterface_t _qnn_sys_interface = {}; + dl_handler_t _lib_handle = nullptr; + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; +}; + +class qnn_interface { + +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface.QNN_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ + } + +public: + qnn_interface(const QnnInterface_t &qnn_interface) : _qnn_interface(qnn_interface) {} + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free_platform_info, deviceFreePlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + 
DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + uint32_t get_backend_id() const { return _qnn_interface.backendId; } + +private: + qnn_interface(const qnn_interface &) = delete; + void operator=(const qnn_interface &) = delete; + qnn_interface(qnn_interface &&) = delete; + void operator=(qnn_interface &&) = delete; + + const QnnInterface_t _qnn_interface = {}; +}; + +#pragma GCC diagnostic pop + +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) : + _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {} + + ~qnn_instance() {} + + int qnn_init(const QnnSaver_Config_t **saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qni_init\n"); + + std::lock_guard lock(_init_mutex); + if (load_system() != 0) { + QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string backend_lib_path = _lib_path + _backend_name; + if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (is_load_ok != 0) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return 2; + } + } + + backend_id = _lib_path_to_backend_id[backend_lib_path]; + if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) { + QNN_LOG_WARN( + "library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu\n", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface = std::make_shared(*_loaded_backend[backend_id]); + _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); + if (nullptr == _qnn_log_handle) { + // NPU backend not work on Qualcomm SoC equipped low-end phone + QNN_LOG_WARN("why failed to initialize qnn log\n"); + return 4; + } else { + QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface->qnn_backend_create( + _qnn_log_handle, temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + Qnn_ErrorHandle_t qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + + qnn_status = QNN_SUCCESS; + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t *p_info = nullptr; + _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, + infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, + qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), + chipinfo.vtcmSize); + _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = chipinfo.socModel; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = chipinfo.arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + + const QnnDevice_Config_t *p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; + qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); + } + if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create QNN device successfully\n"); + } + + if (qnn::sdk_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (qnn::sdk_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. 
creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (qnn::sdk_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + _rpc_lib_handle = dl_load("libcdsprpc.so"); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dl_error()); + return 8; + } else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. error: %s", dl_error()); + dl_unload(_rpc_lib_handle); + return 9; + } + + if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_init(); + } + + /* TODO: not used, keep it for further usage + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + qnn_context_config.priority = QNN_PRIORITY_DEFAULT; + const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; + */ + _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 10; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + // TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t *rpc_buffer = nullptr; + const int size_in_mb = (1 << 20); + size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); + if (!rpc_buffer) { + QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + + _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); + + if (0 != init_htp_perfinfra()) { + QNN_LOG_WARN("initialize HTP performance failure"); + } + if (0 != set_rpc_polling()) { + QNN_LOG_WARN("set RPC polling failure"); + } + if (0 != set_high_performance_mode()) { + QNN_LOG_WARN("set HTP high performance mode failure"); + } + } + + QNN_LOG_DEBUG("leave qni_init\n"); 
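+        // NOTE: at this point the QNN system and backend libraries are loaded, the log,
+        // backend, device and context handles have been created, the rpcmem symbols from
+        // libcdsprpc.so are resolved, and (for the HTP backend) the rpc memory capacity has
+        // been probed and the performance infrastructure configured.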
+ + return 0; + } + + int qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_deinit(); + + if (dl_unload(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dl_error()); + } else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface->qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface->qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + _qnn_sys_interface.reset(); + + return ret_status; + } + + std::shared_ptr get_qnn_interface() { + if (!_qnn_interface) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } else { + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); + } + + QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { + QNN_LOG_INFO("HTP infra type = 
%d, which is not perf infra type", htp_infra->infraType); + } else { + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); + } + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; + + return 0; + } + + int set_rpc_polling() { + if (_qnn_htp_perfinfra) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; + memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); + rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + // use rpc polling time recommended 0-10000 us + rpc_polling_time.rpcPollingTimeConfig = 9999; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; + memset(&rpc_control_latency, 0, sizeof(rpc_control_latency)); + rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + // use rpc control latency recommended 100 us, refer hexagon sdk + rpc_control_latency.rpcControlLatencyConfig = 100; + + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &rpc_polling_time, &rpc_control_latency, + nullptr }; + Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp perf failed\n"); + } else { + QNN_LOG_INFO("set htp perf ok\n"); + } + } else { + QNN_LOG_WARN("can't set htp perf\n"); + } + + return 0; + } + + int set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + QNN_LOG_WARN("perf intra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = 1; // true to consider Latency parameter otherwise false + power_config.dcvsV3Config.sleepLatency = 40; + power_config.dcvsV3Config.setBusParams = 1; // true to consider Bus parameter otherwise false + power_config.dcvsV3Config.setCoreParams = 1; // true to consider Core parameter otherwise false + power_config.dcvsV3Config.sleepDisable = 1; // true to consider sleep/LPM modes, false to enable + power_config.dcvsV3Config.setSleepDisable = + 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter + // set Bus Clock Parameters + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters + power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &power_config, nullptr }; + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp high performance mode failed\n"); 
+ } else { + QNN_LOG_INFO("set htp high performance mode ok\n"); + } + + return 0; + } + + std::string &get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { return _rpcmem_initialized; } + + void set_rpcmem_initialized(bool initialized) { _rpcmem_initialized = initialized; } + + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + + void *alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int)allocate_bytes); + if (!buf) { + QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int)(allocate_bytes / (1 << 20))); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(qnn::align_to(alignment, reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + + return aligned_buf; + } + + void free_rpcmem(void *buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (_rpcmem_store_map.count(buf) == 0) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } + } + + int32_t rpcmem_to_fd(void *buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } + + return mem_fd; + } + + Qnn_MemHandle_t register_rpcmem(void *p_data, uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { + if (!p_data) { + QNN_LOG_WARN("invalid param\n"); + return nullptr; + } + + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + if (is_rpcmem_registered(p_data)) { + QNN_LOG_WARN("rpc memory already registered\n"); + return _qnn_rpc_buffer_to_handles[p_data]; + } + + auto mem_fd = rpcmem_to_fd(p_data); + if (mem_fd == -1) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return nullptr; + } + + QNN_LOG_INFO("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { { rank, dimensions, nullptr }, data_type, QNN_MEM_TYPE_ION, { { mem_fd } } }; + Qnn_MemHandle_t handle = nullptr; + auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), + strerror(error)); + return nullptr; + } + + _qnn_rpc_buffer_to_handles.insert({ p_data, handle }); + QNN_LOG_INFO("successfully register shared memory handler: %p\n", handle); + return handle; + } + + void unregister_rpcmem(Qnn_MemHandle_t mem_handle) { + Qnn_ErrorHandle_t error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + } + + auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(), + [mem_handle](const auto &kv) { return kv.second == mem_handle; }); + if (it == _qnn_rpc_buffer_to_handles.end()) { + QNN_LOG_WARN("failed to find shared memory handler: %p\n", mem_handle); + return; + } + + _qnn_rpc_buffer_to_handles.erase(it); + } + + bool is_rpcmem_allocated(void *buf) { return _rpcmem_store_map.count(buf) != 0; } + bool 
is_rpcmem_registered(void *buf) { return _qnn_rpc_buffer_to_handles.count(buf) != 0U; } + + const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } + +private: + int load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + + auto system_lib_handle = dl_load(system_lib_path); + if (!system_lib_handle) { + QNN_LOG_WARN("can not load QNN library %s, error: %s\n", system_lib_path.c_str(), dl_error()); + return 1; + } + + auto *get_providers = dl_sym_typed( + system_lib_handle, "QnnSystemInterface_getProviders"); + if (!get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (!provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + + auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); + if (!qnn_sys_interface->is_valid()) { + QNN_LOG_WARN("failed to create QNN system interface\n"); + return 7; + } + + _qnn_sys_interface = qnn_sys_interface; + return 0; + } + + int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + auto lib_handle = dl_load(lib_path.c_str()); + if (!lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dl_error()); + return 1; + } + + auto get_providers = + qnn::dl_sym_typed(lib_handle, "QnnInterface_getProviders"); + if (!get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dl_error()); + return 2; + } + + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (!provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == 
provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn interface\n"); + } + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dl_unload(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dl_error()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + return 0; + } + + int unload_backend() { + int dlclose_error = 0; + for (auto &it : _loaded_lib_handle) { + dlclose_error = dl_unload(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dl_error()); + } + } + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; + } + +private: + static constexpr const int _required_num_providers = 1; + + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage + BackendIdType _backend_id; + + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + +#ifdef NDEBUG + qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_off; +#else + qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; +#endif + + std::shared_ptr _qnn_sys_interface; + std::shared_ptr _qnn_interface; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + + std::unordered_map _qnn_rpc_buffer_to_handles; + + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; + std::unordered_map _loaded_backend; + + dl_handler_t _rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{ false }; + qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; + qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + qnn::pfn_rpc_mem_init _pfn_rpc_mem_init; + qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + size_t _rpcmem_capacity = 512; + + std::string _graph_name; + + qnn::qcom_socinfo _soc_info = {}; +}; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp new file mode 100644 index 0000000000000..8fce790defb61 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -0,0 +1,61 @@ + +#pragma once + +#include "QnnCommon.h" +#include "QnnInterface.h" +#include "QnnTypes.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" + 
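+// Shared definitions used across the QNN backend: profiling levels, known Qualcomm SoC and
+// HTP architecture ids, rpcmem function-pointer types, and the DISABLE_COPY/DISABLE_MOVE helpers.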
+namespace qnn { +// ================================================================================================= +// +// helper data type / data structure / macros / functions of +// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ================================================================================================= +enum sdk_profile_level { profile_off = 0, profile_basic = 1, profile_detail = 2 }; + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, +}; + +enum qcom_chipset { + UNKNOWN_SM = 0, + SM8450 = 36, // v69 + SM8475 = 42, // v69 + SM8550 = 43, // v73 + SM8650 = 57, // v75 +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; +}; + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); +} // namespace qnn + +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete + +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp new file mode 100644 index 0000000000000..0c724e2871d45 --- /dev/null +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -0,0 +1,201 @@ + +#pragma once + +#include +#include +#include +#include + +#include "ggml-qnn.h" + +#include "buffer.hpp" +#include "logger.hpp" +#include "qnn-lib.hpp" +#include "utils.hpp" + +namespace qnn { + +class ggml_qnn_tensor { +public: + explicit ggml_qnn_tensor(const std::string &name, QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance) : + _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { + QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); + QNN_TENSOR_SET_TYPE(_qnn_tensor, QNN_TENSOR_TYPE_NATIVE); + QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); + QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device); + } + + ~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); } + + bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input, int prev_max_rank) { + if (_tensor) { + if (_tensor != tensor) { + QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(), + ggml_get_name(_tensor)); + return false; + } + QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(), + ggml_get_name(_tensor)); + return true; + } + + update_params_from_ggml_tensor(tensor, prev_max_rank); + Qnn_TensorType_t new_tensor_type = is_input ? 
QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; + QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); + QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); + + if (!QNN_TENSOR_GET_ID(_qnn_tensor)) { + Qnn_Tensor_t qnn_tensor = _qnn_tensor; + auto qnn_interface = _qnn_instance->get_qnn_interface(); + auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + return false; + } + + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); + QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), + QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); + } + + if (should_use_mem_handle()) { + if (!_qnn_rpc_buffer) { + auto qnn_rpc_buffer = std::make_unique( + _qnn_instance, ggml_nbytes(tensor), QNN_TENSOR_GET_RANK(_qnn_tensor), + QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); + if (!qnn_rpc_buffer->is_valid()) { + QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); + return false; + } + + _qnn_rpc_buffer = std::move(qnn_rpc_buffer); + } + + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); + QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _qnn_rpc_buffer->get_mem_handle()); + QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); + } else { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, + (int)client_buf.dataSize); + } + + _tensor = tensor; + + if (!write_to_qnn_tensor()) { + QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str()); + return false; + } + + QNN_LOG_DEBUG("bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); + return true; + } + + bool unbind_ggml_tensor() { + if (!_graph_handle) { + QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str()); + return false; + } + + if (!_tensor) { + QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str()); + return true; + } + + if (!read_from_qnn_tensor()) { + QNN_LOG_WARN("read from qnn tensor failed, tensor %s", _tensor_name.c_str()); + return false; + } + + if (!should_use_mem_handle()) { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); + } + + QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(_tensor)); + _tensor = nullptr; + return true; + } + + const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + +private: + bool write_to_qnn_tensor() { + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { + QNN_LOG_DEBUG("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + return true; + } + + if (should_use_mem_handle()) { + if (_qnn_rpc_buffer) { + memcpy(_qnn_rpc_buffer->get_buffer(), _tensor->data, ggml_nbytes(_tensor)); + } else { + QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); + 
return false; + } + } + + // For CPU and GPU, the data is already in the tensor. + QNN_LOG_DEBUG("write tensor %s to qnn", _tensor_name.c_str()); + return true; + } + + bool read_from_qnn_tensor() { + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { + QNN_LOG_DEBUG("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + return true; + } + + if (should_use_mem_handle()) { + if (_qnn_rpc_buffer) { + memcpy(_tensor->data, _qnn_rpc_buffer->get_buffer(), ggml_nbytes(_tensor)); + } else { + QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + return false; + } + } + + // For CPU and GPU, the data is already in the tensor. + QNN_LOG_DEBUG("read tensor %s from qnn", _tensor_name.c_str()); + return true; + } + + void update_params_from_ggml_tensor(ggml_tensor *tensor, int prev_max_rank) { + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); + // TODO: set the quantizeParams base on the tensor type + + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)std::max(prev_max_rank, ggml_n_dims(tensor))); + + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + } + + bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + + std::string _tensor_name; + const ggml_tensor *_tensor; + QNNBackend _device; + std::shared_ptr _qnn_instance; + Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); + std::array _dimensions = {}; + Qnn_GraphHandle_t _graph_handle = nullptr; + std::unique_ptr _qnn_rpc_buffer; + + DISABLE_COPY(ggml_qnn_tensor); + DISABLE_MOVE(ggml_qnn_tensor); +}; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp new file mode 100644 index 0000000000000..e44d6dbccee42 --- /dev/null +++ b/ggml/src/ggml-qnn/utils.cpp @@ -0,0 +1,208 @@ + +#include "utils.hpp" + +#include + +#include "ggml-qnn.h" + +#include "qnn-types.hpp" + +namespace qnn { + +// TODO: mapping more ggml data type to QNN data type +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) { + switch (ggml_type) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; + } + return QNN_DATATYPE_UNDEFINED; +} + +Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor) { + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_NATIVE; + + if (ggml_tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + + return qnn_tensor_type; +} + +uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; +} + +const char *get_backend_name(int n_backend_type) { + switch (n_backend_type) { + case QNN_BACKEND_CPU: 
+ return "QNN-CPU"; + case QNN_BACKEND_GPU: + return "QNN-GPU"; + case QNN_BACKEND_NPU: + return "QNN-NPU"; + case QNN_BACKEND_GGML: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; + } +} + +const char *get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + default: + return "unknown"; + } +} + +const char *get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + default: + return "unknown"; + } +} + +intptr_t align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 + ? offset + : offset + (static_cast(alignment) - (offset % static_cast(alignment))); +} + +uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = qnn_get_ggml_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + +void *align_alloc(size_t alignment, size_t size) { + size_t size_aligned = size; + if ((size_aligned % alignment) != 0) { + size_aligned += (alignment - (size_aligned % alignment)); + } + + void *data = std::aligned_alloc(alignment, size_aligned); + if (!data) { + QNN_LOG_WARN("aligned_alloc failed\n"); + return nullptr; + } + + return data; +} + +void align_free(void *ptr) { std::free(ptr); } + +// ================================================================================================= +// +// QNN backend internal helper functions +// +// ================================================================================================= +// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +const char *opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + return nullptr; +} + +const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { + // A complete list of error codes can be found at here: + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/api_error_codes.html + switch (error) { + case QNN_SUCCESS: + return "QNN_SUCCESS"; + case QNN_COMMON_ERROR_GENERAL: + return "QNN_COMMON_ERROR_GENERAL"; + + // QnnGraph_Error_t + case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: + return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; + case QNN_GRAPH_ERROR_MEM_ALLOC: + return "QNN_GRAPH_ERROR_MEM_ALLOC"; + case QNN_GRAPH_ERROR_INVALID_ARGUMENT: + return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; + case QNN_GRAPH_ERROR_INVALID_HANDLE: + return "QNN_GRAPH_ERROR_INVALID_HANDLE"; + case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: + return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; + case QNN_GRAPH_ERROR_INVALID_NAME: + return "QNN_GRAPH_ERROR_INVALID_NAME"; + case QNN_GRAPH_ERROR_INVALID_TENSOR: + return "QNN_GRAPH_ERROR_INVALID_TENSOR"; + case QNN_GRAPH_ERROR_INVALID_OP_CONFIG: + return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; + case QNN_GRAPH_ERROR_SET_PROFILE: + return "QNN_GRAPH_ERROR_SET_PROFILE"; + case QNN_GRAPH_ERROR_UNCONNECTED_NODE: + return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; + case QNN_GRAPH_ERROR_CREATE_FAILED: + return 
"QNN_GRAPH_ERROR_CREATE_FAILED"; + + // QnnOpPackage_Error_t + case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE: + return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFO: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFO"; + case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE: + return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: + return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; + default: + return nullptr; + } +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp new file mode 100644 index 0000000000000..b7f29bdaa5663 --- /dev/null +++ b/ggml/src/ggml-qnn/utils.hpp @@ -0,0 +1,254 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include "ggml.h" + +#include "QnnTypes.h" +#include "logger.hpp" + +#define QNN_TENSOR_VER(x) ((x).v1) + +namespace qnn { + +uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); +const char *get_backend_name(int n_backend_type); +const char *get_chipset_desc(uint32_t chipset_id); +const char *get_htparch_desc(size_t htp_arch); +intptr_t align_to(size_t alignment, intptr_t offset); +uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); + +void *align_alloc(size_t alignment, size_t size); +void align_free(void *ptr); + +const char *opname_from_ggmlop(enum ggml_op ggmlop); + +const char *get_qnn_error_string(Qnn_ErrorHandle_t error); + +constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_1; + +inline Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { + Qnn_Tensor_t tensor; + tensor.version = version; + if (version == QNN_TENSOR_VERSION_1) { + tensor.v1 = QNN_TENSOR_V1_INIT; + } else if (version == QNN_TENSOR_VERSION_2) { + tensor.v2 = QNN_TENSOR_V2_INIT; + } + return tensor; +} + +inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t &tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).id; + } + + return 0u; +} + +inline const char *get_qnn_tensorname(const Qnn_Tensor_t &tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).name; + } + return nullptr; +} + +inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t &tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + +inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t &tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + +inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t &tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + +inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t &tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + +inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t &tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return 
QNN_TENSOR_VER(tensor).rank; + } + return 0u; +} + +inline uint32_t *get_qnn_tensor_dimensions(const Qnn_Tensor_t &tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dimensions; + } + return nullptr; +} + +inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + +inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).memHandle; + } + return nullptr; +} + +inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).id = id; + } +} + +inline void set_qnn_tensor_name(Qnn_Tensor_t &tensor, const char *name) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).name = name; + } +} + +inline void set_qnn_tensor_type(Qnn_Tensor_t &tensor, Qnn_TensorType_t type) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).type = type; + } +} + +inline void set_qnn_tensor_dataformat(Qnn_Tensor_t &tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dataFormat = format; + } +} + +inline void set_qnn_tensor_datatype(Qnn_Tensor_t &tensor, Qnn_DataType_t dataType) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dataType = dataType; + } +} + +inline void set_qnn_tensor_quantparams(Qnn_Tensor_t &tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).quantizeParams = params; + } +} + +inline void set_qnn_tensor_rank(Qnn_Tensor_t &tensor, uint32_t rank) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).rank = rank; + } +} + +inline void set_qnn_tensor_dimensions(Qnn_Tensor_t &tensor, uint32_t *dims) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dimensions = dims; + } +} + +inline void set_qnn_tensor_memtype(Qnn_Tensor_t &tensor, Qnn_TensorMemType_t mem_type) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).memType = mem_type; + } +} + +inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t &tensor, Qnn_ClientBuffer_t client_buf) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).clientBuf = client_buf; + } +} + +inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handle) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).memHandle = handle; + } +} + +inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynamicDimensions) { + if (tensor.version == QNN_TENSOR_VERSION_2) { + tensor.v2.isDynamicDimensions = isDynamicDimensions; + } +} + +Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type); +Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor); + +#if ENABLE_QNNBACKEND_PERF +class qnn_perf { +public: + qnn_perf(const std::string &perf_name) : _perf_name(std::move(perf_name)) {}; + ~qnn_perf() { info(); } + qnn_perf() = delete; + qnn_perf(const qnn_perf &) = delete; + qnn_perf &operator=(const qnn_perf &) = delete; + + void start() { _begin_time = ggml_time_us(); } + + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + QNN_LOG_INFO("duration of %s : %lld 
microseconds\n", _perf_name.c_str(), _duration); + } + +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; +}; +#else +class qnn_perf { +public: + qnn_perf(const std::string &) {} + ~qnn_perf() { info(); } + qnn_perf() = delete; + qnn_perf(const qnn_perf &) = delete; + qnn_perf &operator=(const qnn_perf &) = delete; + + void start() {} + void info() {} +}; +#endif + +} // namespace qnn + +#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) qnn::get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) +#define QNN_TENSOR_SET_DYN_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dyn_dimensions(tensor, value) diff --git a/src/llama.cpp b/src/llama.cpp index 40db035171127..43fd2f759dff4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -22,6 +22,8 @@ # include "ggml-kompute.h" #elif defined(GGML_USE_CANN) # include "ggml-cann.h" +#elif defined(GGML_USE_QNN) +# include "ggml-qnn.h" #endif #ifdef GGML_USE_BLAS @@ -3332,6 +3334,8 @@ static size_t llama_get_device_count(const llama_model & model) { count = ggml_backend_vk_get_device_count(); #elif defined(GGML_USE_CANN) return ggml_backend_cann_get_device_count(); +#elif defined(GGML_USE_QNN) + count = ggml_backend_qnn_get_device_count(); #endif #if defined(GGML_USE_RPC) count += model.rpc_servers.size(); @@ -3370,6 +3374,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ } #elif defined(GGML_USE_CANN) buft = ggml_backend_cann_buffer_type(local_gpu); +#elif defined(GGML_USE_QNN) + buft = ggml_backend_qnn_buffer_type(gpu); #endif if (buft == nullptr) { @@ -17987,6 +17993,8 @@ size_t llama_max_devices(void) { return GGML_VK_MAX_DEVICES; #elif defined(GGML_USE_CANN) return GGML_CANN_MAX_DEVICES; +#elif defined(GGML_USE_QNN) + return GGML_QNN_MAX_DEVICES; #else return 1; #endif @@ -18002,7 +18010,7 @@ bool llama_supports_mlock(void) { bool 
llama_supports_gpu_offload(void) { #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) || defined(GGML_USE_QNN) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. return true; #else @@ -18369,6 +18377,16 @@ struct llama_context * llama_new_context_with_model( ctx->backends.push_back(backend); } } +#elif defined(GGML_USE_QNN) + if (model->n_gpu_layers > 0) { + ggml_backend_t backend = ggml_backend_qnn_init(model->main_gpu, nullptr); + if (nullptr == backend) { + LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } #endif #ifdef GGML_USE_BLAS
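For reference, a minimal usage sketch of the new public API outside of llama.cpp (not part of the patch). The device index 2 (NPU), the explicit "/data/local/tmp/" search path and the main() wrapper are illustrative assumptions; llama.cpp itself passes model->main_gpu and nullptr as shown in the hunk above.

// Hedged sketch: enumerate QNN devices, bring one up, then tear it down.
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-qnn.h"

int main() {
    int n_dev = ggml_backend_qnn_get_device_count();
    printf("QNN devices available: %d\n", n_dev);

    // 0 = CPU, 1 = GPU, 2 = NPU; the second argument is the directory on the device
    // that holds the QNN backend dynamic libraries.
    ggml_backend_t backend = ggml_backend_qnn_init(2, "/data/local/tmp/");
    if (backend == nullptr) {
        printf("failed to initialize QNN backend\n");
        return 1;
    }

    printf("initialized backend: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}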