|
14 | 14 | * section-6 QNN helper function |
15 | 15 | * section-7 ggml-qnn backend helper function / class |
16 | 16 | * section-8 implementation of ggml-qnn backend according to ggml's backend subsystem |
17 | | - * section-9 implementation of offload ggml op to QNN backend |
18 | | - * section-10 illustrate why the second approach is actual an fake at the moment |
 | 17 | + * section-9 implementation of the general approach (the first tech approach) |
 | 18 | + * section-10 implementation of the second tech approach: mapping the entire ggml cgraph to a single QNN graph |
19 | 19 | * |
20 | 20 | * currently provide following ggml op' QNN backend implementation: |
21 | | - * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise |
22 | | - * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise |
23 | | - * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex ggml ops accordingly |
| 21 | + * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV: |
| 22 | + * this is a simple skeleton, can expand other ggml ops according to expertise |
| 23 | + * - GGML_OP_LOG/GGML_OP_SQRT: |
| 24 | + * this is a simple skeleton, can expand other ggml ops according to expertise |
| 25 | + * - GGML_OP_MUL_MAT: |
| 26 | + * this is a complicated skeleton, can expand other complex ggml ops accordingly |
24 | 27 | * |
25 | 28 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
26 | 29 | * of this software and associated documentation files (the "Software"), to |
|
80 | 83 | #include <unordered_set> |
81 | 84 | #include <utility> |
82 | 85 | #include <future> |
83 | | -#include <chrono> |
84 | 86 | #if (defined __ANDROID__) || (defined ANDROID) |
85 | 87 | #include "android/log.h" |
86 | 88 | #endif |
@@ -186,7 +188,6 @@ static void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst |
186 | 188 |
|
187 | 189 | #define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) |
188 | 190 | #define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1) |
189 | | -#define TENSOR_DUMP(tensor) ggmlqnn_tensor_dump(tensor, #tensor) |
190 | 191 | #define GQCGT ggmlqnn_create_general_tensor |
191 | 192 | #define QNN_VER_PTR(x) (&((x).v1)) |
192 | 193 | #define RPCMEM_DEFAULT_FLAGS 1 |
@@ -260,7 +261,7 @@ using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>; |
260 | 261 | using qnn_singlenode_res_t = std::tuple<Qnn_GraphHandle_t, qnn_ptensors_t>; |
261 | 262 |
|
262 | 263 | //QNN resource management for the second technical approach(mapping the entire cgraph to a single QNN graph) |
263 | | -using qnn_tensors_t = std::vector< Qnn_Tensor_t * >; |
| 264 | +using qnn_tensors_t = std::vector< Qnn_Tensor_t >; |
264 | 265 | using qnn_cgraph_node_t = std::tuple<std::string, Qnn_GraphHandle_t>; |
265 | 266 | using qnn_cgraph_nodes_t = std::vector<qnn_cgraph_node_t>; |
266 | 267 | using qnn_multinode_res_t = std::tuple<Qnn_GraphHandle_t, qnn_cgraph_nodes_t, qnn_ptensors_t, qnn_tensors_t, qnn_tensors_t>; |
@@ -325,11 +326,6 @@ struct ggml_backend_qnn_context { |
325 | 326 | size_t work_size; |
326 | 327 | size_t desired_size; |
327 | 328 | int n_threads; |
328 | | - |
329 | | -#if 1//ndef NDEBUG |
330 | | - std::atomic_uint32_t supported_op_count = 0; |
331 | | - std::atomic_uint32_t unsupported_op_count = 0; |
332 | | -#endif |
333 | 329 | }; |
334 | 330 |
|
335 | 331 | struct qnn_op_caps { |
@@ -370,8 +366,6 @@ static struct qnn_parameter g_qnn_params = { |
370 | 366 | #if defined(__ANDROID__) |
371 | 367 | //Android command line program |
372 | 368 | .qnn_runtimelib_path = "/data/local/tmp/", |
373 | | -//Android KanTV standard APP |
374 | | -// .qnn_runtimelib_path = "/data/data/com.cdeos.kantv/qnnlib/", |
375 | 369 | #elif defined(__linux__) |
376 | 370 | .qnn_runtimelib_path = "/tmp/", |
377 | 371 | #elif defined(_WIN32) |
@@ -1066,46 +1060,6 @@ static void ggmlqnn_load_cfg() { |
1066 | 1060 | } |
1067 | 1061 | } |
1068 | 1062 |
|
1069 | | -static void ggmlqnn_tensor_dump_elements(const ggml_tensor * tensor) { |
1070 | | - float value = 0; |
1071 | | - std::ostringstream tmposs; |
1072 | | - if (tensor->type == GGML_TYPE_F32) { |
1073 | | - for (int h = 0; h < tensor->ne[3]; h++) { |
1074 | | - for (int i = 0; i < tensor->ne[2]; i++) { |
1075 | | - for (int j = 0; j < tensor->ne[1]; j++) { |
1076 | | - for (int k = 0; k < tensor->ne[0]; k++) { |
1077 | | - value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + |
1078 | | - j * tensor->ne[0] + k]; |
1079 | | - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value |
1080 | | - << " "; |
1081 | | - } |
1082 | | - if (strlen(tmposs.str().c_str()) <= (4096 - 96)) { |
1083 | | - GGMLQNN_LOG_DEBUG("%s\n", tmposs.str().c_str()); |
1084 | | - } |
1085 | | - tmposs.clear(); |
1086 | | - tmposs.str(""); |
1087 | | - } |
1088 | | - } |
1089 | | - } |
1090 | | - } |
1091 | | - |
1092 | | - GGMLQNN_LOG_DEBUG("\n"); |
1093 | | -} |
1094 | | - |
1095 | | - |
1096 | | -static void ggmlqnn_tensor_dump(const ggml_tensor * tensor, const char * name) { |
1097 | | - GGMLQNN_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name); |
1098 | | - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n", |
1099 | | - name, |
1100 | | - tensor->type, ggml_type_name(tensor->type), |
1101 | | - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], |
1102 | | - tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]); |
1103 | | - ggmlqnn_tensor_dump_elements(tensor); |
1104 | | - |
1105 | | - GGMLQNN_LOG_DEBUG("\n"); |
1106 | | -} |
1107 | | - |
1108 | | - |
1109 | 1063 | // ================================================================================================= |
1110 | 1064 | // section-6: QNN helper function |
1111 | 1065 | // ================================================================================================= |
@@ -2698,7 +2652,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { |
2698 | 2652 | std::filesystem::path full_path(std::string(g_qnn_params.qnn_runtimelib_path) + "libcdsprpc.so"); |
2699 | 2653 | full_path /= std::filesystem::path("libcdsprpc.so").filename(); |
2700 | 2654 | _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL); |
2701 | | - if (!_rpc_lib_handle) { |
| 2655 | + if (nullptr == _rpc_lib_handle) { |
2702 | 2656 | GGMLQNN_LOG_WARN("failed to load %s\n", full_path.c_str()); |
2703 | 2657 | _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); |
2704 | 2658 | } |
@@ -5083,9 +5037,56 @@ void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { |
5083 | 5037 | // ================================================================================================= |
5084 | 5038 | // details: https://github.com/ggml-org/llama.cpp/pull/12326#issuecomment-2712838649 |
5085 | 5039 | // ref: https://github.com/kantv-ai/kantv/blob/kantv-poc-with-qnn/core/ggml/jni/Inception_v3.cpp#L20634 |
5086 | | -static enum ggml_status |
5087 | | -ggml_backend_qnn_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph *cgraph) { |
5088 | | - enum ggml_status ggml_result = GGML_STATUS_SUCCESS; |
| 5040 | +static enum ggml_status ggml_backend_qnn_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph * cgraph) { |
| 5041 | + enum ggml_status ggml_result = GGML_STATUS_SUCCESS; |
| 5042 | + Qnn_ErrorHandle_t qnn_error = QNN_SUCCESS; |
| 5043 | + qnn_perf op_perf = qnn_perf("ggml_backend_qnn_graph_compute_special"); |
| 5044 | + qnn_instance * instance = nullptr; |
| 5045 | + Qnn_GraphHandle_t graph_handle = nullptr; |
| 5046 | + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; |
| 5047 | + instance = ctx->instance; |
| 5048 | + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; |
| 5049 | + op_perf.start(); |
| 5050 | + |
 | 5051 | +    //at this point we have the entire ggml cgraph, or a ggml cgraph which contains multiple nodes |
| 5052 | + GGMLQNN_LOG_DEBUG("qnn device %d(%s)", ctx->device, ggml_backend_qnn_get_devname(ctx->device)); |
| 5053 | + GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); |
| 5054 | + int num_nodes = std::min(5, cgraph->n_nodes); |
| 5055 | + //for (int i = 0; i < cgraph->n_nodes; i++) { |
| 5056 | + for (int i = 0; i < num_nodes; i++) { |
| 5057 | + ggml_tensor * node = cgraph->nodes[i]; |
| 5058 | + GGMLQNN_LOG_DEBUG("%s: op %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); |
| 5059 | + } |
| 5060 | + |
| 5061 | + //now we'll offload the ggml cgraph to a single QNN graph |
| 5062 | + std::string graph_name; |
| 5063 | + ggmlqnn_get_graphkey_from_cgraph(cgraph, graph_name); |
| 5064 | + if (graph_name == "") |
| 5065 | + return GGML_STATUS_SUCCESS; |
| 5066 | + if (ctx->qnn_multinode_graph_map.find(graph_name) != ctx->qnn_multinode_graph_map.end()) { |
| 5067 | + GGMLQNN_LOG_DEBUG("graph name %s already create", graph_name.c_str()); |
| 5068 | + //retrieve computational resource from cached QNN graph |
| 5069 | + qnn_multinode_res_t &graph_res = ctx->qnn_multinode_graph_map[graph_name]; |
| 5070 | + graph_handle = std::get<0>(graph_res); |
| 5071 | + } else { |
| 5072 | + //create QNN graph |
| 5073 | + GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); |
| 5074 | + qnn_error = instance->init_qnn_graph(graph_name, static_cast<QNNBackend>(ctx->device), 8, 4); |
| 5075 | + if (QNN_SUCCESS != qnn_error) { |
| 5076 | + GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d(%s)\n", graph_name.c_str(), qnn_error, |
| 5077 | + ggmlqnn_get_qnnerror_string(qnn_error)); |
| 5078 | + return ggml_result; |
| 5079 | + } |
| 5080 | + graph_handle = instance->get_qnn_graph_handle(); |
| 5081 | + //TBD: compose a single opcfg QNN graph |
| 5082 | + |
| 5083 | + //finalize QNN graph |
| 5084 | + CHECK_QNN_API(qnn_error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); |
| 5085 | + |
| 5086 | + //TBD: cache QNN graph |
| 5087 | + } |
| 5088 | + //TBD: exec QNN graph |
| 5089 | + GGMLQNN_LOG_DEBUG("the second inference approach \"mapping cgraph to QNN graph\" will be seen in the future"); |
5089 | 5090 |
|
5090 | 5091 | return ggml_result; |
5091 | 5092 | } |
0 commit comments