Commit b32ba64

Author: zhouwg
Commit message: ggml-hexagon: sync with self-build branch
1 parent 657ccee, commit b32ba64

5 files changed: +54 / -44 lines


CMakeLists.txt
Lines changed: 17 additions & 7 deletions

@@ -8,17 +8,27 @@ set(CMAKE_WARN_UNUSED_CLI YES)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+    set(CMAKE_VERBOSE_MAKEFILE ON)
     if(DEFINED HTP_ARCH_VERSION)
         if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79")
-            #works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend
-            set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -ffp-model=fast -fno-finite-math-only")
-            message("OPT_FLAG:${OPT_FLAG}")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
-            set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
-            set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
+            #works fine on Snapdragon 8Gen3 & 8Elite
+            set(OPT_FLAG " -O3 -march=armv8.7-a+dotprod+fp16+i8mm -mcpu=cortex-x1 -mtune=cortex-x1 -ffp-model=fast -fno-finite-math-only")
+        else()
+            #should be works fine with mainstream mobile SoC
+            set(OPT_FLAG " -O3 -march=armv8.2-a+dotprod+fp16 -ffp-model=fast -fno-finite-math-only")
         endif()
+    else()
+        #should be works fine with mainstream mobile SoC
+        set(OPT_FLAG " -O3 -march=armv8.2-a+dotprod+fp16 -ffp-model=fast -fno-finite-math-only")
     endif()
+
+    message("OPT_FLAG:${OPT_FLAG}")
+    #ensure the same toolchain optimization for ggml-opencl, ggml-vulkan, ggml-hexagon on Android phone
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${DEBUG_FLAG} ${OPT_FLAG}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DEBUG_FLAG} ${OPT_FLAG}")
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} ${DEBUG_FLAG} ${OPT_FLAG}")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${DEBUG_FLAG} ${OPT_FLAG}")
+
 endif()
 
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
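A hedged sketch of how these new top-level flags might be driven from the command line when cross-compiling for an Android phone. The NDK toolchain path, API level, build directory and the GGML_HEXAGON option name are placeholders assumed for illustration, not values taken from this commit:

    # configure an arm64 Android build; HTP_ARCH_VERSION steers the -march selection above
    # (v75/v79 -> armv8.7-a+dotprod+fp16+i8mm, anything else -> armv8.2-a+dotprod+fp16)
    # ANDROID_NDK, the API level and GGML_HEXAGON=ON are assumed placeholders
    cmake -S . -B build-android \
        -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
        -DANDROID_ABI=arm64-v8a \
        -DANDROID_PLATFORM=android-34 \
        -DGGML_HEXAGON=ON \
        -DHTP_ARCH_VERSION=v79
    cmake --build build-android -j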

ggml/src/ggml-hexagon/CMakeLists.txt
Lines changed: 0 additions & 5 deletions

@@ -49,11 +49,6 @@ if (${CHECK_HTP_ARCH} STREQUAL "WRONG")
 endif()
 
 #check optimization flags
-set(OPT_FLAG " ")
-if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79")
-    #works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend
-    set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only")
-endif()
 message("OPT_FLAG:${OPT_FLAG}")
 
 if(CMAKE_SYSTEM_NAME STREQUAL "Android")
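With the duplicated per-backend flag logic removed, the optimization flags now come only from the top-level CMakeLists.txt, and since CMAKE_EXPORT_COMPILE_COMMANDS is ON there, one quick way to confirm that ggml-opencl, ggml-vulkan and ggml-hexagon translation units really receive the same OPT_FLAG is to inspect the generated compile database; the build directory name below is only an assumption:

    # list the -march levels actually used across the build
    grep -o '\-march=[^" ]*' build-android/compile_commands.json | sort | uniq -c
    # count how many compile commands carry the shared fast-math setting
    grep -c 'ffp-model=fast' build-android/compile_commands.json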

ggml/src/ggml-hexagon/ggml-hexagon.cpp
Lines changed: 22 additions & 13 deletions

@@ -24,6 +24,14 @@
  * - GGML_OP_ADD & GGML_OP_MUL_MAT:
  *   this is a hwaccel skeleton, can expand other ggml ops accordingly
  *
+ * there is a key point in this PR:
+ * when hwaccel_approach is HWACCEL_QNN, there are 4 backends:
+ *     HEXAGON_BACKEND_QNNCPU, HEXAGON_BACKEND_QNNGPU, HEXAGON_BACKEND_QNNNPU, HEXAGON_BACKEND_GGML(the default ggml backend);
+ * when hwaccel_approach is HWACCEL_CDSP, there are 2 backends(which is exactly similar to ggml-opencl or ggml-vulkan):
+ *     HEXAGON_BACKEND_CDSP, HEXAGON_BACKEND_GGML.
+ *
+ * the reason for this is to facilitate the performance comparison between the cDSP approach and the QNN approach.
+ * accordingly, this PR not only support QNN-based approach but also support the cDSP based approach.
  */
 #include <stdio.h>
 #include <stdlib.h>

@@ -80,7 +88,6 @@
 #include "rpcmem.h"
 #include "remote.h"
 #include "os_defines.h"
-#include "domain.h"
 #include "AEEStdErr.h"
 #include "HAP_power.h"
 #include "HAP_farf.h"

@@ -377,7 +384,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = {
 #elif defined(_WIN32)
         .qnn_runtimelib_path = "C:\\",
 #endif
-        .ggml_hexagon_version = {"1.13"},
+        .ggml_hexagon_version = {"1.14"},
         .ggml_dsp_version = {"0.63"},
 };

@@ -888,29 +895,29 @@ static void ggmlhexagon_print_tensors_info(const char * func_name, const ggml_ba
     }
 
     if (nullptr != func_name && nullptr != ctx) {
-        GGMLHEXAGON_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name);
+        GGMLHEXAGON_LOG_VERBOSE("call %s in dev %s\n", func_name, ctx->name);
     }
     if (nullptr != src0) {
-        GGMLHEXAGON_LOG_DEBUG(
+        GGMLHEXAGON_LOG_VERBOSE(
                 "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)",
                 src0->name,
                 src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
                 src0->ne[3],
                 src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
     }
     if (nullptr != src1) {
-        GGMLHEXAGON_LOG_DEBUG(
+        GGMLHEXAGON_LOG_VERBOSE(
                 "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)",
                 src1->name,
                 src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
                 src1->ne[3],
                 src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
     }
-    GGMLHEXAGON_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)",
+    GGMLHEXAGON_LOG_VERBOSE("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)",
                 dst->name,
                 dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
                 dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
-    GGMLHEXAGON_LOG_DEBUG("\n");
+    GGMLHEXAGON_LOG_VERBOSE("\n");
 }
 
 static void ggmlhexagon_dump_op_info(const struct ggml_tensor * tensor) {

@@ -921,7 +928,7 @@ static void ggmlhexagon_dump_op_info(const struct ggml_tensor * tensor) {
     const struct ggml_tensor * src0 = tensor->src[0];
     struct ggml_tensor * src1 = tensor->src[1];
     struct ggml_tensor * dst = const_cast<ggml_tensor *>(tensor);
-    GGMLHEXAGON_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type));
+    GGMLHEXAGON_LOG_VERBOSE("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type));
     ggmlhexagon_print_tensors_info(nullptr, nullptr, src0, src1, dst);
 }

@@ -939,7 +946,7 @@ static void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) {
                        << " ";
             }
             if (strlen(tmposs.str().c_str()) <= (GGMLHEXAGON_LOGBUF_LEN - 96)) {
-                GGMLHEXAGON_LOG_DEBUG("%s\n", tmposs.str().c_str());
+                GGMLHEXAGON_LOG_VERBOSE("%s\n", tmposs.str().c_str());
             }
             tmposs.clear();
             tmposs.str("");

@@ -948,7 +955,7 @@
         }
     }
 
-    GGMLHEXAGON_LOG_DEBUG("\n");
+    GGMLHEXAGON_LOG_VERBOSE("\n");
 }
 
 static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, const char * name) {

@@ -3375,7 +3382,7 @@ static void ggmlqnn_sdk_logcallback(const char * fmt,
     {
         std::lock_guard<std::mutex> lock(log_mutex);
         memset(s_ggmlqnn_sdk_logbuf, 0, GGMLHEXAGON_LOGBUF_LEN);
-        vsnprintf(reinterpret_cast<char *const>(s_ggmlqnn_sdk_logbuf), GGMLHEXAGON_LOGBUF_LEN, fmt, argp);
+        vsnprintf(reinterpret_cast<char *>(s_ggmlqnn_sdk_logbuf), GGMLHEXAGON_LOGBUF_LEN, fmt, argp);
         GGMLHEXAGON_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggmlqnn_sdk_logbuf);
     }
 #if !GGMLHEXAGON_DEBUG

@@ -4599,7 +4606,7 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_hexagon_context * ctx, ggml_ten
         //retrieve computational resource from cached QNN graph
         qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name];
         graph_handle = std::get<0>(graph_item);
-        qnn_ptensors_t &tensors = std::get<1>(graph_item);
+        qnn_ptensors_t & tensors = std::get<1>(graph_item);
         p_tensor0 = tensors[0];
         p_tensor1 = tensors[1];
         p_tensor2 = tensors[2];

@@ -5849,6 +5856,7 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const
     switch (op_tensor->op) {
         case GGML_OP_ADD:
         {
+            ggmlhexagon_dump_op_info(op_tensor);
            //TODO:workaround approach to fix HWACCEL_CDSP can't works in ASR inference and LLM inference
            //     with some LLM models in a standard Android APP
            if (ne00 < 1024) {

@@ -5927,8 +5935,9 @@ static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const
         return false;
     }
 
-    if (ne00 < 32)
+    if (ne00 < 32) {
         return false;
+    }
 
     return ggmlhexagon_same_types(ctx, op_tensor);
 }
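The choice between the four-backend HWACCEL_QNN mode and the two-backend HWACCEL_CDSP mode described in the new header comment is made through the hwaccel_approach item in scripts/ggml-hexagon.cfg rather than at compile time. A rough sketch of switching it from the shell; the assumption that the value 2 corresponds to HWACCEL_CDSP is taken from the cfg defaults shown further below, and the on-device location of the cfg is also an assumption:

    # select the cDSP path (hwaccel_approach = 2 is the value shipped in the cfg,
    # assumed here to correspond to HWACCEL_CDSP)
    sed -i 's/^hwaccel_approach.*/hwaccel_approach = 2/' scripts/ggml-hexagon.cfg
    # push the updated cfg to the phone; the target path is assumed, not taken from this commit
    adb push scripts/ggml-hexagon.cfg /sdcard/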

scripts/build-run-android.sh
Lines changed: 13 additions & 16 deletions

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # build llama.cpp + ggml-hexagon backend on Linux for Android phone equipped with Qualcomm Snapdragon mobile SoC
 # this script will setup local dev envs automatically

@@ -61,19 +61,7 @@ GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf
 #v75 --- Snapdragon 8 Gen3
 #v79 --- Snapdragon 8 Elite
 
-#8Gen2
-#HTP_ARCH_VERSION=v73
-#HTP_ARCH_VERSION_a=V73
-
-#8Gen3
-#HTP_ARCH_VERSION=v75
-#HTP_ARCH_VERSION_a=V75
-
-#8Elite
-#HTP_ARCH_VERSION=v79
-#HTP_ARCH_VERSION_a=V79
-
-#modify the following two lines to adapt to test phone
+#modify the following two lines to suit the test phone
 HTP_ARCH_VERSION=v79
 HTP_ARCH_VERSION_a=V79

@@ -132,7 +120,7 @@ function check_and_download_hexagon_sdk()
         echo -e "minimal-hexagon-sdk-6.2.0.1.xz already exist\n"
     else
         echo -e "begin downloading minimal-hexagon-sdk-6.2.0.1.xz \n"
-        wget --no-config --quiet --show-progress -O ${PROJECT_ROOT_PATH}/prebuilts/Hexagon_SDK/minimal-hexagon-sdk-6.2.0.1.xz https://github.com/kantv-ai/toolchain/raw/refs/heads/main/minimal-hexagon-sdk-6.2.0.1.xz
+        wget --no-config --quiet --show-progress -O ${PROJECT_ROOT_PATH}/prebuilts/Hexagon_SDK/minimal-hexagon-sdk-6.2.0.1.xz https://github.com/zhouwg/toolchain/raw/refs/heads/main/minimal-hexagon-sdk-6.2.0.1.xz
         if [ $? -ne 0 ]; then
             printf "failed to download minimal-hexagon-sdk-6.2.0.1.xz\n"
             exit 1

@@ -168,8 +156,13 @@ function check_and_download_qnn_sdk()
         is_qnn_sdk_exist=0
     fi
 
+    if [ ! -f ${QNN_SDK_PATH}/NOTICE.txt ]; then
+        echo -e "${TEXT_RED}${QNN_SDK_PATH}/NOTICE.txt not exist${TEXT_RESET}\n"
+        is_qnn_sdk_exist=0
+    fi
+
     if [ ${is_qnn_sdk_exist} -eq 0 ]; then
-        if [ ! -f ${PROJECT_ROOT_PATH}/prebuild/v${QNN_SDK_VERSION}.zip ]; then
+        if [ ! -f ${PROJECT_ROOT_PATH}/prebuilts/v${QNN_SDK_VERSION}.zip ]; then
             wget --no-config --quiet --show-progress -O ${PROJECT_ROOT_PATH}/prebuilts/QNN_SDK/v${QNN_SDK_VERSION}.zip https://softwarecenter.qualcomm.com/api/download/software/sdks/Qualcomm_AI_Runtime_Community/All/${QNN_SDK_VERSION}/v${QNN_SDK_VERSION}.zip
         fi
         if [ $? -ne 0 ]; then

@@ -178,6 +171,10 @@ function check_and_download_qnn_sdk()
         fi
         cd ${PROJECT_ROOT_PATH}/prebuilts/QNN_SDK/
         unzip v${QNN_SDK_VERSION}.zip
+        if [ $? -ne 0 ]; then
+            printf "failed to decompress Qualcomm QNN SDK to %s \n" "${QNN_SDK_PATH}"
+            exit 1
+        fi
         printf "Qualcomm QNN SDK saved to ${QNN_SDK_PATH} \n\n"
         cd ${PROJECT_ROOT_PATH}
     else
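With the per-SoC presets removed, a phone other than a Snapdragon 8 Elite needs the two remaining variables edited before the script is run; for example, for a Snapdragon 8 Gen3 device (the v73/v75/v79 pairings come from the arch notes in this same script):

    # scripts/build-run-android.sh: adjust for the target SoC
    HTP_ARCH_VERSION=v75       # v73 = 8 Gen2, v75 = 8 Gen3, v79 = 8 Elite
    HTP_ARCH_VERSION_a=V75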

scripts/ggml-hexagon.cfg
Lines changed: 2 additions & 3 deletions

@@ -30,9 +30,8 @@ hwaccel_approach = 2
 # b. we can compare Hexagon NPU performance between HWACCEL_CDSP/QNNNPU/the default ggml backend accordingly
 
 
-#enable/disable offload quantized type mulmat
-#quatized type mulmat works fine through QNNNPU at the moment
-#quatized type mulmat doesn't works fine through HWACCEL_CDSP at the moment
+#enable/disable offload quantized mulmat
+#quantized mulmat works fine through QNNNPU at the moment
 #this item will make mulmat performance comprision easily
 enable_q_mulmat = 0
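Since quantized mulmat is reported to work through QNNNPU, someone benchmarking that path would flip this item to 1; a minimal sketch, assuming the cfg keeps the simple key = value layout shown above:

    # enable offload of quantized mulmat for a QNNNPU benchmark run
    sed -i 's/^enable_q_mulmat.*/enable_q_mulmat = 1/' scripts/ggml-hexagon.cfg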
