 class qnn_instance;
 struct ggml_backend_hexagon_context;
 
-#if 0 //def NDEBUG
+#ifdef NDEBUG
 #define GGMLHEXAGON_DEBUG                                0
 #else
 #define GGMLHEXAGON_DEBUG                                1
@@ -141,6 +141,7 @@ struct ggml_backend_hexagon_context;
 #define GGMLHEXAGON_LOG_ERROR(...)                      ggmlhexagon_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
 #define GGMLHEXAGON_LOG_WARN(...)                       ggmlhexagon_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
 #define GGMLHEXAGON_LOG_INFO(...)                       ggmlhexagon_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+#define GGMLHEXAGON_LOG_VERBOSE(...)                    ggmlhexagon_log_internal(GGML_LOG_LEVEL_CONT , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
 
 #if GGMLHEXAGON_DEBUG
 #define GGMLHEXAGON_LOG_DEBUG(...)                      ggmlhexagon_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
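The new GGMLHEXAGON_LOG_VERBOSE macro forwards to ggmlhexagon_log_internal at GGML_LOG_LEVEL_CONT and, as defined above, is not gated by GGMLHEXAGON_DEBUG, so it survives NDEBUG builds. A minimal usage sketch with an illustrative call site (the function and variable names are hypothetical):

```cpp
// hypothetical helper: the macro expands to
// ggmlhexagon_log_internal(GGML_LOG_LEVEL_CONT, __FILE__, __FUNCTION__, __LINE__, ...)
static void log_op_duration(const char * op_name, int64_t elapsed_us) {
    GGMLHEXAGON_LOG_VERBOSE("duration of %s : %lld microseconds\n",
                            op_name, (long long)elapsed_us);
}
```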
@@ -154,6 +155,10 @@ struct ggml_backend_hexagon_context;
 #define SIZE_IN_MB                                       (1 << 20)
 #define STATUS_CONTEXT                                   0x12345678
 
+#if !defined(_WINDOWS)
+#pragma weak remote_system_request
+#endif
+
 #define CHECK_QNN_API(error, result)                                            \
     do {                                                                        \
         error = (result);                                                       \
@@ -316,6 +321,7 @@ struct hexagon_appcfg_t {
     int hexagon_backend;        // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP
     int enable_rpc_ion_mempool; // enable/disable rpc ion memory pool
     int enable_rpc_dma_mempool; // enable/disable rpc dma memory pool
+    int enable_all_q_mulmat;    // enable/disable offloading all quantized-type mulmat to cDSP
     const char * cfgfilename;
     const char * runtime_libpath;
     char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN];
@@ -335,6 +341,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = {
         .hexagon_backend         = HEXAGON_BACKEND_CDSP,
         .enable_rpc_ion_mempool  = 0,
         .enable_rpc_dma_mempool  = 0,
+        .enable_all_q_mulmat     = 0,
         .cfgfilename             = "ggml-hexagon.cfg",
 #if defined(__ANDROID__)
 // Android command line program
@@ -344,7 +351,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = {
 #elif defined(_WIN32)
         .qnn_runtimelib_path     = "C:\\",
 #endif
-        .ggml_hexagon_version    = {"1.00"},
+        .ggml_hexagon_version    = {"1.01"},
 };
 
 // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices
@@ -891,7 +898,7 @@ class hexagon_perf {
             return;
         _end_time = ggml_time_us();
         _duration = (_end_time - _begin_time);
-        GGMLHEXAGON_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
+        GGMLHEXAGON_LOG_VERBOSE("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
     }
 
 private:
@@ -1454,6 +1461,7 @@ static void ggmlhexagon_load_cfg() {
     qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32");
     qnncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 1);
     qnncfg_instance.get_intvalue("cdsp", "enable_rpc_dma_mempool", g_hexagon_appcfg.enable_rpc_dma_mempool, 0);
+    qnncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0);
     GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version);
     GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", ggml_hexagon_version.c_str());
     GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach,
@@ -1504,6 +1512,13 @@ static bool ggmlhexagon_check_valid_appcfg() {
             GGMLHEXAGON_LOG_INFO("rpc dma mempool not supported");
             is_valid_appcfg = false;
         }
+
+        if (1 == g_hexagon_appcfg.enable_all_q_mulmat) {
+            if (0 == g_hexagon_appcfg.enable_q_mulmat) {
+                GGMLHEXAGON_LOG_INFO("set enable_q_mulmat to 1 first when setting enable_all_q_mulmat to 1");
+                is_valid_appcfg = false;
+            }
+        }
     }
 
     if (!is_valid_appcfg) {
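Both new hunks act on the same key: enable_all_q_mulmat is read from the "cdsp" section of ggml-hexagon.cfg with a default of 0, and the validation above rejects a configuration that sets it without also setting enable_q_mulmat. A sketch of the relevant cfg excerpt, assuming the INI-style section/key layout implied by the get_intvalue calls (values are illustrative):

```ini
# ggml-hexagon.cfg (illustrative excerpt)
[cdsp]
enable_rpc_ion_mempool = 1
enable_rpc_dma_mempool = 0
# enable_q_mulmat must also be 1, otherwise ggmlhexagon_check_valid_appcfg() rejects the config
enable_all_q_mulmat    = 1
```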
@@ -2743,6 +2758,10 @@ static void ggmlqnn_sdk_logcallback(const char * fmt,
         vsnprintf(reinterpret_cast<char *const>(s_ggmlqnn_sdk_logbuf), GGMLHEXAGON_LOGBUF_LEN, fmt, argp);
         GGMLHEXAGON_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggmlqnn_sdk_logbuf);
     }
+#if !GGMLHEXAGON_DEBUG
+    GGML_UNUSED(log_level_desc);
+    GGML_UNUSED(ms);
+#endif
 }
 
 int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
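When GGMLHEXAGON_DEBUG is 0 the GGMLHEXAGON_LOG_DEBUG call above presumably compiles away, leaving log_level_desc and ms unreferenced, so the GGML_UNUSED markers exist to silence unused-variable warnings. A self-contained sketch of the same pattern with stand-in names (GGML_UNUSED itself is a plain void cast in ggml.h):

```cpp
#include <cstdio>

#define MY_DEBUG     0
#define MY_UNUSED(x) (void)(x)      // same idea as GGML_UNUSED in ggml.h

// when the debug branch is compiled out, parameters that only feed the log
// call must still be "used" to avoid -Wunused-parameter warnings
static void demo_log_cb(const char * level_desc, double ms) {
#if MY_DEBUG
    printf("%8.1fms [%-7s]\n", ms, level_desc);
#else
    MY_UNUSED(level_desc);
    MY_UNUSED(ms);
#endif
}
```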
@@ -5075,6 +5094,7 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const
 
     const struct ggml_tensor * src0 = op_tensor->src[0];
     const struct ggml_tensor * src1 = op_tensor->src[1];
+    const int src0_rank = ggml_n_dims(src0);
     switch (op_tensor->op) {
         case GGML_OP_ADD:
         {
@@ -5086,7 +5106,15 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const
         case GGML_OP_MUL_MAT:
         {
             ggmlhexagon_dump_op_info(op_tensor);
+            // FIXME: remove this filter in the future
+            if (2 != src0_rank) {
+                return false;
+            }
             if (1 == g_hexagon_appcfg.enable_q_mulmat) {
+                if (1 == g_hexagon_appcfg.enable_all_q_mulmat) {
+                    return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32);
+                }
+
                 return (src0->type == GGML_TYPE_F32
                         || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0
                         || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K
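Taken together, the MUL_MAT gate now rejects anything that is not a rank-2 src0, and enable_all_q_mulmat widens the accepted src0 types from the explicit Q4_0/Q8_0/Q6_K/Q8_K list to anything ggml_is_quantized() accepts, while still requiring F32 src1 in that branch. A hypothetical, condensed restatement of the branches visible in this hunk (the helper name and the final fallback are illustrative, and the allow-list continues past the lines shown):

```cpp
// condensed restatement of the MUL_MAT offload gate above (names illustrative)
static bool mulmat_offload_ok(const ggml_tensor * src0, const ggml_tensor * src1,
                              const hexagon_appcfg_t & cfg) {
    if (ggml_n_dims(src0) != 2) {       // FIXME filter from the hunk: rank-2 only for now
        return false;
    }
    if (1 == cfg.enable_q_mulmat && 1 == cfg.enable_all_q_mulmat) {
        // widest gate: F32 or any quantized src0, F32 src1
        return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type))
            && (src1->type == GGML_TYPE_F32);
    }
    if (1 == cfg.enable_q_mulmat) {
        // explicit allow-list; the condition continues beyond the lines shown in this hunk
        return src0->type == GGML_TYPE_F32
            || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0
            || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K;
    }
    return false;                       // fallback not visible in this hunk; placeholder only
}
```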
@@ -5126,9 +5154,9 @@ static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const
 
     struct ggml_tensor * src0 = op_tensor->src[0];
     struct ggml_tensor * src1 = op_tensor->src[1];
-    const int64_t ne00 = src0->ne[0];;
+    const int64_t ne00         = src0->ne[0];
     const int src0_rank = ggml_n_dims(src0);
-    int src1_rank        = 0;
+    int src1_rank  = 0;
     if (nullptr != src1) {
         src1_rank = ggml_n_dims(src1);
     }