ggml-hexagon: release v1.06 and ready for code review

zhouwg · zhouwg · commit 48b72f31729e · 2025-05-17T10:44:49.000+08:00
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -383,8 +383,8 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = {
 #elif defined(_WIN32)
         .qnn_runtimelib_path    = "C:\\",
 #endif
-        .ggml_hexagon_version   = {"1.05"},
-        .ggml_dsp_version       = {"0.62"},
+        .ggml_hexagon_version   = {"1.06"},
+        .ggml_dsp_version       = {"0.63"},
 };
 
 //file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices
@@ -1417,6 +1417,13 @@ class hexagon_appcfg {
         section = cur_section;
         trim(key);
         trim(value);
+
+        // strip one pair of enclosing double quotes, e.g. "1.00" -> 1.00; requires >= 2 chars so an empty value or a lone quote is left untouched
+        if (value.size() >= 2 && value.front() == '"' && value.back() == '"') {
+            value.erase(0, 1); // erase the first character "
+            value.erase(value.size() - 1); // erase the last character "
+        }
+
         return true;
     }
 
@@ -1829,8 +1836,10 @@ static void ggmlhexagon_load_cfg() {
         GGMLHEXAGON_LOG_INFO("%s", tmposs.str().c_str());
     });
     std::string precision_mode;
-    std::string ggml_hexagon_version;
-    hexagoncfg_instance.get_stringvalue("general", "version", ggml_hexagon_version, "1.00");
+    std::string version; //version of ggml-hexagon.cpp
+    std::string ggmldsp_version; //version of ggml-dsp.c
+    hexagoncfg_instance.get_stringvalue("general", "version", version, "1.00");
+    hexagoncfg_instance.get_stringvalue("general", "ggmldsp_version", ggmldsp_version, "0.62");
     hexagoncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1);
     hexagoncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0);
     hexagoncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0);
@@ -1854,7 +1863,9 @@ static void ggmlhexagon_load_cfg() {
 
     GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version);
     GGMLHEXAGON_LOG_INFO("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version);
-    GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", ggml_hexagon_version.c_str());
+    GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", version.c_str());
+    GGMLHEXAGON_LOG_INFO("external ggml_dsp_version=%s", ggmldsp_version.c_str());
+    snprintf(g_hexagon_appcfg.ggml_dsp_version, sizeof(g_hexagon_appcfg.ggml_dsp_version), "%s", ggmldsp_version.c_str()); // bounded, always NUL-terminated copy of the cfg value over the built-in default
     GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach,
                          ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach));
     GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend,
@@ -5445,6 +5456,7 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_
     // between ARM-AP and cDSP. the mechanism in qidl/FastRPC is exactly similar to mechanism in TEE.
     // try to find a better/efficient approach to exchange necessary data between ARM-AP side and cDSP side.
     // manually modifying the important data structure ggml_tensor in ggml.h is not make-sense and not acceptable.
+    std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now();
     dsptensor_0.data        = src0->data;
     dsptensor_0.data_len    = ggml_nbytes(src0);
     dsptensor_0.type        = src0->type;
@@ -5491,6 +5503,9 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_
     dsptensor_2.nb[3] = dst->nb[3];
 
     memcpy(dsptensor_2.op_params, dst->op_params, GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    const auto end_time = std::chrono::high_resolution_clock::now();
+    const auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time); // time spent filling the dsptensor structs
+    GGMLHEXAGON_LOG_VERBOSE("pack duration %llu ns", static_cast<unsigned long long>(duration.count())); // cast so the argument always matches %llu
 
     hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2);
     if (AEE_SUCCESS != hexagon_error) {
diff --git a/ggml/src/ggml-hexagon/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile
@@ -21,7 +21,7 @@ CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initi
 LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET}
 
 #SRCS = $(wildcard *.c)
-SRCS = ggml-dsp.c skel.c add.c  mulmat.c
+SRCS = ggml-dsp.c skel.c entry.c add.c  mulmat.c
 OBJS = $(patsubst %.c, %.o, $(SRCS))
 
 ALL:$(OBJS)
diff --git a/ggml/src/ggml-hexagon/kernels/entry.c b/ggml/src/ggml-hexagon/kernels/entry.c
@@ -0,0 +1,115 @@
+#include "ggml-dsp.h"
+
+static int32 g_thread_counts = 1; // value returned by ggmlop_get_thread_counts(); starts at 1, rewritten by ggmlop_dsp_open() and ggmlop_dsp_setclocks()
+
+int ggmlop_dsp_open(const char * uri, remote_handle64 * handle) { // stores a new handle in *handle, logs platform info, records the max HW thread count; returns 0
+    void * tptr = NULL;
+    GGMLHEXAGON_LOG_DEBUG("uri %s", uri);
+    tptr = (void *)malloc(1); // 1-byte allocation whose address is used as the handle value; released by ggmlop_dsp_close()
+    GGML_ASSERT(NULL != tptr);
+    *handle = (remote_handle64)tptr;
+
+    GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", qurt_api_version());
+    GGMLHEXAGON_LOG_DEBUG("hvx units = %d", qurt_hvx_get_units()); // value is printed in decimal, so no hex prefix
+    qurt_arch_version_t  vers;
+    qurt_sysenv_get_arch_version(&vers);
+    GGMLHEXAGON_LOG_DEBUG("arch_version=0x%x", vers.arch_version);
+
+    qurt_sysenv_app_heap_t aheap;
+    qurt_sysenv_get_app_heap(&aheap);
+    GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit);
+
+    qurt_sysenv_max_hthreads_t mhwt;
+    qurt_sysenv_get_max_hw_threads(&mhwt);
+    GGMLHEXAGON_LOG_DEBUG("max hardware threads counts=%d", mhwt.max_hthreads);
+    g_thread_counts = mhwt.max_hthreads; // becomes the upper bound that ggmlop_dsp_setclocks() clamps the requested count against
+
+    return 0;
+}
+
+int ggmlop_dsp_close(remote_handle64 handle) { // frees the allocation behind a handle produced by ggmlop_dsp_open(); always returns 0
+    if (handle) // a zero handle is ignored
+        free((void*)handle);
+
+    return 0;
+}
+
+AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) { // updates g_thread_counts, then issues three HAP_power_set requests (apptype, DCVS v2, HVX); AEE_SUCCESS only if all three succeed
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_apptype;
+    request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
+
+    GGMLHEXAGON_LOG_DEBUG("user specified thread_counts %d", thread_counts);
+    if (thread_counts > 1)
+        g_thread_counts = (thread_counts > g_thread_counts) ? g_thread_counts : thread_counts; // capped at the current g_thread_counts; NOTE(review): once lowered, a later call cannot raise it again until ggmlop_dsp_open() resets it
+    else
+        g_thread_counts = 1; // any request <= 1 (including 0 or negative) selects a single thread
+    GGMLHEXAGON_LOG_DEBUG("real thread_counts %d", g_thread_counts);
+
+    void * ggmop_ctx = (void*)(handle); // the handle value is passed as the context argument of every HAP_power_set call below
+    int retval = HAP_power_set(ggmop_ctx, &request); // request 1: apptype
+    if (retval)  {
+        GGMLHEXAGON_LOG_DEBUG("failed first power vote");
+        return AEE_EFAILED; // note: g_thread_counts has already been updated at this point
+    }
+
+    //configure clocks & DCVS mode
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_DCVS_v2;
+    request.dcvs_v2.dcvs_enable = TRUE; // set to TRUE regardless of dcvs_enabled; dcvs_enabled only selects the min/max corners below
+    request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level;
+    if (dcvs_enabled) {
+        request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE;
+        request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE;
+    } else { // min and max corners are both set equal to target_corner
+        request.dcvs_v2.dcvs_params.min_corner = request.dcvs_v2.dcvs_params.target_corner;
+        request.dcvs_v2.dcvs_params.max_corner = request.dcvs_v2.dcvs_params.target_corner;
+    }
+    request.dcvs_v2.dcvs_option     = HAP_DCVS_V2_PERFORMANCE_MODE;
+    request.dcvs_v2.set_dcvs_params = TRUE;
+    request.dcvs_v2.set_latency     = TRUE;
+    request.dcvs_v2.latency         = latency;
+    retval = HAP_power_set(ggmop_ctx, &request); // request 2: DCVS v2 settings
+    if (retval) {
+        GGMLHEXAGON_LOG_DEBUG("failed to vote for performance mode");
+        return AEE_EFAILED;
+    }
+
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_HVX;
+    request.hvx.power_up = TRUE;
+    retval = HAP_power_set(ggmop_ctx, &request); // request 3: HVX power-up
+    if (retval) {
+        GGMLHEXAGON_LOG_DEBUG("failed to vote for HVX power");
+        return AEE_EFAILED;
+    }
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
+    return AEE_SUCCESS;
+}
+
+// =================================================================================================
+//  implementation of ggml-hexagon kernels; it's better to put each hexagon kernel in its own source file
+// =================================================================================================
+int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { // stub: none of the parameters are used
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
+    return 0; // no computation is performed and dst is not written; always returns 0
+}
+
+int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { // stub: none of the parameters are used
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
+    return 0; // no computation is performed and dst is not written; always returns 0
+}
+
+int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { // stub: none of the parameters are used
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
+    return 0; // no computation is performed and dst is not written; always returns 0
+}
+
+int ggmlop_get_thread_counts(void) { // accessor for the file-static g_thread_counts
+    return g_thread_counts; // 1 by default; changed by ggmlop_dsp_open() and ggmlop_dsp_setclocks()
+}
diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c
@@ -1,10 +1,33 @@
+/*
+ * Copyright (c) 2025 The ggml authors
+ *
+ * Qualcomm Hexagon SDK and reference tech guides could be found at:
+ * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools
+ *
+ * this single-source-file or self-contained file is implementation of ggml-dsp:
+ *    - a customized tiny ggml running on Qualcomm Hexagon cDSP
+ *    - ported from original ggml
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
 #include "ggml-dsp.h"
 
-// =================================================================================================
-// tiny ggml-dsp, ported from original ggml
-// =================================================================================================
-static int32 g_thread_counts = 1;
-
 void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) {
 #if !GGMLHEXAGON_DEBUG
     return;
@@ -30,7 +53,7 @@ void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) {
     char tmpbuf[GGMLHEXAGON_LOGBUF_LEN];
     size_t buflen = 0;
     if (tensor->type == GGML_TYPE_F32) {
-        memset(tmpbuf, 0, GGMLHEXAGON_LOG_LEVEL_DEBUG);
+        memset(tmpbuf, 0, GGMLHEXAGON_LOGBUF_LEN);
         for (int h = 0; h < tensor->ne[3]; h++) {
             for (int i = 0; i < tensor->ne[2]; i++) {
                 for (int j = 0; j < tensor->ne[1]; j++) {
@@ -173,116 +196,3 @@ int64_t ggml_time_ms(void) {
 int64_t ggml_time_us(void) {
     return hexagon_perf_get_time_us();
 }
-
-int ggmlop_get_thread_counts(void) {
-    return g_thread_counts;
-}
-
-// =================================================================================================
-//  implementation of ggml-hexagon kernel skel function
-// =================================================================================================
-int ggmlop_dsp_open(const char*uri, remote_handle64* handle) {
-    void *tptr = NULL;
-    GGMLHEXAGON_LOG_DEBUG("uri %s", uri);
-    tptr = (void *)malloc(1);
-    *handle = (remote_handle64)tptr;
-    assert(*handle);
-
-    GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", qurt_api_version());
-    GGMLHEXAGON_LOG_DEBUG("hvx units = 0x%d", qurt_hvx_get_units());
-    qurt_arch_version_t  vers;
-    qurt_sysenv_get_arch_version(&vers);
-    GGMLHEXAGON_LOG_DEBUG("arch_version=0x%x", vers.arch_version);
-    qurt_sysenv_app_heap_t aheap;
-    qurt_sysenv_get_app_heap(&aheap);
-    GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit);
-    qurt_sysenv_max_hthreads_t mhwt;
-    qurt_sysenv_get_max_hw_threads(&mhwt);
-    GGMLHEXAGON_LOG_DEBUG("max hardware threads counts=%d", mhwt.max_hthreads);
-    g_thread_counts = mhwt.max_hthreads;
-
-    return 0;
-}
-
-int ggmlop_dsp_close(remote_handle64 handle) {
-    if (handle)
-        free((void*)handle);
-
-    return 0;
-}
-
-AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) {
-    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
-    HAP_power_request_t request;
-    memset(&request, 0, sizeof(HAP_power_request_t));
-    request.type = HAP_power_set_apptype;
-    request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
-
-    GGMLHEXAGON_LOG_DEBUG("user specified thread_counts %d", thread_counts);
-    if (thread_counts > 1)
-        g_thread_counts = (thread_counts > g_thread_counts) ? g_thread_counts : thread_counts;
-    else
-        g_thread_counts = 1;
-    GGMLHEXAGON_LOG_DEBUG("real thread_counts %d", g_thread_counts);
-
-    void * ggmop_ctx = (void*)(handle);
-    int retval = HAP_power_set(ggmop_ctx, &request);
-    if (retval)  {
-        GGMLHEXAGON_LOG_DEBUG("failed first power vote");
-        return AEE_EFAILED;
-    }
-
-    //configure clocks & DCVS mode
-    memset(&request, 0, sizeof(HAP_power_request_t));
-    request.type = HAP_power_set_DCVS_v2;
-    request.dcvs_v2.dcvs_enable = TRUE;
-    request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level;
-    if (dcvs_enabled) {
-        request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE;
-        request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE;
-    } else {
-        request.dcvs_v2.dcvs_params.min_corner = request.dcvs_v2.dcvs_params.target_corner;
-        request.dcvs_v2.dcvs_params.max_corner = request.dcvs_v2.dcvs_params.target_corner;
-    }
-    request.dcvs_v2.dcvs_option     = HAP_DCVS_V2_PERFORMANCE_MODE;
-    request.dcvs_v2.set_dcvs_params = TRUE;
-    request.dcvs_v2.set_latency     = TRUE;
-    request.dcvs_v2.latency         = latency;
-    retval = HAP_power_set(ggmop_ctx, &request);
-    if (retval) {
-        GGMLHEXAGON_LOG_DEBUG("failed to vote for performance mode");
-        return AEE_EFAILED;
-    }
-
-    memset(&request, 0, sizeof(HAP_power_request_t));
-    request.type = HAP_power_set_HVX;
-    request.hvx.power_up = TRUE;
-    retval = HAP_power_set(ggmop_ctx, &request);
-    if (retval) {
-        GGMLHEXAGON_LOG_DEBUG("failed to vote for HVX power");
-        return AEE_EFAILED;
-    }
-    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
-    return AEE_SUCCESS;
-}
-
-// =================================================================================================
-//  implementation of ggml-hexagon kernel, it's better to put every hexagon-kernel to a single file
-// =================================================================================================
-int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
-    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
-    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
-    return 0;
-}
-
-int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
-    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
-    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
-    return 0;
-}
-
-int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
-    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
-    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
-    return 0;
-}
diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg
@@ -23,9 +23,9 @@
 #
 [general]
 #version of ggml-hexagon.cpp on ARM-AP side
-version = "1.05"
+version = "1.06"
 #version of ggml-dsp.c on cDSP side
-ggmldsp_version = "0.62"
+ggmldsp_version = "0.63"
 
 #0: HEXAGON_BACKEND_QNNCPU
 #1: HEXAGON_BACKEND_QNNGPU