ggml-dsp: refine logic of thread_counts

zhouwg · zhouwg · commit 758c94a72caa · 2025-04-17T09:45:13.000+08:00
diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c
@@ -198,7 +198,8 @@ int ggmlop_dsp_open(const char*uri, remote_handle64* handle) {
     GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit);
     qurt_sysenv_max_hthreads_t mhwt;
     qurt_sysenv_get_max_hw_threads(&mhwt);
-    GGMLHEXAGON_LOG_DEBUG("max hardware threads=%d", mhwt.max_hthreads);
+    GGMLHEXAGON_LOG_DEBUG("max hardware threads counts=%d", mhwt.max_hthreads);
+    g_thread_counts = mhwt.max_hthreads;
 
     return 0;
 }
@@ -211,13 +212,18 @@ int ggmlop_dsp_close(remote_handle64 handle) {
 }
 
 AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) {
-    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
     HAP_power_request_t request;
     memset(&request, 0, sizeof(HAP_power_request_t));
     request.type = HAP_power_set_apptype;
     request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
 
-    g_thread_counts = thread_counts;
+    GGMLHEXAGON_LOG_DEBUG("user specified thread_counts %d", thread_counts);
+    if (thread_counts > 1)
+        g_thread_counts = (thread_counts > g_thread_counts) ? g_thread_counts : thread_counts;
+    else
+        g_thread_counts = 1;
+    GGMLHEXAGON_LOG_DEBUG("real thread_counts %d", g_thread_counts);
 
     void * ggmop_ctx = (void*)(handle);
     int retval = HAP_power_set(ggmop_ctx, &request);
diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h
@@ -50,9 +50,35 @@ extern "C" {
 #define GGML_MEM_ALIGN      16
 #endif
 
-#define GGML_RESTRICT
+#ifdef __cplusplus
+// restrict not standard in C++
+#    if defined(__GNUC__)
+#        define GGML_RESTRICT       __restrict__
+#    elif defined(__clang__)
+#        define GGML_RESTRICT       __restrict
+#    elif defined(_MSC_VER)
+#        define GGML_RESTRICT       __restrict
+#    else
+#        define GGML_RESTRICT
+#    endif
+#else
+#    if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L)
+#        define GGML_RESTRICT       __restrict
+#    else
+#        define GGML_RESTRICT       restrict
+#    endif
+#endif
+
+#ifndef __cplusplus
+#ifndef static_assert
+        #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+            #define static_assert(cond, msg) _Static_assert(cond, msg)
+        #else
+            #define static_assert(cond, msg) struct global_scope_noop_trick
+        #endif
+#endif
+#endif // __cplusplus
 
-#define static_assert(a, b) do { } while (0)
 
 //NPU performance will be slower when enable GGMLHEXAGON_DEBUG
 #ifdef NDEBUG
diff --git a/ggml/src/ggml-hexagon/kernels/mulmat.c b/ggml/src/ggml-hexagon/kernels/mulmat.c
@@ -132,7 +132,9 @@ static void ggml_compute_forward_mul_mat_one_chunk(const ggml_tensor *src0, cons
                 float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
 
                 for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
-                    vec_dot_f32(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
+                    vec_dot_f32(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0),
+                                (float*)(src0_row + ir0 * nb01), (num_rows_per_vec_dot > 1 ? nb01 : 0),
+                                (float*)src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
                 }
 
                 for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {

Original file line number	Diff line number	Diff line change
`@@ -132,7 +132,9 @@ static void ggml_compute_forward_mul_mat_one_chunk(const ggml_tensor *src0, cons`
`132`	`132`	`float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));`
`133`	`133`
`134`	`134`	`for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {`
`135`		`- vec_dot_f32(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);`
	`135`	`+ vec_dot_f32(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0),`
	`136`	`+ (float*)(src0_row + ir0 * nb01), (num_rows_per_vec_dot > 1 ? nb01 : 0),`
	`137`	`+ (float*)src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);`
`136`	`138`	`}`
`137`	`139`
`138`	`140`	`for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {`