Skip to content

Commit 758c94a

Browse files
author
zhouwg
committed
ggml-dsp: refine logic of thread_counts
1 parent 01fc3f6 commit 758c94a

File tree

3 files changed

+40
-6
lines changed

3 files changed

+40
-6
lines changed

ggml/src/ggml-hexagon/kernels/ggml-dsp.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,8 @@ int ggmlop_dsp_open(const char*uri, remote_handle64* handle) {
198198
GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit);
199199
qurt_sysenv_max_hthreads_t mhwt;
200200
qurt_sysenv_get_max_hw_threads(&mhwt);
201-
GGMLHEXAGON_LOG_DEBUG("max hardware threads=%d", mhwt.max_hthreads);
201+
GGMLHEXAGON_LOG_DEBUG("max hardware threads counts=%d", mhwt.max_hthreads);
202+
g_thread_counts = mhwt.max_hthreads;
202203

203204
return 0;
204205
}
@@ -211,13 +212,18 @@ int ggmlop_dsp_close(remote_handle64 handle) {
211212
}
212213

213214
AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) {
214-
GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
215+
GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
215216
HAP_power_request_t request;
216217
memset(&request, 0, sizeof(HAP_power_request_t));
217218
request.type = HAP_power_set_apptype;
218219
request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
219220

220-
g_thread_counts = thread_counts;
221+
GGMLHEXAGON_LOG_DEBUG("user specified thread_counts %d", thread_counts);
222+
if (thread_counts > 1)
223+
g_thread_counts = (thread_counts > g_thread_counts) ? g_thread_counts : thread_counts;
224+
else
225+
g_thread_counts = 1;
226+
GGMLHEXAGON_LOG_DEBUG("real thread_counts %d", g_thread_counts);
221227

222228
void * ggmop_ctx = (void*)(handle);
223229
int retval = HAP_power_set(ggmop_ctx, &request);

ggml/src/ggml-hexagon/kernels/ggml-dsp.h

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,35 @@ extern "C" {
5050
#define GGML_MEM_ALIGN 16
5151
#endif
5252

53-
#define GGML_RESTRICT
53+
#ifdef __cplusplus
54+
// `restrict` is not a standard keyword in C++, so fall back to compiler-specific spellings
55+
# if defined(__GNUC__)
56+
# define GGML_RESTRICT __restrict__
57+
# elif defined(__clang__)
58+
# define GGML_RESTRICT __restrict
59+
# elif defined(_MSC_VER)
60+
# define GGML_RESTRICT __restrict
61+
# else
62+
# define GGML_RESTRICT
63+
# endif
64+
#else
65+
# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L)
66+
# define GGML_RESTRICT __restrict
67+
# else
68+
# define GGML_RESTRICT restrict
69+
# endif
70+
#endif
71+
72+
#ifndef __cplusplus
73+
#ifndef static_assert
74+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
75+
#define static_assert(cond, msg) _Static_assert(cond, msg)
76+
#else
77+
#define static_assert(cond, msg) struct global_scope_noop_trick
78+
#endif
79+
#endif
80+
#endif // __cplusplus
5481

55-
#define static_assert(a, b) do { } while (0)
5682

5783
// NPU performance will be slower when GGMLHEXAGON_DEBUG is enabled
5884
#ifdef NDEBUG

ggml/src/ggml-hexagon/kernels/mulmat.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,9 @@ static void ggml_compute_forward_mul_mat_one_chunk(const ggml_tensor *src0, cons
132132
float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
133133

134134
for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
135-
vec_dot_f32(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
135+
vec_dot_f32(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0),
136+
(float*)(src0_row + ir0 * nb01), (num_rows_per_vec_dot > 1 ? nb01 : 0),
137+
(float*)src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
136138
}
137139

138140
for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {

0 commit comments

Comments
 (0)