Skip to content

Commit 254f4d5

Browse files
author
zhouwg
committed
ggml-hexagon: optimize GGML_OP_ADD on cDSP side
1 parent 3ee7316 commit 254f4d5

File tree

1 file changed

+44
-14
lines changed

1 file changed

+44
-14
lines changed

ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545

4646
#define GGML_MAX_DIMS 4
4747

48+
#define ALIGN_128_BYTE 128
49+
4850
#define GGML_UNUSED(x) (void)(x)
4951

5052
#define UNUSED GGML_UNUSED
@@ -223,7 +225,7 @@ static const struct ggml_type_traits type_traits[1] = {
223225
// section-2: ggml-hexagon kernel's internal troubleshooting function
224226
// =================================================================================================
225227
static void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) {
226-
//return;
228+
return;
227229
static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN];
228230
va_list args;
229231
va_start(args, format);
@@ -504,6 +506,46 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32
504506
// =================================================================================================
505507
// section-5: ggml-hexagon kernel function: offload ggmlop to cDSP through Hexagon C API and SIMD instructions
506508
// =================================================================================================
509+
inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) {
510+
HVX_Vector * va;
511+
HVX_Vector * vb;
512+
HVX_Vector * vc;
513+
HVX_Vector qf32;
514+
const int FLOATS_PER_VECTOR = 128 / sizeof(float);
515+
const int block = n / FLOATS_PER_VECTOR;
516+
const int left = n % FLOATS_PER_VECTOR;
517+
const int blocks = block * FLOATS_PER_VECTOR;
518+
519+
if (0 == block) {
520+
for (size_t i = 0; i < n; ++i)
521+
z[i] = x[i] + y[i];
522+
523+
return;
524+
}
525+
526+
if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) {
527+
GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y);
528+
for (size_t i = 0; i < n; ++i)
529+
z[i] = x[i] + y[i];
530+
531+
return;
532+
}
533+
534+
va = (HVX_Vector *)x;
535+
vb = (HVX_Vector *)y;
536+
vc = (HVX_Vector *)z;
537+
for (size_t i = 0; i < block; ++i) {
538+
qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++);
539+
*vc = Q6_Vsf_equals_Vqf32(qf32);
540+
vc++;
541+
}
542+
543+
if (left > 0) {
544+
for (size_t i = 0; i < left; ++i)
545+
z[i + blocks] = x[i + blocks] + y[i + blocks];
546+
}
547+
}
548+
507549
static void ggml_compute_forward_add_f32(
508550
const struct ggml_tensor * src0,
509551
const struct ggml_tensor * src1,
@@ -545,19 +587,7 @@ static void ggml_compute_forward_add_f32(
545587
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
546588
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
547589
for (int64_t r = 0; r < nr0; ++r) {
548-
#ifdef GGML_USE_ACCELERATE
549-
//vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
550-
HVX_Vector *va = (HVX_Vector *) src1_ptr;
551-
HVX_Vector *vb = (HVX_Vector *) src0_ptr + r * ne10;
552-
HVX_Vector *vc = (HVX_Vector *) dst_ptr + r * ne10;
553-
int total_vectors = ne10 / FLOATS_PER_VECTOR;
554-
GGMLHEXAGON_LOG_DEBUG("total_vectors %d", total_vectors);
555-
for (int i = 0; i < total_vectors; ++i) {
556-
*vc++ = Q6_Vqf32_vadd_Vqf32Vqf32(*va++, *vb++);
557-
}
558-
#else
559-
ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
560-
#endif
590+
ggmlhexagon_dsp_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
561591
}
562592
}
563593
} else {

0 commit comments

Comments
 (0)