ggml-hexagon: optimize GGML_OP_ADD on cDSP side

zhouwg · zhouwg · commit 254f4d5b2673 · 2025-04-01T22:23:48.000+08:00
diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c
@@ -45,6 +45,8 @@
 
 #define GGML_MAX_DIMS       4
 
+#define ALIGN_128_BYTE      128
+
 #define GGML_UNUSED(x)      (void)(x)
 
 #define UNUSED              GGML_UNUSED
@@ -223,7 +225,7 @@ static const struct ggml_type_traits type_traits[1] = {
 //  section-2: ggml-hexagon kernel's internal troubleshooting function
 // =================================================================================================
 static void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) {
-    //return;
+    return;
     static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN];
     va_list args;
     va_start(args, format);
@@ -504,6 +506,46 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32
 // =================================================================================================
 //  section-5: ggml-hexagon kernel function: offload ggmlop to cDSP through Hexagon C API and SIMD instructions
 // =================================================================================================
+inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) {
+    HVX_Vector * va;
+    HVX_Vector * vb;
+    HVX_Vector * vc;
+    HVX_Vector qf32;
+    const int FLOATS_PER_VECTOR = 128 / sizeof(float);
+    const int block  = n / FLOATS_PER_VECTOR;
+    const int left   = n % FLOATS_PER_VECTOR;
+    const int blocks = block * FLOATS_PER_VECTOR;
+
+    if (0 == block) {
+        for (size_t i = 0; i < n; ++i)
+            z[i] = x[i] + y[i];
+
+        return;
+    }
+
+    if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) {
+        GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y);
+        for (size_t i = 0; i < n; ++i)
+            z[i] = x[i] + y[i];
+
+        return;
+    }
+
+    va = (HVX_Vector *)x;
+    vb = (HVX_Vector *)y;
+    vc = (HVX_Vector *)z;
+    for (size_t i = 0; i < block; ++i) {
+        qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++);
+        *vc = Q6_Vsf_equals_Vqf32(qf32);
+        vc++;
+    }
+
+    if (left > 0) {
+        for (size_t i = 0; i < left; ++i)
+            z[i + blocks] = x[i + blocks] + y[i + blocks];
+    }
+}
+
 static void ggml_compute_forward_add_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -545,19 +587,7 @@ static void ggml_compute_forward_add_f32(
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
             for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                //vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
-                HVX_Vector *va = (HVX_Vector *) src1_ptr;
-                HVX_Vector *vb = (HVX_Vector *) src0_ptr + r * ne10;
-                HVX_Vector *vc = (HVX_Vector *) dst_ptr + r * ne10;
-                int total_vectors = ne10 / FLOATS_PER_VECTOR;
-                GGMLHEXAGON_LOG_DEBUG("total_vectors %d", total_vectors);
-                for (int i = 0; i < total_vectors; ++i) {
-                    *vc++ = Q6_Vqf32_vadd_Vqf32Vqf32(*va++, *vb++);
-                }
-#else
-        ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
+                ggmlhexagon_dsp_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
             }
         }
     } else {