|
45 | 45 |
|
46 | 46 | #define GGML_MAX_DIMS 4 |
47 | 47 |
|
| 48 | +#define ALIGN_128_BYTE 128 |
| 49 | + |
48 | 50 | #define GGML_UNUSED(x) (void)(x) |
49 | 51 |
|
50 | 52 | #define UNUSED GGML_UNUSED |
@@ -223,7 +225,7 @@ static const struct ggml_type_traits type_traits[1] = { |
223 | 225 | // section-2: ggml-hexagon kernel's internal troubleshooting function |
224 | 226 | // ================================================================================================= |
225 | 227 | static void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) { |
226 | | - //return; |
| 228 | + return; |
227 | 229 | static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN]; |
228 | 230 | va_list args; |
229 | 231 | va_start(args, format); |
@@ -504,6 +506,46 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 |
504 | 506 | // ================================================================================================= |
505 | 507 | // section-5: ggml-hexagon kernel function: offload ggmlop to cDSP through Hexagon C API and SIMD instructions |
506 | 508 | // ================================================================================================= |
| 509 | +inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) { |
| 510 | + HVX_Vector * va; |
| 511 | + HVX_Vector * vb; |
| 512 | + HVX_Vector * vc; |
| 513 | + HVX_Vector qf32; |
| 514 | + const int FLOATS_PER_VECTOR = 128 / sizeof(float); |
| 515 | + const int block = n / FLOATS_PER_VECTOR; |
| 516 | + const int left = n % FLOATS_PER_VECTOR; |
| 517 | + const int blocks = block * FLOATS_PER_VECTOR; |
| 518 | + |
| 519 | + if (0 == block) { |
| 520 | + for (size_t i = 0; i < n; ++i) |
| 521 | + z[i] = x[i] + y[i]; |
| 522 | + |
| 523 | + return; |
| 524 | + } |
| 525 | + |
| 526 | + if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) { |
| 527 | + GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y); |
| 528 | + for (size_t i = 0; i < n; ++i) |
| 529 | + z[i] = x[i] + y[i]; |
| 530 | + |
| 531 | + return; |
| 532 | + } |
| 533 | + |
| 534 | + va = (HVX_Vector *)x; |
| 535 | + vb = (HVX_Vector *)y; |
| 536 | + vc = (HVX_Vector *)z; |
| 537 | + for (size_t i = 0; i < block; ++i) { |
| 538 | + qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++); |
| 539 | + *vc = Q6_Vsf_equals_Vqf32(qf32); |
| 540 | + vc++; |
| 541 | + } |
| 542 | + |
| 543 | + if (left > 0) { |
| 544 | + for (size_t i = 0; i < left; ++i) |
| 545 | + z[i + blocks] = x[i + blocks] + y[i + blocks]; |
| 546 | + } |
| 547 | +} |
| 548 | + |
507 | 549 | static void ggml_compute_forward_add_f32( |
508 | 550 | const struct ggml_tensor * src0, |
509 | 551 | const struct ggml_tensor * src1, |
@@ -545,19 +587,7 @@ static void ggml_compute_forward_add_f32( |
545 | 587 | float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); |
546 | 588 | float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); |
547 | 589 | for (int64_t r = 0; r < nr0; ++r) { |
548 | | -#ifdef GGML_USE_ACCELERATE |
549 | | - //vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); |
550 | | - HVX_Vector *va = (HVX_Vector *) src1_ptr; |
551 | | - HVX_Vector *vb = (HVX_Vector *) src0_ptr + r * ne10; |
552 | | - HVX_Vector *vc = (HVX_Vector *) dst_ptr + r * ne10; |
553 | | - int total_vectors = ne10 / FLOATS_PER_VECTOR; |
554 | | - GGMLHEXAGON_LOG_DEBUG("total_vectors %d", total_vectors); |
555 | | - for (int i = 0; i < total_vectors; ++i) { |
556 | | - *vc++ = Q6_Vqf32_vadd_Vqf32Vqf32(*va++, *vb++); |
557 | | - } |
558 | | -#else |
559 | | - ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); |
560 | | -#endif |
| 590 | + ggmlhexagon_dsp_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); |
561 | 591 | } |
562 | 592 | } |
563 | 593 | } else { |
|
0 commit comments