1+ #include "ggml-dsp.h"
2+
3+ inline static void ggmlhexagon_dsp_add_f32 (const int n , float * z , const float * x , const float * y ) {
4+ HVX_Vector * va ;
5+ HVX_Vector * vb ;
6+ HVX_Vector * vc ;
7+ HVX_Vector qf32 ;
8+ const int FLOATS_PER_VECTOR = 128 / sizeof (float );
9+ const int block = n / FLOATS_PER_VECTOR ;
10+ const int left = n % FLOATS_PER_VECTOR ;
11+ const int blocks = block * FLOATS_PER_VECTOR ;
12+
13+ if (0 == block ) {
14+ for (size_t i = 0 ; i < n ; ++ i )
15+ z [i ] = x [i ] + y [i ];
16+
17+ return ;
18+ }
19+
20+ if ((((uintptr_t )z | (uintptr_t )x | (uintptr_t )y ) % ALIGN_128_BYTE ) != 0 ) {
21+ GGMLHEXAGON_LOG_DEBUG ("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p" , z , x , y );
22+ for (size_t i = 0 ; i < n ; ++ i )
23+ z [i ] = x [i ] + y [i ];
24+
25+ return ;
26+ }
27+
28+ va = (HVX_Vector * )x ;
29+ vb = (HVX_Vector * )y ;
30+ vc = (HVX_Vector * )z ;
31+ for (size_t i = 0 ; i < block ; ++ i ) {
32+ qf32 = Q6_Vqf32_vadd_VsfVsf (* va ++ , * vb ++ );
33+ * vc = Q6_Vsf_equals_Vqf32 (qf32 );
34+ vc ++ ;
35+ }
36+
37+ if (left > 0 ) {
38+ for (size_t i = 0 ; i < left ; ++ i )
39+ z [i + blocks ] = x [i + blocks ] + y [i + blocks ];
40+ }
41+ }
42+
43+ static void ggml_compute_forward_add_f32 (
44+ const struct ggml_tensor * src0 ,
45+ const struct ggml_tensor * src1 ,
46+ struct ggml_tensor * dst ) {
47+ GGMLHEXAGON_LOG_DEBUG ("enter %s" , __func__ );
48+ uint64_t start_time = ggml_time_us ();
49+
50+ memcpy (dst -> ne , src1 -> ne , 16 );
51+ memcpy (dst -> nb , src1 -> nb , 16 );
52+ ggmlhexagon_dump_tensor (src0 , 1 );
53+ ggmlhexagon_dump_tensor (src1 , 1 );
54+ ggmlhexagon_dump_tensor (dst , 1 );
55+
56+ GGML_ASSERT (ggml_can_repeat (src1 , src0 ) && ggml_are_same_shape (src0 , dst ));
57+
58+ const int ith = 0 ;
59+ const int nth = 1 ;
60+
61+ const int nr = ggml_nrows (src0 );
62+ GGML_TENSOR_BINARY_OP_LOCALS
63+
64+ GGML_ASSERT ( nb0 == sizeof (float ));
65+ GGML_ASSERT (nb00 == sizeof (float ));
66+
67+ const int dr = (nr + nth - 1 )/nth ;
68+ const int ir0 = dr * ith ;
69+ const int ir1 = MIN (ir0 + dr , nr );
70+ if (nb10 == sizeof (float )) {
71+ for (int ir = ir0 ; ir < ir1 ; ++ ir ) {
72+ // src1 is broadcastable across src0 and dst in i1, i2, i3
73+ const int32_t i03 = ir /(ne02 * ne01 );
74+ const int32_t i02 = (ir - i03 * ne02 * ne01 )/ne01 ;
75+ const int32_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01 );
76+
77+ const int32_t i13 = i03 % ne13 ;
78+ const int32_t i12 = i02 % ne12 ;
79+ const int32_t i11 = i01 % ne11 ;
80+ const int32_t nr0 = ne00 / ne10 ;
81+
82+ float * dst_ptr = (float * ) ((char * ) dst -> data + i03 * nb3 + i02 * nb2 + i01 * nb1 );
83+ float * src0_ptr = (float * ) ((char * ) src0 -> data + i03 * nb03 + i02 * nb02 + i01 * nb01 );
84+ float * src1_ptr = (float * ) ((char * ) src1 -> data + i13 * nb13 + i12 * nb12 + i11 * nb11 );
85+ for (int32_t r = 0 ; r < nr0 ; ++ r ) {
86+ ggmlhexagon_dsp_add_f32 (ne10 , dst_ptr + r * ne10 , src0_ptr + r * ne10 , src1_ptr );
87+ }
88+ }
89+ } else {
90+ // src1 is not contiguous
91+ for (int ir = ir0 ; ir < ir1 ; ++ ir ) {
92+ // src1 is broadcastable across src0 and dst in i1, i2, i3
93+ const int32_t i03 = ir /(ne02 * ne01 );
94+ const int32_t i02 = (ir - i03 * ne02 * ne01 )/ne01 ;
95+ const int32_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01 );
96+
97+ const int32_t i13 = i03 % ne13 ;
98+ const int32_t i12 = i02 % ne12 ;
99+ const int32_t i11 = i01 % ne11 ;
100+
101+ float * dst_ptr = (float * ) ((char * ) dst -> data + i03 * nb3 + i02 * nb2 + i01 * nb1 );
102+ float * src0_ptr = (float * ) ((char * ) src0 -> data + i03 * nb03 + i02 * nb02 + i01 * nb01 );
103+
104+ for (int32_t i0 = 0 ; i0 < ne0 ; ++ i0 ) {
105+ const int32_t i10 = i0 % ne10 ;
106+ float * src1_ptr = (float * ) ((char * ) src1 -> data + i13 * nb13 + i12 * nb12 + i11 * nb11 + i10 * nb10 );
107+
108+ dst_ptr [i0 ] = src0_ptr [i0 ] + * src1_ptr ;
109+ }
110+ }
111+ }
112+
113+ uint64_t end_time = ggml_time_us ();
114+ uint64_t duration = (end_time - start_time );
115+ GGMLHEXAGON_LOG_DEBUG ("duration %llu us" , duration );
116+ #if !GGMLHEXAGON_DEBUG
117+ UNUSED (duration );
118+ #endif
119+
120+ GGMLHEXAGON_LOG_DEBUG ("leave %s" , __func__ );
121+ }
122+
123+ //FIXME: failed with test-backend-ops when disable ion rpc mempool
124+ int ggmlop_dsp_add (remote_handle64 h , const ggml_tensor * src0 , const ggml_tensor * src1 , ggml_tensor * dst )
125+ {
126+ GGMLHEXAGON_LOG_DEBUG ("enter %s\n" , __func__ );
127+ switch (src0 -> type ) {
128+ case GGML_TYPE_F32 :
129+ {
130+ if (src1 -> type == GGML_TYPE_F32 ) {
131+ ggml_compute_forward_add_f32 (src0 , src1 , dst );
132+ } else {
133+ GGML_ABORT ("fatal error" );
134+ }
135+ break ;
136+ }
137+ default :
138+ {
139+ GGML_ABORT ("fatal error" );
140+ }
141+ }
142+ GGMLHEXAGON_LOG_DEBUG ("leave %s\n" , __func__ );
143+ return 0 ;
144+ }
0 commit comments