@@ -119,111 +119,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
119119 }
120120
121121#if defined(GGML_SIMD)
122-    #if defined(__ARM_FEATURE_SVE)
123-
124-        const int sve_register_length = svcntb() * 8;
125-        const int ggml_f16_epr = sve_register_length / 16; // running when 16
126-        const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
127-
128-        const int np = (n & ~(ggml_f16_step - 1));
129-
130-        svfloat16_t sum_00 = svdup_n_f16(0.0f);
131-        svfloat16_t sum_01 = svdup_n_f16(0.0f);
132-        svfloat16_t sum_02 = svdup_n_f16(0.0f);
133-        svfloat16_t sum_03 = svdup_n_f16(0.0f);
134-
135-        svfloat16_t sum_10 = svdup_n_f16(0.0f);
136-        svfloat16_t sum_11 = svdup_n_f16(0.0f);
137-        svfloat16_t sum_12 = svdup_n_f16(0.0f);
138-        svfloat16_t sum_13 = svdup_n_f16(0.0f);
139-
140-        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
141-        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
142-
143-        for (int i = 0; i < np; i += ggml_f16_step) {
144-            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
145-
146-            ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0 * ggml_f16_epr, 0); // 8 elements
147-            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00 + ax1*ay1
148-            ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0 * ggml_f16_epr, 0); // 8 elements
149-            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
150-
151-            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
152-
153-            ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1 * ggml_f16_epr, 1); // next 8 elements
154-            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
155-            ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1 * ggml_f16_epr, 1);
156-            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
157-
158-            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
159-
160-            ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2 * ggml_f16_epr, 2);
161-            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
162-            ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2 * ggml_f16_epr, 2);
163-            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
164-
165-            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
166-
167-            ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3 * ggml_f16_epr, 3);
168-            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
169-            ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3 * ggml_f16_epr, 3);
170-            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
171-
172-            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
173-
174-            ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4 * ggml_f16_epr, 4);
175-
176-            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
177-            ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4 * ggml_f16_epr, 4);
178-            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
179-
180-            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
181-
182-            ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5 * ggml_f16_epr, 5);
183-
184-            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
185-            ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5 * ggml_f16_epr, 5);
186-            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
187-
188-            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
189-
190-            ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6 * ggml_f16_epr, 6);
191-
192-            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
193-            ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6 * ggml_f16_epr, 6);
194-            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
195-
196-            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
197-
198-            ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7 * ggml_f16_epr, 7);
199-
200-            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
201-            ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7 * ggml_f16_epr, 7);
202-            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
203-        }
204-
205-        const int np2 = (n & ~(ggml_f16_epr - 1));
206-        for (int k = np; k < np2; k += ggml_f16_epr) {
207-            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
208-
209-            svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
210-            sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
211-            rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
212-            sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
213-        }
214-
215-        if (np2 < n) {
216-            svbool_t pg = svwhilelt_b16(np2, n);
217-            svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
218-            svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
219-            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
220-
221-            sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
222-            sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
223-        }
224-        GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
225-        GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
226-    #elif defined(__riscv_v_intrinsic)
122+    #if defined(__riscv_v_intrinsic)
227123        // todo: RVV impl
228124        for (int i = 0; i < n; ++i) {
229125            for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
@@ -277,86 +173,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
277173
278174inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
279175#if defined(GGML_SIMD)
280-    #if defined(__ARM_FEATURE_SVE)
281-
282-        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
283-        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
284-        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
285-        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
286-
287-        const int np = (n & ~(ggml_f32_step - 1));
288-        svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
289-        svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
290-        for (int i = 0; i < np; i += ggml_f32_step) {
291-
292-            ax1 = GGML_F32_VEC_LOAD(x + i);
293-            ay1 = GGML_F32_VEC_LOAD(y + i);
294-            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
295-
296-            GGML_F32_VEC_STORE(y + i, ay1);
297-
298-            ax2 = GGML_F32_VEC_LOAD(x + i + 1 * ggml_f32_epr);
299-            ay2 = GGML_F32_VEC_LOAD(y + i + 1 * ggml_f32_epr);
300-            ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
301-
302-            GGML_F32_VEC_STORE(y + i + 1 * ggml_f32_epr, ay2);
303-
304-            ax3 = GGML_F32_VEC_LOAD(x + i + 2 * ggml_f32_epr);
305-            ay3 = GGML_F32_VEC_LOAD(y + i + 2 * ggml_f32_epr);
306-            ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
307-
308-            GGML_F32_VEC_STORE(y + i + 2 * ggml_f32_epr, ay3);
309-
310-            ax4 = GGML_F32_VEC_LOAD(x + i + 3 * ggml_f32_epr);
311-            ay4 = GGML_F32_VEC_LOAD(y + i + 3 * ggml_f32_epr);
312-            ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
313-
314-            GGML_F32_VEC_STORE(y + i + 3 * ggml_f32_epr, ay4);
315-
316-            ax5 = GGML_F32_VEC_LOAD(x + i + 4 * ggml_f32_epr);
317-            ay5 = GGML_F32_VEC_LOAD(y + i + 4 * ggml_f32_epr);
318-            ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
319-
320-            GGML_F32_VEC_STORE(y + i + 4 * ggml_f32_epr, ay5);
321-
322-            ax6 = GGML_F32_VEC_LOAD(x + i + 5 * ggml_f32_epr);
323-            ay6 = GGML_F32_VEC_LOAD(y + i + 5 * ggml_f32_epr);
324-            ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
325-
326-            GGML_F32_VEC_STORE(y + i + 5 * ggml_f32_epr, ay6);
327-
328-            ax7 = GGML_F32_VEC_LOAD(x + i + 6 * ggml_f32_epr);
329-            ay7 = GGML_F32_VEC_LOAD(y + i + 6 * ggml_f32_epr);
330-            ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
331-
332-            GGML_F32_VEC_STORE(y + i + 6 * ggml_f32_epr, ay7);
333-
334-            ax8 = GGML_F32_VEC_LOAD(x + i + 7 * ggml_f32_epr);
335-            ay8 = GGML_F32_VEC_LOAD(y + i + 7 * ggml_f32_epr);
336-            ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
337-
338-            GGML_F32_VEC_STORE(y + i + 7 * ggml_f32_epr, ay8);
339-        }
340-        // leftovers
341-        // Since the loop above unrolls by 8, the leftovers lie in the range [0, ggml_f32_step) and are handled by the loop below
342-        const int np2 = (n & ~(ggml_f32_epr - 1));
343-        for (int i = np; i < np2; i += ggml_f32_epr) {
344-            ax1 = GGML_F32_VEC_LOAD(x + i);
345-            ay1 = GGML_F32_VEC_LOAD(y + i);
346-            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
347-
348-            GGML_F32_VEC_STORE(y + i, ay1);
349-        }
350-        // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
351-        if (np2 < n) {
352-            svbool_t pg = svwhilelt_b32(np2, n);
353-            ax1 = svld1_f32(pg, x + np2);
354-            ay1 = svld1_f32(pg, y + np2);
355-            ay1 = svmad_f32_m(pg, ax1, vx, ay1);
356-
357-            svst1_f32(pg, y + np2, ay1);
358-        }
359-    #elif defined(__riscv_v_intrinsic)
176+    #if defined(__riscv_v_intrinsic)
360177        for (int i = 0, avl; i < n; i += avl) {
361178            avl = __riscv_vsetvl_e32m8(n - i);
362179            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
@@ -397,84 +214,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
397214
398215inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
399216#if defined(GGML_SIMD)
400-    #if defined(__ARM_FEATURE_SVE)
401-        const int sve_register_length = svcntb() * 8;
402-        const int ggml_f16_epr = sve_register_length / 16;
403-        const int ggml_f16_step = 8 * ggml_f16_epr;
404-
405-        GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
406-
407-        const int np = (n & ~(ggml_f16_step - 1));
408-
409-        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
410-        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
411-        for (int i = 0; i < np; i += ggml_f16_step) {
412-            ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
413-            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
414-            ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
415-
416-            GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
417-
418-            ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
419-            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
420-            ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
421-
422-            GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
423-
424-            ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
425-            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
426-            ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
427-
428-            GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
429-
430-            ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
431-            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
432-            ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
433-
434-            GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
435-
436-            ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
437-            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
438-            ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
439-
440-            GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
441-
442-            ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
443-            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
444-            ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
445-
446-            GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
447-
448-            ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
449-            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
450-            ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
451-
452-            GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
453-
454-            ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
455-            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
456-            ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
457-
458-            GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
459-        }
460-        const int np2 = (n & ~(ggml_f16_epr - 1));
461-        for (int k = np; k < np2; k += ggml_f16_epr) {
462-            svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
463-            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
464-            ry = GGML_F16x_VEC_FMA(ry, rx, vx);
465-
466-            GGML_F16x_VEC_STORE(y + k, ry, 0);
467-        }
468-
469-        if (np2 < n) {
470-            svbool_t pg = svwhilelt_b16(np2, n);
471-            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
472-            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
473-            hy = svmad_f16_x(pg, hx, vx, hy);
474-            svst1_f16(pg, (__fp16 *)(y + np2), hy);
475-        }
476-
477-    #elif defined(__riscv_v_intrinsic)
217+    #if defined(__riscv_v_intrinsic)
478218        // todo: RVV impl
479219        // scalar
480220        for (int i = 0; i < n; ++i) {
@@ -523,14 +263,7 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
523263 }
524264
525265#if defined(GGML_SIMD)
526-    #if defined(__ARM_FEATURE_SVE)
527-        // scalar: route to the scalar implementation // TODO: write SVE code
528-        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
529-            for (int i = 0; i < n; ++i) {
530-                y[i] += x[k][i]*v[k][0];
531-            }
532-        }
533-    #elif defined(__riscv_v_intrinsic)
266+    #if defined(__riscv_v_intrinsic)
534267        for (int i = 0, avl; i < n; i += avl) {
535268            avl = __riscv_vsetvl_e32m8(n - i);
536269            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
@@ -586,12 +319,7 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
586319#if defined(GGML_USE_ACCELERATE)
587320    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
588321#elif defined(GGML_SIMD)
589-    #if defined(__ARM_FEATURE_SVE)
590-        // scalar; TODO: Write SVE code
591-        for (int i = 0; i < n; ++i) {
592-            y[i] = x[i]*s + b;
593-        }
594-    #elif defined(__riscv_v_intrinsic)
322+    #if defined(__riscv_v_intrinsic)
595323        for (int i = 0, avl; i < n; i += avl) {
596324            avl = __riscv_vsetvl_e32m8(n - i);
597325            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
@@ -634,33 +362,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
634362#if defined(GGML_USE_ACCELERATE)
635363    vDSP_vsmul(y, 1, &v, y, 1, n);
636364#elif defined(GGML_SIMD)
637-    #if defined(__ARM_FEATURE_SVE)
638-        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
639-        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
640-        const int ggml_f32_step = 2 * ggml_f32_epr;
641-
642-        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
643-        const int np = (n & ~(ggml_f32_step - 1));
644-        svfloat32_t ay1;
645-        svfloat32_t ay2;
646-        for (int i = 0; i < np; i += ggml_f32_step) {
647-            ay1 = GGML_F32_VEC_LOAD(y + i);
648-            ay1 = GGML_F32_VEC_MUL(ay1, vx);
649-            GGML_F32_VEC_STORE(y + i, ay1);
650-
651-            ay2 = GGML_F32_VEC_LOAD(y + i + 1 * ggml_f32_epr);
652-            ay2 = GGML_F32_VEC_MUL(ay2, vx);
653-            GGML_F32_VEC_STORE(y + i + 1 * ggml_f32_epr, ay2);
654-        }
655-        // leftovers
656-        // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmul on available elements only
657-        if (np < n) {
658-            svbool_t pg = svwhilelt_b32(np, n);
659-            ay1 = svld1_f32(pg, y + np);
660-            ay1 = svmul_f32_m(pg, ay1, vx);
661-            svst1_f32(pg, y + np, ay1);
662-        }
663-    #elif defined(__riscv_v_intrinsic)
365+    #if defined(__riscv_v_intrinsic)
664366        for (int i = 0, avl; i < n; i += avl) {
665367            avl = __riscv_vsetvl_e32m8(n - i);
666368            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
@@ -698,33 +400,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
698400
699401inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
700402#if defined(GGML_SIMD)
701-    #if defined(__ARM_FEATURE_SVE)
702-        const int sve_register_length = svcntb() * 8;
703-        const int ggml_f16_epr = sve_register_length / 16;
704-        const int ggml_f16_step = 2 * ggml_f16_epr;
705-
706-        GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
707-        const int np = (n & ~(ggml_f16_step - 1));
708-        svfloat16_t ay1, ay2;
709-
710-        for (int i = 0; i < np; i += ggml_f16_step) {
711-            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
712-            ay1 = GGML_F16x_VEC_MUL(ay1, vx);
713-            GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
714-
715-            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
716-            ay2 = GGML_F16x_VEC_MUL(ay2, vx);
717-            GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
718-        }
719-        // leftovers
720-        // maximum number of leftover elements will be less than ggml_f16_epr. Apply predicated svmul on available elements only
721-        if (np < n) {
722-            svbool_t pg = svwhilelt_b16(np, n);
723-            svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
724-            svfloat16_t out = svmul_f16_m(pg, hy, vx);
725-            svst1_f16(pg, (__fp16 *)(y + np), out);
726-        }
727-    #elif defined(__riscv_v_intrinsic)
403+    #if defined(__riscv_v_intrinsic)
728404        // todo: RVV impl
729405        // scalar
730406        for (int i = 0; i < n; ++i) {