@@ -355,57 +355,27 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float s, cons
355355#if defined(GGML_USE_ACCELERATE )
356356 vDSP_vsmsa (y , 1 , & s , & b , y , 1 , n );
357357#elif defined(GGML_SIMD )
358- #if defined(__ARM_FEATURE_SVE )
359- const int sve_register_length = ggml_cpu_get_sve_cnt () * 8 ;
360- const int ggml_f32_epr = sve_register_length / 32 ;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
361- const int ggml_f32_step = 2 * ggml_f32_epr ;
362-
363- GGML_F32_VEC vs = GGML_F32_VEC_SET1 (s );
364- GGML_F32_VEC vb = GGML_F32_VEC_SET1 (b );
365-
366- const int np = (n & ~(ggml_f32_step - 1 ));
367- svfloat32_t ay1 ;
368- svfloat32_t ay2 ;
369- for (int i = 0 ; i < np ; i += ggml_f32_step ) {
370- ay1 = GGML_F32_VEC_LOAD (y + i );
371- ay1 = GGML_F32_VEC_FMA (ay1 , vs , vb );
372- GGML_F32_VEC_STORE (y + i , ay1 );
373-
374- ay2 = GGML_F32_VEC_LOAD (y + i + 1 * ggml_f32_epr );
375- ay2 = GGML_F32_VEC_FMA (ay2 , vs , vb );
376- GGML_F32_VEC_STORE (y + i + 1 * ggml_f32_epr , ay2 );
377- }
378- // leftovers
379- // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
380- if (np < n ) {
381- svbool_t pg = svwhilelt_b32 (np , n );
382- ay1 = svld1_f32 (pg , y + np );
383- ay1 = svmul_f32_m (pg , ay1 , vs );
384- ay1 = svadd_f32_m (pg , ay1 , vb );
385- svst1_f32 (pg , y + np , ay1 );
386- }
387- #else
388- const int np = (n & ~(GGML_F32_STEP - 1 ));
358+ // TODO: #if defined(__ARM_FEATURE_SVE)
359+ const int np = (n & ~(GGML_F32_STEP - 1 ));
389360
390- GGML_F32_VEC vs = GGML_F32_VEC_SET1 (s );
391- GGML_F32_VEC vb = GGML_F32_VEC_SET1 (b );
361+ GGML_F32_VEC vs = GGML_F32_VEC_SET1 (s );
362+ GGML_F32_VEC vb = GGML_F32_VEC_SET1 (b );
392363
393- GGML_F32_VEC ay [GGML_F32_ARR ];
364+ GGML_F32_VEC ay [GGML_F32_ARR ];
394365
395- for (int i = 0 ; i < np ; i += GGML_F32_STEP ) {
396- for (int j = 0 ; j < GGML_F32_ARR ; j ++ ) {
397- ay [j ] = GGML_F32_VEC_LOAD (y + i + j * GGML_F32_EPR );
398- ay [j ] = GGML_F32_VEC_FMA (ay [j ], vs , vb );
366+ for (int i = 0 ; i < np ; i += GGML_F32_STEP ) {
367+ for (int j = 0 ; j < GGML_F32_ARR ; j ++ ) {
368+ ay [j ] = GGML_F32_VEC_LOAD (y + i + j * GGML_F32_EPR );
369+ ay [j ] = GGML_F32_VEC_FMA (ay [j ], vs , vb );
399370
400- GGML_F32_VEC_STORE (y + i + j * GGML_F32_EPR , ay [j ]);
401- }
371+ GGML_F32_VEC_STORE (y + i + j * GGML_F32_EPR , ay [j ]);
402372 }
373+ }
403374
404- // leftovers
405- for (int i = np ; i < n ; ++ i ) {
406- y [i ] = y [i ]* s + b ;
407- }
408- #endif
375+ // leftovers
376+ for (int i = np ; i < n ; ++ i ) {
377+ y [i ] = y [i ]* s + b ;
378+ }
409379#else
410380 // scalar
411381 for (int i = 0 ; i < n ; ++ i ) {
0 commit comments