@@ -355,57 +355,27 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float s, cons
355355#if  defined(GGML_USE_ACCELERATE )
356356    vDSP_vsmsa (y , 1 , & s , & b , y , 1 , n );
357357#elif  defined(GGML_SIMD )
358-     #if  defined(__ARM_FEATURE_SVE )
359-         const  int  sve_register_length  =  ggml_cpu_get_sve_cnt () *  8 ;
360-         const  int  ggml_f32_epr  =  sve_register_length  / 32 ;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16 
361-         const  int  ggml_f32_step  =  2  *  ggml_f32_epr ;
362- 
363-         GGML_F32_VEC  vs  =  GGML_F32_VEC_SET1 (s );
364-         GGML_F32_VEC  vb  =  GGML_F32_VEC_SET1 (b );
365- 
366-         const  int  np  =  (n  &  ~(ggml_f32_step  -  1 ));
367-         svfloat32_t  ay1 ;
368-         svfloat32_t  ay2 ;
369-         for  (int  i  =  0 ; i  <  np ; i  +=  ggml_f32_step ) {
370-             ay1  =  GGML_F32_VEC_LOAD (y  +  i );
371-             ay1  =  GGML_F32_VEC_FMA (ay1 , vs , vb );
372-             GGML_F32_VEC_STORE (y  +  i , ay1 );
373- 
374-             ay2  =  GGML_F32_VEC_LOAD (y  +  i  +  1 * ggml_f32_epr );
375-             ay2  =  GGML_F32_VEC_FMA (ay2 , vs , vb );
376-             GGML_F32_VEC_STORE (y  +  i  +  1 * ggml_f32_epr , ay2 );
377-         }
378-         // leftovers 
379-         // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only 
380-         if  (np  <  n ) {
381-             svbool_t  pg  =  svwhilelt_b32 (np , n );
382-             ay1  =  svld1_f32 (pg , y  +  np );
383-             ay1  =  svmul_f32_m (pg , ay1 , vs );
384-             ay1  =  svadd_f32_m (pg , ay1 , vb );
385-             svst1_f32 (pg , y  +  np , ay1 );
386-         }
387-     #else 
388-         const  int  np  =  (n  &  ~(GGML_F32_STEP  -  1 ));
358+     // TODO: #if defined(__ARM_FEATURE_SVE) 
359+     const  int  np  =  (n  &  ~(GGML_F32_STEP  -  1 ));
389360
390-          GGML_F32_VEC  vs  =  GGML_F32_VEC_SET1 (s );
391-          GGML_F32_VEC  vb  =  GGML_F32_VEC_SET1 (b );
361+     GGML_F32_VEC  vs  =  GGML_F32_VEC_SET1 (s );
362+     GGML_F32_VEC  vb  =  GGML_F32_VEC_SET1 (b );
392363
393-          GGML_F32_VEC  ay [GGML_F32_ARR ];
364+     GGML_F32_VEC  ay [GGML_F32_ARR ];
394365
395-          for  (int  i  =  0 ; i  <  np ; i  +=  GGML_F32_STEP ) {
396-              for  (int  j  =  0 ; j  <  GGML_F32_ARR ; j ++ ) {
397-                  ay [j ] =  GGML_F32_VEC_LOAD (y  +  i  +  j * GGML_F32_EPR );
398-                  ay [j ] =  GGML_F32_VEC_FMA (ay [j ], vs , vb );
366+     for  (int  i  =  0 ; i  <  np ; i  +=  GGML_F32_STEP ) {
367+         for  (int  j  =  0 ; j  <  GGML_F32_ARR ; j ++ ) {
368+             ay [j ] =  GGML_F32_VEC_LOAD (y  +  i  +  j * GGML_F32_EPR );
369+             ay [j ] =  GGML_F32_VEC_FMA (ay [j ], vs , vb );
399370
400-                 GGML_F32_VEC_STORE (y  +  i  +  j * GGML_F32_EPR , ay [j ]);
401-             }
371+             GGML_F32_VEC_STORE (y  +  i  +  j * GGML_F32_EPR , ay [j ]);
402372        }
373+     }
403374
404-         // leftovers 
405-         for  (int  i  =  np ; i  <  n ; ++ i ) {
406-             y [i ] =  y [i ]* s  +  b ;
407-         }
408-     #endif 
375+     // leftovers 
376+     for  (int  i  =  np ; i  <  n ; ++ i ) {
377+         y [i ] =  y [i ]* s  +  b ;
378+     }
409379#else 
410380    // scalar 
411381    for  (int  i  =  0 ; i  <  n ; ++ i ) {
0 commit comments