@@ -29,23 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
29
* trivial copy of asum.c with the ABS() removed *
30
30
**************************************************************************************/
31
31
32
-
33
32
#include "common.h"
33
+ #include "../simd/intrin.h"
34
34
#include <math.h>
35
35
36
36
/*
 * Plain sum of the elements of a strided vector (same shape as the asum
 * kernel, but without taking absolute values).
 *
 *   n      number of elements to add up
 *   x      pointer to the first element
 *   inc_x  distance, in elements, between consecutive entries of x
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0; otherwise the sum of
 * x[0], x[inc_x], ..., x[(n - 1) * inc_x].
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;
    FLOAT sumf = 0.0;

    if (n <= 0 || inc_x <= 0)
        return (sumf);

    /* From here on n is the exclusive upper bound on the index into x. */
    n *= inc_x;

    if (inc_x == 1)
    {
#if V_SIMD
        /*
         * NOTE(review): this path reads x through the f32 intrinsics, so it
         * presumably assumes FLOAT is single precision -- confirm that
         * double-precision builds do not compile this branch.
         */
        /* Bounds are BLASLONG so they are never narrower than n and i. */
        const BLASLONG vstep = v_nlanes_f32;
        const BLASLONG unrollx4 = n & (-vstep * 4); /* multiple of 4 vectors */
        const BLASLONG unrollx = n & -vstep;        /* multiple of 1 vector  */
        v_f32 vsum0 = v_zero_f32();
        v_f32 vsum1 = v_zero_f32();
        v_f32 vsum2 = v_zero_f32();
        v_f32 vsum3 = v_zero_f32();

        /*
         * Four independent accumulators per iteration.  Every load must be
         * offset by i; without it the loop would re-read the first four
         * vectors of x on each pass.
         */
        while (i < unrollx4)
        {
            vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
            vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep));
            vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2));
            vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3));
            i += vstep * 4;
        }
        vsum0 = v_add_f32(
            v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3));

        /* Remaining whole vectors, one at a time. */
        while (i < unrollx)
        {
            vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
            i += vstep;
        }
        sumf = v_sum_f32(vsum0);
#else
        /* Scalar fallback: consume four elements per iteration. */
        BLASLONG n1 = n & -4;
        for (; i < n1; i += 4)
        {
            sumf += x[i] + x[i + 1] + x[i + 2] + x[i + 3];
        }
#endif
    }

    /* Tail of the contiguous case, or the whole vector when inc_x != 1. */
    while (i < n)
    {
        sumf += x[i];
        i += inc_x;
    }
    return (sumf);
}
50
-
51
-
0 commit comments