@@ -42,24 +42,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
42
42
n *= inc_x ;
43
43
if (inc_x == 1 )
44
44
{
45
- #if V_SIMD
45
+ #if V_SIMD && (!defined( DOUBLE ) || (defined( DOUBLE ) && V_SIMD_F64 && V_SIMD > 128 ))
46
46
#ifdef DOUBLE
47
47
const int vstep = v_nlanes_f64 ;
48
- const int unrollx2 = n & (- vstep * 2 );
48
+ const int unrollx4 = n & (- vstep * 4 );
49
49
const int unrollx = n & - vstep ;
50
50
v_f64 vsum0 = v_zero_f64 ();
51
51
v_f64 vsum1 = v_zero_f64 ();
52
- while (i < unrollx2 )
52
+ v_f64 vsum2 = v_zero_f64 ();
53
+ v_f64 vsum3 = v_zero_f64 ();
54
+ for (; i < unrollx4 ; i += vstep * 4 )
53
55
{
54
- vsum0 = v_add_f64 (vsum0 , v_loadu_f64 (x ));
55
- vsum1 = v_add_f64 (vsum1 , v_loadu_f64 (x + vstep ));
56
- i += vstep * 2 ;
56
+ vsum0 = v_add_f64 (vsum0 , v_loadu_f64 (x + i ));
57
+ vsum1 = v_add_f64 (vsum1 , v_loadu_f64 (x + i + vstep ));
58
+ vsum2 = v_add_f64 (vsum2 , v_loadu_f64 (x + i + vstep * 2 ));
59
+ vsum3 = v_add_f64 (vsum3 , v_loadu_f64 (x + i + vstep * 3 ));
57
60
}
58
- vsum0 = v_add_f64 (vsum0 , vsum1 );
59
- while (i < unrollx )
61
+ vsum0 = v_add_f64 (
62
+ v_add_f64 (vsum0 , vsum1 ), v_add_f64 (vsum2 , vsum3 ));
63
+ for (; i < unrollx ; i += vstep )
60
64
{
61
65
vsum0 = v_add_f64 (vsum0 , v_loadu_f64 (x + i ));
62
- i += vstep ;
63
66
}
64
67
sumf = v_sum_f64 (vsum0 );
65
68
#else
@@ -70,20 +73,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
70
73
v_f32 vsum1 = v_zero_f32 ();
71
74
v_f32 vsum2 = v_zero_f32 ();
72
75
v_f32 vsum3 = v_zero_f32 ();
73
- while ( i < unrollx4 )
76
+ for (; i < unrollx4 ; i += vstep * 4 )
74
77
{
75
- vsum0 = v_add_f32 (vsum0 , v_loadu_f32 (x ));
76
- vsum1 = v_add_f32 (vsum1 , v_loadu_f32 (x + vstep ));
77
- vsum2 = v_add_f32 (vsum2 , v_loadu_f32 (x + vstep * 2 ));
78
- vsum3 = v_add_f32 (vsum3 , v_loadu_f32 (x + vstep * 3 ));
79
- i += vstep * 4 ;
78
+ vsum0 = v_add_f32 (vsum0 , v_loadu_f32 (x + i ));
79
+ vsum1 = v_add_f32 (vsum1 , v_loadu_f32 (x + i + vstep ));
80
+ vsum2 = v_add_f32 (vsum2 , v_loadu_f32 (x + i + vstep * 2 ));
81
+ vsum3 = v_add_f32 (vsum3 , v_loadu_f32 (x + i + vstep * 3 ));
80
82
}
81
83
vsum0 = v_add_f32 (
82
84
v_add_f32 (vsum0 , vsum1 ), v_add_f32 (vsum2 , vsum3 ));
83
- while ( i < unrollx )
85
+ for (; i < unrollx ; i += vstep )
84
86
{
85
87
vsum0 = v_add_f32 (vsum0 , v_loadu_f32 (x + i ));
86
- i += vstep ;
87
88
}
88
89
sumf = v_sum_f32 (vsum0 );
89
90
#endif
0 commit comments