@@ -25,67 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25
25
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
26
*****************************************************************************/
27
27
28
+ /*
29
+ * Avoid contraction of floating point operations, specifically fused
30
+ * multiply-add, because they can cause unexpected results in complex
31
+ * multiplication.
32
+ */
33
+ #if defined(__GNUC__ ) && !defined(__clang__ )
34
+ #pragma GCC optimize ("fp-contract=off")
35
+ #endif
36
+
37
+ #if defined(__clang__ )
38
+ #pragma clang fp contract(off)
39
+ #endif
40
+
28
41
#include "common.h"
42
+ #include "vector-common.h"
29
43
30
- static void cscal_kernel_16 (BLASLONG n , FLOAT * alpha , FLOAT * x ) {
31
- __asm__("vlrepf %%v0,0(%[alpha])\n\t"
32
- "vlef %%v1,4(%[alpha]),0\n\t"
33
- "vlef %%v1,4(%[alpha]),2\n\t"
34
- "vflcsb %%v1,%%v1\n\t"
35
- "vlef %%v1,4(%[alpha]),1\n\t"
36
- "vlef %%v1,4(%[alpha]),3\n\t"
37
- "srlg %[n],%[n],4\n\t"
38
- "xgr %%r1,%%r1\n\t"
39
- "0:\n\t"
40
- "pfd 2, 1024(%%r1,%[x])\n\t"
41
- "vl %%v16,0(%%r1,%[x])\n\t"
42
- "vl %%v17,16(%%r1,%[x])\n\t"
43
- "vl %%v18,32(%%r1,%[x])\n\t"
44
- "vl %%v19,48(%%r1,%[x])\n\t"
45
- "vl %%v20,64(%%r1,%[x])\n\t"
46
- "vl %%v21,80(%%r1,%[x])\n\t"
47
- "vl %%v22,96(%%r1,%[x])\n\t"
48
- "vl %%v23,112(%%r1,%[x])\n\t"
49
- "verllg %%v24,%%v16,32\n\t"
50
- "verllg %%v25,%%v17,32\n\t"
51
- "verllg %%v26,%%v18,32\n\t"
52
- "verllg %%v27,%%v19,32\n\t"
53
- "verllg %%v28,%%v20,32\n\t"
54
- "verllg %%v29,%%v21,32\n\t"
55
- "verllg %%v30,%%v22,32\n\t"
56
- "verllg %%v31,%%v23,32\n\t"
57
- "vfmsb %%v16,%%v16,%%v0\n\t"
58
- "vfmsb %%v17,%%v17,%%v0\n\t"
59
- "vfmsb %%v18,%%v18,%%v0\n\t"
60
- "vfmsb %%v19,%%v19,%%v0\n\t"
61
- "vfmsb %%v20,%%v20,%%v0\n\t"
62
- "vfmsb %%v21,%%v21,%%v0\n\t"
63
- "vfmsb %%v22,%%v22,%%v0\n\t"
64
- "vfmsb %%v23,%%v23,%%v0\n\t"
65
- "vfmasb %%v16,%%v24,%%v1,%%v16\n\t"
66
- "vfmasb %%v17,%%v25,%%v1,%%v17\n\t"
67
- "vfmasb %%v18,%%v26,%%v1,%%v18\n\t"
68
- "vfmasb %%v19,%%v27,%%v1,%%v19\n\t"
69
- "vfmasb %%v20,%%v28,%%v1,%%v20\n\t"
70
- "vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
71
- "vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
72
- "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
73
- "vst %%v16,0(%%r1,%[x])\n\t"
74
- "vst %%v17,16(%%r1,%[x])\n\t"
75
- "vst %%v18,32(%%r1,%[x])\n\t"
76
- "vst %%v19,48(%%r1,%[x])\n\t"
77
- "vst %%v20,64(%%r1,%[x])\n\t"
78
- "vst %%v21,80(%%r1,%[x])\n\t"
79
- "vst %%v22,96(%%r1,%[x])\n\t"
80
- "vst %%v23,112(%%r1,%[x])\n\t"
81
- "agfi %%r1,128\n\t"
82
- "brctg %[n],0b"
83
- : "+m" (* (FLOAT (* )[n * 2 ]) x ),[n ] "+&r" (n )
84
- : [x ] "a" (x ), "m" (* (const FLOAT (* )[2 ]) alpha ),
85
- [alpha ] "a" (alpha )
86
- : "cc" , "r1" , "v0" , "v1" , "v16" , "v17" , "v18" , "v19" , "v20" , "v21" ,
87
- "v22" , "v23" , "v24" , "v25" , "v26" , "v27" , "v28" , "v29" , "v30" ,
88
- "v31" );
44
+ static void cscal_kernel_16 (BLASLONG n , FLOAT da_r , FLOAT da_i , FLOAT * x ) {
45
+ vector_float da_r_vec = vec_splats (da_r );
46
+ vector_float da_i_vec = { - da_i , da_i , - da_i , da_i };
47
+
48
+ vector_float * x_vec_ptr = (vector_float * )x ;
49
+
50
+ #pragma GCC unroll 16
51
+ for (size_t i = 0 ; i < n /2 ; i ++ ) {
52
+ vector_float x_vec = vec_load_hinted (x + i * VLEN_FLOATS );
53
+ vector_float x_swapped = {x_vec [1 ], x_vec [0 ], x_vec [3 ], x_vec [2 ]};
54
+
55
+ x_vec_ptr [i ] = x_vec * da_r_vec + x_swapped * da_i_vec ;
56
+ }
89
57
}
90
58
91
59
static void cscal_kernel_16_zero_r (BLASLONG n , FLOAT * alpha , FLOAT * x ) {
@@ -199,14 +167,12 @@ static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
199
167
: "cc" , "r1" , "v0" );
200
168
}
201
169
202
- static void cscal_kernel_inc_8 (BLASLONG n , FLOAT * alpha , FLOAT * x ,
170
+ static void cscal_kernel_inc_8 (BLASLONG n , FLOAT da_r , FLOAT da_i , FLOAT * x ,
203
171
BLASLONG inc_x ) {
204
172
BLASLONG i ;
205
173
BLASLONG inc_x2 = 2 * inc_x ;
206
174
BLASLONG inc_x3 = inc_x2 + inc_x ;
207
175
FLOAT t0 , t1 , t2 , t3 ;
208
- FLOAT da_r = alpha [0 ];
209
- FLOAT da_i = alpha [1 ];
210
176
211
177
for (i = 0 ; i < n ; i += 4 ) {
212
178
t0 = da_r * x [0 ] - da_i * x [1 ];
@@ -324,9 +290,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
324
290
325
291
BLASLONG n1 = n & -8 ;
326
292
if (n1 > 0 ) {
327
- alpha [0 ] = da_r ;
328
- alpha [1 ] = da_i ;
329
- cscal_kernel_inc_8 (n1 , alpha , x , inc_x );
293
+ cscal_kernel_inc_8 (n1 , da_r , da_i , x , inc_x );
330
294
j = n1 ;
331
295
i = n1 * inc_x ;
332
296
}
@@ -362,7 +326,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
362
326
else if (da_i == 0 )
363
327
cscal_kernel_16_zero_i (n1 , alpha , x );
364
328
else
365
- cscal_kernel_16 (n1 , alpha , x );
329
+ cscal_kernel_16 (n1 , da_r , da_i , x );
366
330
367
331
i = n1 << 1 ;
368
332
j = n1 ;
0 commit comments