@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
34
**************************************************************************************/
35
35
36
36
#define HAVE_KERNEL_16 1
37
- static void casum_kernel_16 ( BLASLONG n , FLOAT * x , FLOAT * svec ) __attribute__ ((noinline ));
38
37
39
- static void casum_kernel_16 ( BLASLONG n , FLOAT * x , FLOAT * svec )
38
+ static float casum_kernel_16 ( long n , float * x )
40
39
{
41
-
42
-
43
- BLASLONG i = n ;
44
- BLASLONG o16 = 16 ;
45
- BLASLONG o32 = 32 ;
46
- BLASLONG o48 = 48 ;
47
- BLASLONG o64 = 64 ;
48
- BLASLONG o80 = 80 ;
49
- BLASLONG o96 = 96 ;
50
- BLASLONG o112 = 112 ;
51
- FLOAT * x1 = x ;
52
- BLASLONG pre = 384 ;
53
-
54
- __asm__ __volatile__
55
- (
56
-
57
- "dcbt %2 , %4 \n\t"
58
-
59
- "xxlxor 32,32,32 \n\t"
60
- "xxlxor 33,33,33 \n\t"
61
- "xxlxor 34,34,34 \n\t"
62
- "xxlxor 35,35,35 \n\t"
63
- "xxlxor 36,36,36 \n\t"
64
- "xxlxor 37,37,37 \n\t"
65
- "xxlxor 38,38,38 \n\t"
66
- "xxlxor 39,39,39 \n\t"
67
-
68
- "lxvw4x 40, 0, %2 \n\t"
69
- "lxvw4x 41, %5, %2 \n\t"
70
- "lxvw4x 42, %6, %2 \n\t"
71
- "lxvw4x 43, %7, %2 \n\t"
72
- "lxvw4x 44, %8, %2 \n\t"
73
- "lxvw4x 45, %9, %2 \n\t"
74
- "lxvw4x 46, %10, %2 \n\t"
75
- "lxvw4x 47, %11, %2 \n\t"
76
-
77
- "addi %2, %2, 128 \n\t"
78
-
79
- "addic. %0 , %0 , -16 \n\t"
80
- "ble 2f \n\t"
81
-
82
- ".align 5 \n\t"
83
- "1: \n\t"
84
-
85
- "dcbt %2 , %4 \n\t"
86
-
87
- "xvabssp 48, 40 \n\t"
88
- "xvabssp 49, 41 \n\t"
89
- "xvabssp 50, 42 \n\t"
90
- "xvabssp 51, 43 \n\t"
91
-
92
- "lxvw4x 40, 0, %2 \n\t"
93
- "lxvw4x 41, %5, %2 \n\t"
94
-
95
- "xvabssp 52, 44 \n\t"
96
- "xvabssp 53, 45 \n\t"
97
-
98
- "lxvw4x 42, %6, %2 \n\t"
99
- "lxvw4x 43, %7, %2 \n\t"
100
-
101
- "xvabssp 54, 46 \n\t"
102
- "xvabssp 55, 47 \n\t"
103
-
104
- "lxvw4x 44, %8, %2 \n\t"
105
- "lxvw4x 45, %9, %2 \n\t"
106
-
107
- "xvaddsp 32, 32, 48 \n\t"
108
- "xvaddsp 33, 33, 49 \n\t"
109
-
110
- "lxvw4x 46, %10, %2 \n\t"
111
- "lxvw4x 47, %11, %2 \n\t"
112
-
113
- "xvaddsp 34, 34, 50 \n\t"
114
- "xvaddsp 35, 35, 51 \n\t"
115
- "addi %2, %2, 128 \n\t"
116
- "xvaddsp 36, 36, 52 \n\t"
117
- "xvaddsp 37, 37, 53 \n\t"
118
- "addic. %0 , %0 , -16 \n\t"
119
- "xvaddsp 38, 38, 54 \n\t"
120
- "xvaddsp 39, 39, 55 \n\t"
121
-
122
- "bgt 1b \n\t"
123
-
124
- "2: \n\t"
125
-
126
-
127
- "xvabssp 48, 40 \n\t"
128
- "xvabssp 49, 41 \n\t"
129
- "xvabssp 50, 42 \n\t"
130
- "xvabssp 51, 43 \n\t"
131
- "xvabssp 52, 44 \n\t"
132
- "xvabssp 53, 45 \n\t"
133
- "xvabssp 54, 46 \n\t"
134
- "xvabssp 55, 47 \n\t"
135
-
136
- "xvaddsp 32, 32, 48 \n\t"
137
- "xvaddsp 33, 33, 49 \n\t"
138
- "xvaddsp 34, 34, 50 \n\t"
139
- "xvaddsp 35, 35, 51 \n\t"
140
- "xvaddsp 36, 36, 52 \n\t"
141
- "xvaddsp 37, 37, 53 \n\t"
142
- "xvaddsp 38, 38, 54 \n\t"
143
- "xvaddsp 39, 39, 55 \n\t"
144
-
145
- "xvaddsp 32, 32, 33 \n\t"
146
- "xvaddsp 34, 34, 35 \n\t"
147
- "xvaddsp 36, 36, 37 \n\t"
148
- "xvaddsp 38, 38, 39 \n\t"
149
-
150
- "xvaddsp 32, 32, 34 \n\t"
151
- "xvaddsp 36, 36, 38 \n\t"
152
-
153
- "xvaddsp 32, 32, 36 \n\t"
154
-
155
-
156
- "stxvw4x 32, 0, %3 \n\t"
157
-
158
- :
159
- :
160
- "r" (i ), // 0
161
- "r" (n ), // 1
162
- "r" (x1 ), // 2
163
- "r" (svec ), // 3
164
- "r" (pre ), // 4
165
- "r" (o16 ), // 5
166
- "r" (o32 ), // 6
167
- "r" (o48 ), // 7
168
- "r" (o64 ), // 8
169
- "r" (o80 ), // 9
170
- "r" (o96 ), // 10
171
- "r" (o112 ) // 11
172
- : "cr0" , "%0" , "%2" , "memory"
173
- );
174
-
175
- }
176
-
177
-
40
+ float sum ;
41
+ __vector float t0 ;
42
+ __vector float t1 ;
43
+ __vector float t2 ;
44
+ __vector float t3 ;
45
+
46
+ __asm__
47
+ (
48
+ "dcbt 0, %2 \n\t"
49
+
50
+ "xxlxor 32, 32, 32 \n\t"
51
+ "xxlxor 33, 33, 33 \n\t"
52
+ "xxlxor 34, 34, 34 \n\t"
53
+ "xxlxor 35, 35, 35 \n\t"
54
+ "xxlxor 36, 36, 36 \n\t"
55
+ "xxlxor 37, 37, 37 \n\t"
56
+ "xxlxor 38, 38, 38 \n\t"
57
+ "xxlxor 39, 39, 39 \n\t"
58
+
59
+ "lxvw4x 40, 0, %2 \n\t"
60
+ "lxvw4x 41, %8, %2 \n\t"
61
+ "lxvw4x 42, %9, %2 \n\t"
62
+ "lxvw4x 43, %10, %2 \n\t"
63
+ "lxvw4x 44, %11, %2 \n\t"
64
+ "lxvw4x 45, %12, %2 \n\t"
65
+ "lxvw4x 46, %13, %2 \n\t"
66
+ "lxvw4x 47, %14, %2 \n\t"
67
+
68
+ "addi %2, %2, 128 \n\t"
69
+
70
+ "addic. %1, %1, -16 \n\t"
71
+ "ble 2f \n\t"
72
+
73
+ ".p2align 5 \n"
74
+ "1: \n\t"
75
+
76
+ "xvabssp 48, 40 \n\t"
77
+ "xvabssp 49, 41 \n\t"
78
+ "xvabssp 50, 42 \n\t"
79
+ "xvabssp 51, 43 \n\t"
80
+
81
+ "lxvw4x 40, 0, %2 \n\t"
82
+ "lxvw4x 41, %8, %2 \n\t"
83
+
84
+ "xvabssp %x3, 44 \n\t"
85
+ "xvabssp %x4, 45 \n\t"
86
+
87
+ "lxvw4x 42, %9, %2 \n\t"
88
+ "lxvw4x 43, %10, %2 \n\t"
89
+
90
+ "xvabssp %x5, 46 \n\t"
91
+ "xvabssp %x6, 47 \n\t"
92
+
93
+ "lxvw4x 44, %11, %2 \n\t"
94
+ "lxvw4x 45, %12, %2 \n\t"
95
+
96
+ "xvaddsp 32, 32, 48 \n\t"
97
+ "xvaddsp 33, 33, 49 \n\t"
98
+
99
+ "lxvw4x 46, %13, %2 \n\t"
100
+ "lxvw4x 47, %14, %2 \n\t"
101
+
102
+ "xvaddsp 34, 34, 50 \n\t"
103
+ "xvaddsp 35, 35, 51 \n\t"
104
+ "addi %2, %2, 128 \n\t"
105
+ "xvaddsp 36, 36, %x3 \n\t"
106
+ "xvaddsp 37, 37, %x4 \n\t"
107
+ "addic. %1, %1, -16 \n\t"
108
+ "xvaddsp 38, 38, %x5 \n\t"
109
+ "xvaddsp 39, 39, %x6 \n\t"
110
+
111
+ "bgt 1b \n"
112
+
113
+ "2: \n\t"
114
+
115
+ "xvabssp 48, 40 \n\t"
116
+ "xvabssp 49, 41 \n\t"
117
+ "xvabssp 50, 42 \n\t"
118
+ "xvabssp 51, 43 \n\t"
119
+ "xvabssp %x3, 44 \n\t"
120
+ "xvabssp %x4, 45 \n\t"
121
+ "xvabssp %x5, 46 \n\t"
122
+ "xvabssp %x6, 47 \n\t"
123
+
124
+ "xvaddsp 32, 32, 48 \n\t"
125
+ "xvaddsp 33, 33, 49 \n\t"
126
+ "xvaddsp 34, 34, 50 \n\t"
127
+ "xvaddsp 35, 35, 51 \n\t"
128
+ "xvaddsp 36, 36, %x3 \n\t"
129
+ "xvaddsp 37, 37, %x4 \n\t"
130
+ "xvaddsp 38, 38, %x5 \n\t"
131
+ "xvaddsp 39, 39, %x6 \n\t"
132
+
133
+ "xvaddsp 32, 32, 33 \n\t"
134
+ "xvaddsp 34, 34, 35 \n\t"
135
+ "xvaddsp 36, 36, 37 \n\t"
136
+ "xvaddsp 38, 38, 39 \n\t"
137
+
138
+ "xvaddsp 32, 32, 34 \n\t"
139
+ "xvaddsp 36, 36, 38 \n\t"
140
+
141
+ "xvaddsp 32, 32, 36 \n\t"
142
+
143
+ "xxsldwi 33, 32, 32, 2 \n\t"
144
+ "xvaddsp 32, 32, 33 \n\t"
145
+
146
+ "xxsldwi 33, 32, 32, 1 \n\t"
147
+ "xvaddsp 32, 32, 33 \n\t"
148
+
149
+ "xscvspdp %0, 32 \n"
150
+
151
+ "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
152
+ "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
153
+ :
154
+ "=f" (sum ), // 0
155
+ "+r" (n ), // 1
156
+ "+b" (x ), // 2
157
+ "=wa" (t0 ), // 3
158
+ "=wa" (t1 ), // 4
159
+ "=wa" (t2 ), // 5
160
+ "=wa" (t3 ) // 6
161
+ :
162
+ "m" (* x ),
163
+ "b" (16 ), // 8
164
+ "b" (32 ), // 9
165
+ "b" (48 ), // 10
166
+ "b" (64 ), // 11
167
+ "b" (80 ), // 12
168
+ "b" (96 ), // 13
169
+ "b" (112 ) // 14
170
+ :
171
+ "cr0" ,
172
+ "vs32" ,"vs33" ,"vs34" ,"vs35" ,"vs36" ,"vs37" ,"vs38" ,"vs39" ,
173
+ "vs40" ,"vs41" ,"vs42" ,"vs43" ,"vs44" ,"vs45" ,"vs46" ,"vs47" ,
174
+ "vs48" ,"vs49" ,"vs50" ,"vs51"
175
+ );
176
+
177
+ return sum ;
178
+ }
0 commit comments