@@ -55,7 +55,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
55
55
"srlg %%r0,%1,5 \n\t"
56
56
"xgr %%r1,%%r1 \n\t"
57
57
"0: \n\t"
58
- "pfd 1, 1024(%2) \n\t"
58
+ "pfd 1, 1024(%%r1,%2) \n\t"
59
59
60
60
"vlef %%v16,0(%%r1,%2),0 \n\t"
61
61
"vlef %%v17,4(%%r1,%2),0 \n\t"
@@ -93,100 +93,88 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
93
93
"vlef %%v22,120(%%r1,%2),3 \n\t"
94
94
"vlef %%v23,124(%%r1,%2),3 \n\t"
95
95
96
- "vflpsb %%v16, %%v16 \n\t"
97
- "vflpsb %%v17, %%v17 \n\t"
98
- "vflpsb %%v18, %%v18 \n\t"
99
- "vflpsb %%v19, %%v19 \n\t"
100
- "vflpsb %%v20, %%v20 \n\t"
101
- "vflpsb %%v21, %%v21 \n\t"
102
- "vflpsb %%v22, %%v22 \n\t"
103
- "vflpsb %%v23, %%v23 \n\t"
104
- "vfasb %%v16,%%v16,%%v17 \n\t"
105
- "vfasb %%v17,%%v18,%%v19 \n\t"
106
- "vfasb %%v18,%%v20,%%v21 \n\t"
107
- "vfasb %%v19,%%v22,%%v23 \n\t"
108
-
109
- "vfchsb %%v24,%%v16,%%v17 \n\t"
110
- "vfchsb %%v25,%%v18,%%v19 \n\t"
111
- "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
112
- "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
113
-
114
- "vfchsb %%v26,%%v24,%%v25 \n\t"
115
- "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
116
-
117
- "vfchsb %%v27,%%v26,%%v0 \n\t"
118
- "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
119
-
120
- "vlef %%v16,128(%%r1,%2),0 \n\t"
121
- "vlef %%v17,132(%%r1,%2),0 \n\t"
122
- "vlef %%v16,136(%%r1,%2),1 \n\t"
123
- "vlef %%v17,140(%%r1,%2),1 \n\t"
124
- "vlef %%v16,144(%%r1,%2),2 \n\t"
125
- "vlef %%v17,148(%%r1,%2),2 \n\t"
126
- "vlef %%v16,152(%%r1,%2),3 \n\t"
127
- "vlef %%v17,156(%%r1,%2),3 \n\t"
128
-
129
- "vlef %%v18,160(%%r1,%2),0 \n\t"
130
- "vlef %%v19,164(%%r1,%2),0 \n\t"
131
- "vlef %%v18,168(%%r1,%2),1 \n\t"
132
- "vlef %%v19,172(%%r1,%2),1 \n\t"
133
- "vlef %%v18,176(%%r1,%2),2 \n\t"
134
- "vlef %%v19,180(%%r1,%2),2 \n\t"
135
- "vlef %%v18,184(%%r1,%2),3 \n\t"
136
- "vlef %%v19,188(%%r1,%2),3 \n\t"
137
-
138
- "vlef %%v20,192(%%r1,%2),0 \n\t"
139
- "vlef %%v21,196(%%r1,%2),0 \n\t"
140
- "vlef %%v20,200(%%r1,%2),1 \n\t"
141
- "vlef %%v21,204(%%r1,%2),1 \n\t"
142
- "vlef %%v20,208(%%r1,%2),2 \n\t"
143
- "vlef %%v21,212(%%r1,%2),2 \n\t"
144
- "vlef %%v20,216(%%r1,%2),3 \n\t"
145
- "vlef %%v21,220(%%r1,%2),3 \n\t"
146
-
147
- "vlef %%v22,224(%%r1,%2),0 \n\t"
148
- "vlef %%v23,228(%%r1,%2),0 \n\t"
149
- "vlef %%v22,232(%%r1,%2),1 \n\t"
150
- "vlef %%v23,236(%%r1,%2),1 \n\t"
151
- "vlef %%v22,240(%%r1,%2),2 \n\t"
152
- "vlef %%v23,244(%%r1,%2),2 \n\t"
153
- "vlef %%v22,248(%%r1,%2),3 \n\t"
154
- "vlef %%v23,252(%%r1,%2),3 \n\t"
155
-
156
- "vflpsb %%v16, %%v16 \n\t"
157
- "vflpsb %%v17, %%v17 \n\t"
158
- "vflpsb %%v18, %%v18 \n\t"
159
- "vflpsb %%v19, %%v19 \n\t"
160
- "vflpsb %%v20, %%v20 \n\t"
161
- "vflpsb %%v21, %%v21 \n\t"
162
- "vflpsb %%v22, %%v22 \n\t"
163
- "vflpsb %%v23, %%v23 \n\t"
96
+ "vlef %%v24,128(%%r1,%2),0 \n\t"
97
+ "vlef %%v25,132(%%r1,%2),0 \n\t"
98
+ "vlef %%v24,136(%%r1,%2),1 \n\t"
99
+ "vlef %%v25,140(%%r1,%2),1 \n\t"
100
+ "vlef %%v24,144(%%r1,%2),2 \n\t"
101
+ "vlef %%v25,148(%%r1,%2),2 \n\t"
102
+ "vlef %%v24,152(%%r1,%2),3 \n\t"
103
+ "vlef %%v25,156(%%r1,%2),3 \n\t"
104
+
105
+ "vlef %%v26,160(%%r1,%2),0 \n\t"
106
+ "vlef %%v27,164(%%r1,%2),0 \n\t"
107
+ "vlef %%v26,168(%%r1,%2),1 \n\t"
108
+ "vlef %%v27,172(%%r1,%2),1 \n\t"
109
+ "vlef %%v26,176(%%r1,%2),2 \n\t"
110
+ "vlef %%v27,180(%%r1,%2),2 \n\t"
111
+ "vlef %%v26,184(%%r1,%2),3 \n\t"
112
+ "vlef %%v27,188(%%r1,%2),3 \n\t"
113
+
114
+ "vlef %%v28,192(%%r1,%2),0 \n\t"
115
+ "vlef %%v29,196(%%r1,%2),0 \n\t"
116
+ "vlef %%v28,200(%%r1,%2),1 \n\t"
117
+ "vlef %%v29,204(%%r1,%2),1 \n\t"
118
+ "vlef %%v28,208(%%r1,%2),2 \n\t"
119
+ "vlef %%v29,212(%%r1,%2),2 \n\t"
120
+ "vlef %%v28,216(%%r1,%2),3 \n\t"
121
+ "vlef %%v29,220(%%r1,%2),3 \n\t"
122
+
123
+ "vlef %%v30,224(%%r1,%2),0 \n\t"
124
+ "vlef %%v31,228(%%r1,%2),0 \n\t"
125
+ "vlef %%v30,232(%%r1,%2),1 \n\t"
126
+ "vlef %%v31,236(%%r1,%2),1 \n\t"
127
+ "vlef %%v30,240(%%r1,%2),2 \n\t"
128
+ "vlef %%v31,244(%%r1,%2),2 \n\t"
129
+ "vlef %%v30,248(%%r1,%2),3 \n\t"
130
+ "vlef %%v31,252(%%r1,%2),3 \n\t"
131
+
132
+ "vflpsb %%v16,%%v16 \n\t"
133
+ "vflpsb %%v17,%%v17 \n\t"
134
+ "vflpsb %%v18,%%v18 \n\t"
135
+ "vflpsb %%v19,%%v19 \n\t"
136
+ "vflpsb %%v20,%%v20 \n\t"
137
+ "vflpsb %%v21,%%v21 \n\t"
138
+ "vflpsb %%v22,%%v22 \n\t"
139
+ "vflpsb %%v23,%%v23 \n\t"
140
+ "vflpsb %%v24,%%v24 \n\t"
141
+ "vflpsb %%v25,%%v25 \n\t"
142
+ "vflpsb %%v26,%%v26 \n\t"
143
+ "vflpsb %%v27,%%v27 \n\t"
144
+ "vflpsb %%v28,%%v28 \n\t"
145
+ "vflpsb %%v29,%%v29 \n\t"
146
+ "vflpsb %%v30,%%v30 \n\t"
147
+ "vflpsb %%v31,%%v31 \n\t"
148
+
164
149
"vfasb %%v16,%%v16,%%v17 \n\t"
165
- "vfasb %%v17,%%v18,%%v19 \n\t"
166
- "vfasb %%v18,%%v20,%%v21 \n\t"
167
- "vfasb %%v19,%%v22,%%v23 \n\t"
150
+ "vfasb %%v18,%%v18,%%v19 \n\t"
151
+ "vfasb %%v20,%%v20,%%v21 \n\t"
152
+ "vfasb %%v22,%%v22,%%v23 \n\t"
153
+ "vfasb %%v24,%%v24,%%v25 \n\t"
154
+ "vfasb %%v26,%%v26,%%v27 \n\t"
155
+ "vfasb %%v28,%%v28,%%v29 \n\t"
156
+ "vfasb %%v30,%%v30,%%v31 \n\t"
168
157
169
- "vfchsb %%v24,%%v16,%%v17 \n\t"
170
- "vfchsb %%v25,%%v18,%%v19 \n\t"
171
- "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
172
- "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
158
+ "vfmaxsb %%v16,%%v16,%%v24,0 \n\t"
159
+ "vfmaxsb %%v18,%%v18,%%v26,0 \n\t"
160
+ "vfmaxsb %%v20,%%v20,%%v28,0 \n\t"
161
+ "vfmaxsb %%v22,%%v22,%%v30,0 \n\t"
162
+
163
+ "vfmaxsb %%v16,%%v16,%%v20,0 \n\t"
164
+ "vfmaxsb %%v18,%%v18,%%v22,0 \n\t"
173
165
174
- "vfchsb %%v26,%%v24,%%v25 \n\t"
175
- "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
166
+ "vfmaxsb %%v16,%%v16,%%v18,0 \n\t"
176
167
177
- "vfchsb %%v27,%%v26,%%v0 \n\t"
178
- "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
168
+ "vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
179
169
180
170
"agfi %%r1, 256 \n\t"
181
171
"brctg %%r0, 0b \n\t"
182
172
183
173
"veslg %%v16,%%v0,32 \n\t"
184
- "vfchsb %%v17,%%v16,%%v0 \n\t"
185
- "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
174
+ "vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
186
175
187
176
"vrepf %%v16,%%v0,2 \n\t"
188
- "wfchsb %%v17,%%v16,%%v0 \n\t"
189
- "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
177
+ "wfmaxsb %%v0,%%v0,%%v16,0 \n\t"
190
178
"ler %0,%%f0 "
191
179
:"=f" (amax )
192
180
:"r" (n ),"ZR" ((const FLOAT (* )[n ])x )
@@ -233,11 +221,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
233
221
234
222
maxf = CABS1 (x ,0 );
235
223
inc_x2 = 2 * inc_x ;
236
- ix += inc_x2 ;
237
- i ++ ;
238
224
239
- BLASLONG n1 = ( n - 1 ) & -4 ;
240
- while (( i - 1 ) < n1 ) {
225
+ BLASLONG n1 = n & -4 ;
226
+ while (i < n1 ) {
241
227
242
228
if (CABS1 (x ,ix ) > maxf ) {
243
229
maxf = CABS1 (x ,ix );
0 commit comments