-#define ASSEMBLER
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/

+#define ASSEMBLER
 #include "common.h"
+
 #define N $r4
 #define XX $r5
 #define YY $r6

[...]
     bge $r0, N, .L999
     li.d TEMP, 1
     movgr2fr.d a1, $r0
-    ffint.d.l a1, a1
+    FFINT a1, a1
     movgr2fr.d a2, TEMP
-    ffint.d.l a2, a2
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    FFINT a2, a2
+    CMPEQ $fcc0, ALPHA, a1
     bcnez $fcc0, .L999
     slli.d TEMP, TEMP, BASE_SHIFT
     slli.d INCX, INCX, BASE_SHIFT
     slli.d INCY, INCY, BASE_SHIFT
-    movfr2gr.d t1, ALPHA
+    MTG t1, ALPHA
+#ifdef DOUBLE
     xvreplgr2vr.d VXA, t1
+#else
+    xvreplgr2vr.w VXA, t1
+#endif

     srai.d I, N, 3
     bne INCX, TEMP, .L20
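For orientation: this kernel computes the BLAS AXPY update y := alpha * x + y. The diff replaces the double-precision-only mnemonics (fld.d, fst.d, fmadd.d, ffint.d.l, fcmp.ceq.d, movfr2gr.d) with OpenBLAS's width-neutral macros (LD, ST, MADD, FFINT, CMPEQ, MTG) and adds #ifdef DOUBLE / #else paths, so the same source now builds both the double- and single-precision LASX kernels. The setup above checks N, returns early when alpha == 0, scales the increments to byte strides with BASE_SHIFT, and broadcasts alpha into the 256-bit register VXA (xvreplgr2vr.d for doubles, xvreplgr2vr.w for floats). A minimal scalar reference of the intended semantics, as a hedged sketch (the function name and argument list here are illustrative, not the kernel's actual entry point):

    #ifdef DOUBLE
    typedef double FLOAT;   /* assumption: mirrors the FLOAT type from common.h */
    #else
    typedef float FLOAT;
    #endif

    /* Illustrative scalar reference for what the vector kernel must compute. */
    static void axpy_ref(long n, FLOAT alpha, const FLOAT *x, long inc_x,
                         FLOAT *y, long inc_y)
    {
        if (n <= 0 || alpha == 0.0)                /* early exits to .L999 */
            return;
        for (long i = 0; i < n; i++)
            y[i * inc_y] += alpha * x[i * inc_x];  /* y := alpha * x + y */
    }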

 .L11:
     bge $r0, I, .L113
-    fcmp.ceq.d $fcc0, ALPHA, a2
+    CMPEQ $fcc0, ALPHA, a2
     bceqz $fcc0, .L112
     .align 3

 .L111:
+#ifdef DOUBLE
     xvld VX0, X, 0 * SIZE
     xvld VX2, Y, 0 * SIZE
     xvld VX1, X, 4 * SIZE
[...]
     addi.d I, I, -1
     xvst VX2, Y, 0 * SIZE
     xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d I, I, -1
+    xvfadd.s VX2, VX0, VX2
+    xvst VX2, Y, 0 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L111
     b .L113
     .align 3

 .L112:
+#ifdef DOUBLE
     xvld VX0, X, 0 * SIZE
     xvld VX2, Y, 0 * SIZE
     xvld VX1, X, 4 * SIZE
[...]
     addi.d I, I, -1
     xvst VX2, Y, 0 * SIZE
     xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d I, I, -1
+    xvfmadd.s VX2, VX0, VXA, VX2
+    xvst VX2, Y, 0 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L112
[...]
     .align 3

 .L114:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD $f12, X, 0 * SIZE
+    LD $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST $f14, Y, 0 * SIZE
     addi.d X, X, SIZE
     addi.d Y, Y, SIZE
     blt $r0, I, .L114
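The unit-stride path above processes eight elements per iteration (srai.d I, N, 3 computes N / 8). The DOUBLE branch loads x into VX0/VX1 and y into VX2/VX3, four doubles per 256-bit register, while the new #else branch covers all eight floats with one xvld each of x and y followed by xvfadd.s or xvfmadd.s and one xvst. .L111 is the alpha == 1 special case selected by CMPEQ $fcc0, ALPHA, a2 (a2 holds 1.0) and needs only a vector add; .L112 uses the fused multiply-add against the broadcast VXA; .L114 finishes the remaining N % 8 elements one at a time. A hedged C sketch of that structure (reusing the FLOAT typedef from the sketch above; the loop shapes are illustrative, not a transcription of the assembly):

    /* Structure of the unit-stride path: blocked main loop plus scalar tail. */
    static void axpy_unit_stride(long n, FLOAT alpha, const FLOAT *x, FLOAT *y)
    {
        long i = 0;
        if (alpha == 1.0) {                        /* .L111: vector add only */
            for (; i + 8 <= n; i += 8)
                for (int k = 0; k < 8; k++) y[i + k] += x[i + k];
        } else {                                   /* .L112: vector fmadd */
            for (; i + 8 <= n; i += 8)
                for (int k = 0; k < 8; k++) y[i + k] += alpha * x[i + k];
        }
        for (; i < n; i++)                         /* .L114: scalar tail */
            y[i] += alpha * x[i];
    }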
[...]
     .align 3

 .L121:
+#ifdef DOUBLE
     xvld VX0, X, 0 * SIZE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
[...]
     xvstelm.d VX3, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX3, YY, 0, 3
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     addi.d X, X, 8 * SIZE
     blt $r0, I, .L121
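LASX has no strided vector load or store, so the non-unit-increment paths assemble a vector element by element: eight ld.w loads feed xvinsgr2vr.w, which inserts a general-purpose register value into a chosen vector lane, one xvfmadd does the arithmetic, and xvstelm.w writes single lanes back to memory while YY advances by INCY between stores (the mostly elided DOUBLE path does the same with ld.d and the .d forms). A hedged sketch of that gather / compute / scatter pattern for one eight-element block of the strided-y case (illustrative C, not the kernel's register allocation):

    /* One block of .L121: contiguous x, strided y. */
    static void axpy_block_strided_y(FLOAT alpha, const FLOAT *x,
                                     FLOAT *y, long inc_y)
    {
        FLOAT vy[8];
        for (int k = 0; k < 8; k++)          /* gather: ld.w + xvinsgr2vr.w */
            vy[k] = y[k * inc_y];
        for (int k = 0; k < 8; k++)          /* xvfmadd with broadcast alpha */
            vy[k] += alpha * x[k];
        for (int k = 0; k < 8; k++)          /* scatter: xvstelm.w, step by INCY */
            y[k * inc_y] = vy[k];
    }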
[...]
     .align 3

 .L123:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD $f12, X, 0 * SIZE
+    LD $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST $f14, Y, 0 * SIZE
     addi.d X, X, SIZE
     add.d Y, Y, INCY
     blt $r0, I, .L123
[...]
     .align 3

 .L211:
+#ifdef DOUBLE
     xvld VX2, Y, 0 * SIZE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
[...]
     addi.d I, I, -1
     xvst VX3, Y, 4 * SIZE
     addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
     blt $r0, I, .L211
     .align 3

[...]
     .align 3

 .L213:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD $f12, X, 0 * SIZE
+    LD $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST $f14, Y, 0 * SIZE
     add.d X, X, INCX
     addi.d Y, Y, SIZE
     blt $r0, I, .L213
[...]
     .align 3

 .L222:
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
[...]
     xvstelm.d VX3, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     blt $r0, I, .L222
     .align 3
[...]
     .align 3

 .L224:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD $f12, X, 0 * SIZE
+    LD $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST $f14, Y, 0 * SIZE
     add.d X, X, INCX
     add.d Y, Y, INCY
     blt $r0, I, .L224
-    b .L999
     .align 3

 .L999: