/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER

#include "common.h"

/*
 * Register aliases for the kernel in this file (LoongArch64, LSX).
 *
 * On LoongArch the scalar FP register $fN is the low 64 bits of the vector
 * register $vrN.  The kernel relies on this: a1..a4 ($f3..$f6) are lane 0 of
 * U3..U6 ($vr3..$vr6), and $f12 is lane 0 of U1 ($vr12).
 */

/* ---- Arguments ---------------------------------------------------------- */
#define M       $r4     /* row count; bounds the inner (row) loops           */
#define N       $r5     /* column count; bounds the outer loop               */
#define A       $r7     /* matrix base address                               */
#define LDA     $r8     /* column stride of A; scaled to bytes on entry      */
#define X       $r9     /* x vector base address                             */
#define INCX    $r10    /* x stride; scaled to bytes on entry                */
#define Y       $r11    /* y vector base address                             */
#define INCY    $r6     /* y stride; loaded from the stack, scaled to bytes  */
#define BUFFER  $r16    /* loaded from the stack; not otherwise referenced   */
#define ALPHA   $f0     /* scalar multiplier                                 */

/* ---- Integer working registers ------------------------------------------ */
#define YORIG   $r18    /* not referenced by the kernel                      */
#define T0      $r19    /* scratch: block count / tail-bit test              */
#define T1      $r20    /* scratch: byte offset of y element 0 of a group    */
#define XX      $r12    /* working copy of X                                 */
#define YY      $r13    /* working copy of Y                                 */
#define I       $r14    /* 4-row block counter                               */
#define J       $r15    /* column counter                                    */
#define AO1     $r23    /* address of the current column of A                */
#define AO2     $r24    /* not referenced by the kernel                      */
#define IX      $r25    /* byte offset into x                                */
#define IY      $r26    /* byte offset into y                                */
#define II      $r27    /* byte offset into the current column of A          */
#define T2      $r28    /* byte offset of y element 1 of a group             */
#define T3      $r29    /* byte offset of y element 2 of a group             */
#define T4      $r30    /* byte offset of y element 3 of a group             */

/* ---- LSX vectors -------------------------------------------------------- */
#define U0      $vr11   /* x[j] in both lanes                                */
#define U1      $vr12   /* alpha * x[j] in both lanes (lane 0 == $f12)       */
#define U2      $vr2    /* two elements of the current column of A           */
#define U3      $vr3    /* y pair accumulator (lane 0 == a1)                 */
#define U4      $vr4    /* lane 0 == a2                                      */
#define U5      $vr5    /* y pair accumulator (lane 0 == a3)                 */
#define U6      $vr6    /* lane 0 == a4                                      */
#define U7      $vr7    /* two more elements of the current column of A      */
#define U8      $vr8    /* not referenced by the kernel                      */
#define U9      $vr9    /* not referenced by the kernel                      */
#define VALPHA  $vr10   /* alpha in both lanes                               */

/* ---- Scalar FP views ---------------------------------------------------- */
#define a1      $f3     /* lane 0 of U3                                      */
#define a2      $f4     /* lane 0 of U4                                      */
#define a3      $f5     /* lane 0 of U5                                      */
#define a4      $f6     /* lane 0 of U6                                      */
#define a5      $f7     /* a5..a8 are not referenced by the kernel           */
#define a6      $f8
#define a7      $f9
#define a8      $f10


/*
 * Double-precision matrix-vector update using LSX (presumably the dgemv
 * "no transpose" kernel -- the file name is not visible here):
 *
 *     for j = 0 .. N-1:
 *         temp = alpha * x[j * incx]
 *         for i = 0 .. M-1:
 *             y[i * incy] += temp * A[i + j * lda]
 *
 * Inputs
 *   registers : M, N, ALPHA, A, LDA, X, INCX, Y
 *   stack     : INCY at 0($sp) on entry.  8($sp) holds BUFFER, which this
 *               kernel does not use and therefore does not load.
 *
 * LDA, INCX and INCY are element counts on entry and are converted to byte
 * strides with BASE_SHIFT.  The vector code uses .d (64-bit) lanes throughout,
 * so this assumes a DOUBLE build where ST/LD move 8 bytes -- TODO confirm
 * against common.h.
 *
 * Stack frame (80 bytes):
 *    0.. 23 : $r23, $r24, $r25
 *   24.. 31 : unused
 *   32.. 71 : $r26 .. $r30
 *   72.. 79 : alpha (spilled so it can be broadcast with vldrepl.d)
 *
 * The scalar registers a1..a4 are lane 0 of U3..U6, and $f12 is lane 0 of U1;
 * the y gather/scatter below depends on that aliasing.
 */
    PROLOGUE

    LDARG       INCY, $sp, 0            // stack argument, read before $sp moves

    addi.d      $sp, $sp, -80

    SDARG       $r23, $sp, 0
    SDARG       $r24, $sp, 8
    SDARG       $r25, $sp, 16
    SDARG       $r26, $sp, 32
    SDARG       $r27, $sp, 40
    SDARG       $r28, $sp, 48
    SDARG       $r29, $sp, 56
    SDARG       $r30, $sp, 64
    ST          ALPHA, $sp, 72

    vldrepl.d   VALPHA, $sp, 72         // VALPHA = {alpha, alpha}

    /* element strides -> byte strides */
    slli.d      LDA,  LDA,  BASE_SHIFT
    slli.d      INCX, INCX, BASE_SHIFT
    slli.d      INCY, INCY, BASE_SHIFT

    /* nothing to do for an empty matrix; from here on M > 0 and N > 0 */
    bge         $r0, M, .L999
    bge         $r0, N, .L999

    move        J,   $r0
    move        IX,  $r0

    move        AO1, A                  // AO1 = address of column 0
    move        XX,  X
    move        YY,  Y

/* ---- outer loop: one column of A per iteration ------------------------- */
.L01:
    /*
     * Broadcast x[j].  vldrepl.d reads exactly 8 bytes; a 16-byte vldx here
     * would read past the end of x on the last column.
     */
    add.d       T0, XX, IX              // T0 = &x[j]
    vldrepl.d   U0, T0, 0               // U0 = {x[j], x[j]}

    vfmul.d     U1, VALPHA, U0          // U1 = temp = alpha * x[j], both lanes

    move        IY, $r0
    move        II, $r0
    move        I,  $r0

    srai.d      T0, M, 2                // T0 = M / 4 (number of 4-row blocks)
    beq         I, T0, .L03

/* ---- rows in blocks of 4 ------------------------------------------------ */
.L02:
    vldx        U2, AO1, II             // A[i+0 .. i+1]
    addi.d      II, II, 16
    vldx        U7, AO1, II             // A[i+2 .. i+3]

    /* byte offsets of the four (possibly strided) y elements */
    move        T1, IY
    add.d       T2, T1, INCY
    add.d       T3, T2, INCY
    add.d       T4, T3, INCY

    fldx.d      a1, YY, T1
    fldx.d      a2, YY, T2
    fldx.d      a3, YY, T3
    fldx.d      a4, YY, T4

    vextrins.d  U3, U4, 0x10            // U3 = {a1, a2}
    vextrins.d  U5, U6, 0x10            // U5 = {a3, a4}

    vfmadd.d    U3, U1, U2, U3          // y[0..1] += temp * A[i+0 .. i+1]
    vfmadd.d    U5, U1, U7, U5          // y[2..3] += temp * A[i+2 .. i+3]

    vextrins.d  U4, U3, 0x01            // a2 = upper lane of U3
    vextrins.d  U6, U5, 0x01            // a4 = upper lane of U5

    fstx.d      a1, YY, T1
    fstx.d      a2, YY, T2
    fstx.d      a3, YY, T3
    fstx.d      a4, YY, T4

    add.d       IY, T4, INCY
    addi.d      II, II, 16
    addi.d      I,  I, 1
    blt         I, T0, .L02

/* ---- tail: two rows when bit 1 of M is set ------------------------------ */
.L03:
    andi        T0, M, 2
    beq         $r0, T0, .L04

    andi        T1, M, 3                // M % 4 (M > 0 here)
    sub.d       II, M, T1               // first row not covered by .L02
    slli.d      II, II, BASE_SHIFT

    move        T1, IY
    add.d       T2, T1, INCY

    vldx        U2, AO1, II

    fldx.d      a1, YY, T1
    fldx.d      a2, YY, T2

    vextrins.d  U3, U4, 0x10            // U3 = {a1, a2}

    vfmadd.d    U3, U1, U2, U3

    vextrins.d  U4, U3, 0x01            // a2 = upper lane of U3

    fstx.d      a1, YY, T1
    fstx.d      a2, YY, T2

    add.d       IY, T2, INCY

/* ---- tail: last row when M is odd --------------------------------------- */
.L04:
    andi        T0, M, 1
    beq         $r0, T0, .L05

    addi.d      II, M, -1               // row M-1
    slli.d      II, II, BASE_SHIFT

    fldx.d      a1, AO1, II
    fldx.d      a3, YY, IY

    fmadd.d     a3, $f12, a1, a3        // $f12 is lane 0 of U1 (temp)

    fstx.d      a3, YY, IY

    add.d       IY, IY, INCY

/* ---- advance to the next column ----------------------------------------- */
.L05:
    add.d       AO1, AO1, LDA
    add.d       IX,  IX,  INCX

    addi.d      J, J, 1
    blt         J, N, .L01

.L999:
    LDARG       $r23, $sp, 0
    LDARG       $r24, $sp, 8
    LDARG       $r25, $sp, 16
    LDARG       $r26, $sp, 32
    LDARG       $r27, $sp, 40
    LDARG       $r28, $sp, 48
    LDARG       $r29, $sp, 56
    LDARG       $r30, $sp, 64
    LD          ALPHA, $sp, 72
    addi.d      $sp, $sp, 80
    jirl        $r0, $r1, 0x0

    EPILOGUE
0 commit comments