|
| 1 | +/*********************************************************************/ |
| 2 | +/* Copyright 2009, 2010 The University of Texas at Austin. */ |
| 3 | +/* All rights reserved. */ |
| 4 | +/* */ |
| 5 | +/* Redistribution and use in source and binary forms, with or */ |
| 6 | +/* without modification, are permitted provided that the following */ |
| 7 | +/* conditions are met: */ |
| 8 | +/* */ |
| 9 | +/* 1. Redistributions of source code must retain the above */ |
| 10 | +/* copyright notice, this list of conditions and the following */ |
| 11 | +/* disclaimer. */ |
| 12 | +/* */ |
| 13 | +/* 2. Redistributions in binary form must reproduce the above */ |
| 14 | +/* copyright notice, this list of conditions and the following */ |
| 15 | +/* disclaimer in the documentation and/or other materials */ |
| 16 | +/* provided with the distribution. */ |
| 17 | +/* */ |
| 18 | +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ |
| 19 | +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ |
| 20 | +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ |
| 21 | +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ |
| 22 | +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ |
| 23 | +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ |
| 24 | +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ |
| 25 | +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ |
| 26 | +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ |
| 27 | +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ |
| 28 | +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ |
| 29 | +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ |
| 30 | +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ |
| 31 | +/* POSSIBILITY OF SUCH DAMAGE. */ |
| 32 | +/* */ |
| 33 | +/* The views and conclusions contained in the software and */ |
| 34 | +/* documentation are those of the authors and should not be */ |
| 35 | +/* interpreted as representing official policies, either expressed */ |
| 36 | +/* or implied, of The University of Texas at Austin. */ |
| 37 | +/*********************************************************************/ |
| 38 | + |
| 39 | +#define ASSEMBLER /* defined before common.h is included; presumably selects its assembler-side definitions - TODO confirm */ |
| 40 | +#include "common.h" |
| 41 | +#include "def_vsx.h" |
| 42 | + |
| 43 | +#ifndef __64BIT__ /* LOAD expands to lwz without __64BIT__ and to ld with it */ |
| 44 | +#define LOAD lwz |
| 45 | +#else |
| 46 | +#define LOAD ld |
| 47 | +#endif |
| 48 | + |
| 49 | +#ifdef __64BIT__ /* frame size and SP-relative ALPHA/FZERO slots, chosen per word size */ |
| 50 | +#define STACKSIZE 320 |
| 51 | +#define ALPHA 296(SP) |
| 52 | +#define FZERO 304(SP) |
| 53 | +#else |
| 54 | +#define STACKSIZE 240 |
| 55 | +#define ALPHA 224(SP) |
| 56 | +#define FZERO 232(SP) |
| 57 | +#endif |
| 58 | + |
| 59 | +#define M r3 /* M, N and K are r3, r4 and r5 in every configuration */ |
| 60 | +#define N r4 |
| 61 | +#define K r5 |
| 62 | + |
| 63 | +#ifdef linux /* register aliases for A, B, C, LDC and OFFSET on linux */ |
| 64 | +#ifndef __64BIT__ |
| 65 | +#define A r6 |
| 66 | +#define B r7 |
| 67 | +#define C r8 |
| 68 | +#define LDC r9 |
| 69 | +#define OFFSET r10 |
| 70 | +#else |
| 71 | +#define A r7 |
| 72 | +#define B r8 |
| 73 | +#define C r9 |
| 74 | +#define LDC r10 |
| 75 | +#define OFFSET r6 |
| 76 | +#endif |
| 77 | +#endif |
| 78 | + |
| 79 | +#if defined(_AIX) || defined(__APPLE__) /* the same five aliases for AIX and Apple builds */ |
| 80 | +#if !defined(__64BIT__) && defined(DOUBLE) /* 32-bit DOUBLE uses its own mapping */ |
| 81 | +#define A r8 |
| 82 | +#define B r9 |
| 83 | +#define C r10 |
| 84 | +#define LDC r7 |
| 85 | +#define OFFSET r6 |
| 86 | +#else |
| 87 | +#define A r7 |
| 88 | +#define B r8 |
| 89 | +#define C r9 |
| 90 | +#define LDC r10 |
| 91 | +#define OFFSET r6 |
| 92 | +#endif |
| 93 | +#endif |
| 94 | + |
| 95 | +#define o0 0 /* a literal zero, unlike o8..o48 below, which name registers */ |
| 96 | + |
| 97 | +#define PRE r15 /* aliases for r15-r31 follow; the entry code in this file loads 384 into PRE */ |
| 98 | +#define T4 r16 |
| 99 | +#define L r17 |
| 100 | +#define T3 r18 |
| 101 | +#define T2 r19 |
| 102 | +#define KK r20 /* the entry code in this file initialises KK from OFFSET */ |
| 103 | +#define I r21 |
| 104 | +#define J r22 |
| 105 | +#define AO r23 |
| 106 | +#define BO r24 |
| 107 | +#define CO r25 |
| 108 | +#define o8 r26 /* o8..o48 name registers that the entry code loads with 8, 16, 24, 32 and 48 */ |
| 109 | +#define o16 r27 |
| 110 | +#define o24 r28 |
| 111 | +#define o32 r29 |
| 112 | +#define o48 r30 |
| 113 | +#define T1 r31 |
| 114 | + |
| 115 | +#include "dtrsm_macros_LT_16x4_power8.S" /* pulled in after the aliases above are defined */ |
| 116 | + |
| 117 | +#ifndef NEEDPARAM |
| 118 | + |
| 119 | +/* |
| 120 | + * Kernel entry point (opened by PROLOGUE, closed by EPILOGUE). |
| 121 | + * |
| 122 | + * Register arguments: M (r3), N (r4), K (r5), plus A, B, C, LDC and |
| 123 | + * OFFSET as mapped by the #defines earlier in this file. On some ABIs |
| 124 | + * LDC and/or OFFSET are instead loaded from the stack below. |
| 125 | + * |
| 126 | + * Flow: allocate a STACKSIZE-byte frame, save f14-f31 and r15-r31, |
| 127 | + * branch to L999 when M, N or K is <= 0, otherwise scale LDC, load the |
| 128 | + * constant offset registers and run dtrsm_logic_LT_16x4_power8.S. |
| 129 | + * L999 sets r3 to 0, restores the saved registers and pops the frame. |
| 130 | + */ |
| 131 | + PROLOGUE |
| 132 | + PROFCODE |
| 133 | + |
| 134 | + addi SP, SP, -STACKSIZE /* allocate the local frame */ |
| 135 | + li r0, 0 |
| 136 | + |
| 137 | + /* Save f14-f31 in the first 144 bytes of the frame. */ |
| 138 | + stfd f14, 0(SP) |
| 139 | + stfd f15, 8(SP) |
| 140 | + stfd f16, 16(SP) |
| 141 | + stfd f17, 24(SP) |
| 142 | + |
| 143 | + stfd f18, 32(SP) |
| 144 | + stfd f19, 40(SP) |
| 145 | + stfd f20, 48(SP) |
| 146 | + stfd f21, 56(SP) |
| 147 | + |
| 148 | + stfd f22, 64(SP) |
| 149 | + stfd f23, 72(SP) |
| 150 | + stfd f24, 80(SP) |
| 151 | + stfd f25, 88(SP) |
| 152 | + |
| 153 | + stfd f26, 96(SP) |
| 154 | + stfd f27, 104(SP) |
| 155 | + stfd f28, 112(SP) |
| 156 | + stfd f29, 120(SP) |
| 157 | + |
| 158 | + stfd f30, 128(SP) |
| 159 | + stfd f31, 136(SP) |
| 160 | + |
| 161 | + /* Save every general-purpose register aliased in this file (r15-r31). */ |
| 162 | +#ifdef __64BIT__ |
| 163 | + std r31, 144(SP) |
| 164 | + std r30, 152(SP) |
| 165 | + std r29, 160(SP) |
| 166 | + std r28, 168(SP) |
| 167 | + std r27, 176(SP) |
| 168 | + std r26, 184(SP) |
| 169 | + std r25, 192(SP) |
| 170 | + std r24, 200(SP) |
| 171 | + std r23, 208(SP) |
| 172 | + std r22, 216(SP) |
| 173 | + std r21, 224(SP) |
| 174 | + std r20, 232(SP) |
| 175 | + std r19, 240(SP) |
| 176 | + std r18, 248(SP) |
| 177 | + std r17, 256(SP) |
| 178 | + std r16, 264(SP) |
| 179 | + std r15, 272(SP) |
| 180 | +#else |
| 181 | + stw r31, 144(SP) |
| 182 | + stw r30, 148(SP) |
| 183 | + stw r29, 152(SP) |
| 184 | + stw r28, 156(SP) |
| 185 | + stw r27, 160(SP) |
| 186 | + stw r26, 164(SP) |
| 187 | + stw r25, 168(SP) |
| 188 | + stw r24, 172(SP) |
| 189 | + stw r23, 176(SP) |
| 190 | + stw r22, 180(SP) |
| 191 | + stw r21, 184(SP) |
| 192 | + stw r20, 188(SP) |
| 193 | + stw r19, 192(SP) |
| 194 | + stw r18, 196(SP) |
| 195 | + /* |
| 196 | +  * r15-r17 back PRE, T4 and L, so they must be preserved here just as |
| 197 | +  * in the 64-bit path. Offsets 200-208 sit below ALPHA (224). |
| 198 | +  * NOTE(review): confirm the included macro/logic files do not use |
| 199 | +  * these frame slots. |
| 200 | +  */ |
| 201 | + stw r17, 200(SP) |
| 202 | + stw r16, 204(SP) |
| 203 | + stw r15, 208(SP) |
| 204 | +#endif |
| 205 | + |
| 206 | + |
| 207 | +/* AIX/Apple, 32-bit DOUBLE: LDC is loaded from the stack, past this frame. */ |
| 208 | +#if defined(_AIX) || defined(__APPLE__) |
| 209 | +#if !defined(__64BIT__) && defined(DOUBLE) |
| 210 | + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) |
| 211 | +#endif |
| 212 | +#endif |
| 213 | + |
| 214 | + |
| 215 | +/* OFFSET is likewise loaded from the stack on the configurations below. */ |
| 216 | +#if defined(linux) && defined(__64BIT__) |
| 217 | + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) |
| 218 | +#endif |
| 219 | + |
| 220 | +#if defined(_AIX) || defined(__APPLE__) |
| 221 | +#ifdef __64BIT__ |
| 222 | + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) |
| 223 | +#else |
| 224 | +#ifdef DOUBLE |
| 225 | + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) |
| 226 | +#else |
| 227 | + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) |
| 228 | +#endif |
| 229 | +#endif |
| 230 | +#endif |
| 231 | + |
| 232 | + |
| 233 | + /* Nothing to do unless M, N and K are all positive. */ |
| 234 | + cmpwi cr0, M, 0 |
| 235 | + ble L999 |
| 236 | + cmpwi cr0, N, 0 |
| 237 | + ble L999 |
| 238 | + cmpwi cr0, K, 0 |
| 239 | + ble L999 |
| 240 | + |
| 241 | + slwi LDC, LDC, BASE_SHIFT /* LDC <<= BASE_SHIFT */ |
| 242 | + |
| 243 | + /* Load fixed constants into the o8..o48 and PRE registers. */ |
| 244 | + li o8, 8 |
| 245 | + li o16, 16 |
| 246 | + li o24, 24 |
| 247 | + li o32, 32 |
| 248 | + li o48, 48 |
| 249 | + li PRE, 384 |
| 250 | + |
| 251 | + mr KK, OFFSET /* KK starts as a copy of OFFSET */ |
| 252 | + |
| 253 | +#include "dtrsm_logic_LT_16x4_power8.S" |
| 254 | + |
| 255 | +L999: |
| 256 | + addi r3, 0, 0 /* r3 = 0 */ |
| 257 | + |
| 258 | + /* Restore f14-f31. */ |
| 259 | + lfd f14, 0(SP) |
| 260 | + lfd f15, 8(SP) |
| 261 | + lfd f16, 16(SP) |
| 262 | + lfd f17, 24(SP) |
| 263 | + |
| 264 | + lfd f18, 32(SP) |
| 265 | + lfd f19, 40(SP) |
| 266 | + lfd f20, 48(SP) |
| 267 | + lfd f21, 56(SP) |
| 268 | + |
| 269 | + lfd f22, 64(SP) |
| 270 | + lfd f23, 72(SP) |
| 271 | + lfd f24, 80(SP) |
| 272 | + lfd f25, 88(SP) |
| 273 | + |
| 274 | + lfd f26, 96(SP) |
| 275 | + lfd f27, 104(SP) |
| 276 | + lfd f28, 112(SP) |
| 277 | + lfd f29, 120(SP) |
| 278 | + |
| 279 | + lfd f30, 128(SP) |
| 280 | + lfd f31, 136(SP) |
| 281 | + |
| 282 | + /* Restore the general-purpose registers saved on entry. */ |
| 283 | +#ifdef __64BIT__ |
| 284 | + ld r31, 144(SP) |
| 285 | + ld r30, 152(SP) |
| 286 | + ld r29, 160(SP) |
| 287 | + ld r28, 168(SP) |
| 288 | + ld r27, 176(SP) |
| 289 | + ld r26, 184(SP) |
| 290 | + ld r25, 192(SP) |
| 291 | + ld r24, 200(SP) |
| 292 | + ld r23, 208(SP) |
| 293 | + ld r22, 216(SP) |
| 294 | + ld r21, 224(SP) |
| 295 | + ld r20, 232(SP) |
| 296 | + ld r19, 240(SP) |
| 297 | + ld r18, 248(SP) |
| 298 | + ld r17, 256(SP) |
| 299 | + ld r16, 264(SP) |
| 300 | + ld r15, 272(SP) |
| 301 | +#else |
| 302 | + lwz r31, 144(SP) |
| 303 | + lwz r30, 148(SP) |
| 304 | + lwz r29, 152(SP) |
| 305 | + lwz r28, 156(SP) |
| 306 | + lwz r27, 160(SP) |
| 307 | + lwz r26, 164(SP) |
| 308 | + lwz r25, 168(SP) |
| 309 | + lwz r24, 172(SP) |
| 310 | + lwz r23, 176(SP) |
| 311 | + lwz r22, 180(SP) |
| 312 | + lwz r21, 184(SP) |
| 313 | + lwz r20, 188(SP) |
| 314 | + lwz r19, 192(SP) |
| 315 | + lwz r18, 196(SP) |
| 316 | + lwz r17, 200(SP) |
| 317 | + lwz r16, 204(SP) |
| 318 | + lwz r15, 208(SP) |
| 319 | +#endif |
| 320 | + |
| 321 | + addi SP, SP, STACKSIZE /* release the frame */ |
| 322 | + |
| 323 | + blr |
| 324 | + |
| 325 | + EPILOGUE |
| 326 | +#endif |
0 commit comments