@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ALPHAI $f1
 #define X      $r7
 #define INCX   $r8
+#define DUMMY2 $r9

 #define I      $r12
 #define TEMP   $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

     bge     $r0, N, .L999
     bge     $r0, INCX, .L999
+    ld.d    DUMMY2, $sp, 0
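+    // dummy2 is the last argument of the scal kernel; on LoongArch64 it
+    // presumably arrives on the stack, and the unused $r9 is reused for it.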
     li.d    TEMP, 1
     movgr2fr.d a1, $r0
     FFINT   a1, a1
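+    // a1 = 0.0, the reference value for the alpha_r/alpha_i == 0 tests below.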
@@ -86,24 +88,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     bne     INCX, TEMP, .L22

+/////// INCX == 1 ////////
 .L11:
-    bge     $r0, I, .L997
     CMPEQ   $fcc0, ALPHAR, a1
     CMPEQ   $fcc1, ALPHAI, a1
+    bge     $r0, I, .L19
+/////// INCX == 1 && N >= 4 (double) or N >= 8 (float) ////////
+    bnez    DUMMY2, .L17    // if DUMMY2 == 1, called from c/zscal.
+
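+    // $fcc0 = (alpha_r == 0.0), $fcc1 = (alpha_i == 0.0); the branches below
+    // select .L15 (alpha == 0), .L16 (alpha_i == 0), .L17 (general case),
+    // or .L18 (alpha_r == 0).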
     bceqz   $fcc0, .L13
     b       .L14
     .align 3

 .L13:
-    bceqz   $fcc1, .L114    //alpha_r != 0.0 && alpha_i != 0.0
-    b       .L113           //alpha_r != 0.0 && alpha_i == 0.0
+    bceqz   $fcc1, .L17     //alpha_r != 0.0 && alpha_i != 0.0
+    b       .L16            //alpha_r != 0.0 && alpha_i == 0.0

 .L14:
-    bceqz   $fcc1, .L114    //alpha_r == 0.0 && alpha_i != 0.0
-    b       .L111           //alpha_r == 0.0 && alpha_i == 0.0
+    bceqz   $fcc1, .L18     //alpha_r == 0.0 && alpha_i != 0.0
+    b       .L15            //alpha_r == 0.0 && alpha_i == 0.0
     .align 3

-.L111:  //alpha_r == 0.0 && alpha_i == 0.0
+.L15:   //alpha_r == 0.0 && alpha_i == 0.0
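+    // alpha == 0.0: overwrite X with the zero vector VXZ.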
     xvst    VXZ, X, 0 * SIZE
 #ifdef DOUBLE
     xvst    VXZ, X, 4 * SIZE
@@ -113,11 +119,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d  X, X, 16 * SIZE
 #endif
     addi.d  I, I, -1
-    blt     $r0, I, .L111
-    b       .L997
+    blt     $r0, I, .L15
+    b       .L19
     .align 3

-.L113:  //alpha_r != 0.0 && alpha_i == 0.0
+.L16:   //alpha_r != 0.0 && alpha_i == 0.0
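+    // alpha_i == 0.0: both components scale by alpha_r alone.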
     xvld    VX0, X, 0 * SIZE
 #ifdef DOUBLE
     xvld    VX1, X, 4 * SIZE
@@ -143,11 +149,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d  X, X, 16 * SIZE
 #endif
     addi.d  I, I, -1
-    blt     $r0, I, .L113
-    b       .L997
+    blt     $r0, I, .L16
+    b       .L19
     .align 3

-.L114:  //alpha_r != 0.0 && alpha_i != 0.0
+.L17:   //alpha_r != 0.0 && alpha_i != 0.0
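+    // General case: full complex product y_r = ar*x_r - ai*x_i,
+    // y_i = ai*x_r + ar*x_i, using the same deinterleave/re-interleave
+    // pattern (xvpickev/xvpickod, xvilvl/xvilvh) visible in .L18 below.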
     xvld    VX0, X, 0 * SIZE
 #ifdef DOUBLE
     xvld    VX1, X, 4 * SIZE
@@ -177,29 +183,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d  X, X, 16 * SIZE
 #endif
     addi.d  I, I, -1
-    blt     $r0, I, .L114
-    b       .L997
+    blt     $r0, I, .L17
+    b       .L19
+    .align 3
+
+.L18:   //alpha_r == 0.0 && alpha_i != 0.0
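+    // alpha_r == 0: y_r = -ai*x_i, y_i = ai*x_r. x1/x2 hold the real and
+    // imaginary lanes; x3 = 0 - VXAI*x2 and x4 = VXAI*x1 are re-interleaved.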
+    xvld    VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld    VX1, X, 4 * SIZE
+    xvpickev.d  x1, VX1, VX0
+    xvpickod.d  x2, VX1, VX0
+    xvfmul.d    x3, VXAI, x2
+    xvfsub.d    x3, VXZ, x3
+    xvfmul.d    x4, VXAI, x1
+    xvilvl.d    VX2, x4, x3
+    xvilvh.d    VX3, x4, x3
+    xvst    VX2, X, 0 * SIZE
+    xvst    VX3, X, 4 * SIZE
+    addi.d  X, X, 8 * SIZE
+#else
+    xvld    VX1, X, 8 * SIZE
+    xvpickev.w  x1, VX1, VX0
+    xvpickod.w  x2, VX1, VX0
+    xvfmul.s    x3, VXAI, x2
+    xvfsub.s    x3, VXZ, x3
+    xvfmul.s    x4, VXAI, x1
+    xvilvl.w    VX2, x4, x3
+    xvilvh.w    VX3, x4, x3
+    xvst    VX2, X, 0 * SIZE
+    xvst    VX3, X, 8 * SIZE
+    addi.d  X, X, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt     $r0, I, .L18
+    b       .L19
+    .align 3
+
+/////// INCX == 1, leftover N % 4 (double) or N % 8 (float) ///////
+.L19:
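+    // I = complex elements left over after the vector loop.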
+#ifdef DOUBLE
+    andi    I, N, 3
+#else
+    andi    I, N, 7
+#endif
+    beqz    I, .L999
+    bnez    DUMMY2, .L998   // if DUMMY2 == 1, called from c/zscal.
+
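+    // $fcc0/$fcc1 still hold the alpha comparisons from the top of .L11.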
+    bceqz   $fcc0, .L13_1
+    b       .L14_1
+
+.L13_1:
+    bceqz   $fcc1, .L998    // alpha_r != 0.0 && alpha_i != 0.0
+    b       .L997           // alpha_r != 0.0 && alpha_i == 0.0
+
+.L14_1:
+    bceqz   $fcc1, .L996    // alpha_r == 0.0 && alpha_i != 0.0
+    b       .L995           // alpha_r == 0.0 && alpha_i == 0.0
     .align 3

+/////// INCX != 1 ////////
 .L22:
-    bge     $r0, I, .L997
-    move    XX, X
     CMPEQ   $fcc0, ALPHAR, a1
     CMPEQ   $fcc1, ALPHAI, a1
+    move    XX, X
+    bge     $r0, I, .L29
+    bnez    DUMMY2, .L25    // if DUMMY2 == 1, called from c/zscal.
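+    // Same four-way dispatch as the INCX == 1 path, but elements are
+    // gathered into lanes with ld + xvinsgr2vr and scattered with xvstelm.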
     bceqz   $fcc0, .L23
     b       .L24
     .align 3

 .L23:
-    bceqz   $fcc1, .L224    //alpha_r != 0.0 && alpha_i != 0.0
-    b       .L223           //alpha_r != 0.0 && alpha_i == 0.0
+    bceqz   $fcc1, .L25     //alpha_r != 0.0 && alpha_i != 0.0
+    b       .L26            //alpha_r != 0.0 && alpha_i == 0.0

 .L24:
-    bceqz   $fcc1, .L224    //alpha_r == 0.0 && alpha_i != 0.0
-    b       .L221           //alpha_r == 0.0 && alpha_i == 0.0
+    bceqz   $fcc1, .L28     //alpha_r == 0.0 && alpha_i != 0.0
+    b       .L27            //alpha_r == 0.0 && alpha_i == 0.0
     .align 3

-.L221:  //alpha_r == 0.0 && alpha_i == 0.0
+.L27:   //alpha_r == 0.0 && alpha_i == 0.0
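+    // alpha == 0.0: store zeros element by element at stride INCX.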
 #ifdef DOUBLE
     xvstelm.d   VXZ, X, 0, 0
     xvstelm.d   VXZ, X, 1 * SIZE, 0
@@ -239,11 +301,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     add.d   X, X, INCX
     addi.d  I, I, -1
-    blt     $r0, I, .L221
-    b       .L997
+    blt     $r0, I, .L27
+    b       .L29
     .align 3

-.L223:  //alpha_r != 0.0 && alpha_i == 0.0
+.L26:   //alpha_r != 0.0 && alpha_i == 0.0
 #ifdef DOUBLE
     ld.d    t1, X, 0 * SIZE
     ld.d    t2, X, 1 * SIZE
@@ -350,11 +412,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvstelm.w   x4, XX, 1 * SIZE, 7
 #endif
     add.d   XX, XX, INCX
-    blt     $r0, I, .L223
-    b       .L997
+    blt     $r0, I, .L26
+    b       .L29
     .align 3

-.L224:  //alpha_r != 0.0 && alpha_i != 0.0
+.L25:   //alpha_r != 0.0 && alpha_i != 0.0
 #ifdef DOUBLE
     ld.d    t1, X, 0 * SIZE
     ld.d    t2, X, 1 * SIZE
@@ -465,20 +527,176 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvstelm.w   x4, XX, 1 * SIZE, 7
 #endif
     add.d   XX, XX, INCX
-    blt     $r0, I, .L224
-    b       .L997
+    blt     $r0, I, .L25
+    b       .L29
     .align 3

-.L997:
+.L28:   //alpha_r == 0.0 && alpha_i != 0.0
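+    // Strided variant of .L18: gather 4 (double) / 8 (float) complex values
+    // into x1 (real) and x2 (imag), form x3 = -VXAI*x2 and x4 = VXAI*x1,
+    // then scatter the pairs back with xvstelm at stride INCX.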
 #ifdef DOUBLE
-    andi    I, N, 3
+    ld.d    t1, X, 0 * SIZE
+    ld.d    t2, X, 1 * SIZE
+    add.d   X, X, INCX
+    ld.d    t3, X, 0 * SIZE
+    ld.d    t4, X, 1 * SIZE
+    add.d   X, X, INCX
+    xvinsgr2vr.d    x1, t1, 0
+    xvinsgr2vr.d    x2, t2, 0
+    xvinsgr2vr.d    x1, t3, 1
+    xvinsgr2vr.d    x2, t4, 1
+    ld.d    t1, X, 0 * SIZE
+    ld.d    t2, X, 1 * SIZE
+    add.d   X, X, INCX
+    ld.d    t3, X, 0 * SIZE
+    ld.d    t4, X, 1 * SIZE
+    xvinsgr2vr.d    x1, t1, 2
+    xvinsgr2vr.d    x2, t2, 2
+    xvinsgr2vr.d    x1, t3, 3
+    xvinsgr2vr.d    x2, t4, 3
+    add.d   X, X, INCX
+
+    xvfmul.d    x3, VXAI, x2
+    xvfsub.d    x3, VXZ, x3
+    xvfmul.d    x4, VXAI, x1
+    addi.d  I, I, -1
+    xvstelm.d   x3, XX, 0 * SIZE, 0
+    xvstelm.d   x4, XX, 1 * SIZE, 0
+    add.d   XX, XX, INCX
+    xvstelm.d   x3, XX, 0 * SIZE, 1
+    xvstelm.d   x4, XX, 1 * SIZE, 1
+    add.d   XX, XX, INCX
+    xvstelm.d   x3, XX, 0 * SIZE, 2
+    xvstelm.d   x4, XX, 1 * SIZE, 2
+    add.d   XX, XX, INCX
+    xvstelm.d   x3, XX, 0 * SIZE, 3
+    xvstelm.d   x4, XX, 1 * SIZE, 3
 #else
-    andi    I, N, 7
+    ld.w    t1, X, 0 * SIZE
+    ld.w    t2, X, 1 * SIZE
+    add.d   X, X, INCX
+    ld.w    t3, X, 0 * SIZE
+    ld.w    t4, X, 1 * SIZE
+    add.d   X, X, INCX
+    xvinsgr2vr.w    x1, t1, 0
+    xvinsgr2vr.w    x2, t2, 0
+    xvinsgr2vr.w    x1, t3, 1
+    xvinsgr2vr.w    x2, t4, 1
+    ld.w    t1, X, 0 * SIZE
+    ld.w    t2, X, 1 * SIZE
+    add.d   X, X, INCX
+    ld.w    t3, X, 0 * SIZE
+    ld.w    t4, X, 1 * SIZE
+    xvinsgr2vr.w    x1, t1, 2
+    xvinsgr2vr.w    x2, t2, 2
+    xvinsgr2vr.w    x1, t3, 3
+    xvinsgr2vr.w    x2, t4, 3
+    add.d   X, X, INCX
+    ld.w    t1, X, 0 * SIZE
+    ld.w    t2, X, 1 * SIZE
+    add.d   X, X, INCX
+    ld.w    t3, X, 0 * SIZE
+    ld.w    t4, X, 1 * SIZE
+    add.d   X, X, INCX
+    xvinsgr2vr.w    x1, t1, 4
+    xvinsgr2vr.w    x2, t2, 4
+    xvinsgr2vr.w    x1, t3, 5
+    xvinsgr2vr.w    x2, t4, 5
+    ld.w    t1, X, 0 * SIZE
+    ld.w    t2, X, 1 * SIZE
+    add.d   X, X, INCX
+    ld.w    t3, X, 0 * SIZE
+    ld.w    t4, X, 1 * SIZE
+    xvinsgr2vr.w    x1, t1, 6
+    xvinsgr2vr.w    x2, t2, 6
+    xvinsgr2vr.w    x1, t3, 7
+    xvinsgr2vr.w    x2, t4, 7
+    add.d   X, X, INCX
+
+    xvfmul.s    x3, VXAI, x2
+    xvfsub.s    x3, VXZ, x3
+    xvfmul.s    x4, VXAI, x1
+    addi.d  I, I, -1
+    xvstelm.w   x3, XX, 0 * SIZE, 0
+    xvstelm.w   x4, XX, 1 * SIZE, 0
+    add.d   XX, XX, INCX
+    xvstelm.w   x3, XX, 0 * SIZE, 1
+    xvstelm.w   x4, XX, 1 * SIZE, 1
+    add.d   XX, XX, INCX
+    xvstelm.w   x3, XX, 0 * SIZE, 2
+    xvstelm.w   x4, XX, 1 * SIZE, 2
+    add.d   XX, XX, INCX
+    xvstelm.w   x3, XX, 0 * SIZE, 3
+    xvstelm.w   x4, XX, 1 * SIZE, 3
+    add.d   XX, XX, INCX
+    xvstelm.w   x3, XX, 0 * SIZE, 4
+    xvstelm.w   x4, XX, 1 * SIZE, 4
+    add.d   XX, XX, INCX
+    xvstelm.w   x3, XX, 0 * SIZE, 5
+    xvstelm.w   x4, XX, 1 * SIZE, 5
+    add.d   XX, XX, INCX
+    xvstelm.w   x3, XX, 0 * SIZE, 6
+    xvstelm.w   x4, XX, 1 * SIZE, 6
+    add.d   XX, XX, INCX
+    xvstelm.w   x3, XX, 0 * SIZE, 7
+    xvstelm.w   x4, XX, 1 * SIZE, 7
 #endif
-    bge     $r0, I, .L999
+    add.d   XX, XX, INCX
+    blt     $r0, I, .L28
+    b       .L29
     .align 3

-.L998:
+/////// INCX != 1, leftover N % 4 (double) or N % 8 (float) ///////
+.L29:
+#ifdef DOUBLE
+    andi    I, N, 3
+#else
+    andi    I, N, 7
+#endif
+    beqz    I, .L999
+    bnez    DUMMY2, .L998   // if DUMMY2 == 1, called from c/zscal.
+
+    bceqz   $fcc0, .L23_1
+    b       .L24_1
+
+.L23_1:
+    bceqz   $fcc1, .L998    // alpha_r != 0.0 && alpha_i != 0.0
+    b       .L997           // alpha_r != 0.0 && alpha_i == 0.0
+
+.L24_1:
+    bceqz   $fcc1, .L996    // alpha_r == 0.0 && alpha_i != 0.0
+    b       .L995           // alpha_r == 0.0 && alpha_i == 0.0
+    .align 3
+
+.L995:  // alpha_r == 0.0 && alpha_i == 0.0
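+    // Scalar tail: store 0 + 0i (a1 is still 0.0), one element per iteration.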
+    ST      a1, X, 0 * SIZE
+    ST      a1, X, 1 * SIZE
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    blt     $r0, I, .L995
+    b       .L999
+.L996:  // alpha_r == 0.0 && alpha_i != 0.0
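+    // s1 = $f12 - ALPHAI*x_i; the intended result -ALPHAI*x_i relies on
+    // $f12 holding 0.0 at this point.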
+    LD      a1, X, 0 * SIZE
+    LD      a2, X, 1 * SIZE
+    addi.d  I, I, -1
+    MUL     s1, ALPHAI, a2
+    MUL     s2, ALPHAI, a1
+    SUB     s1, $f12, s1
+    ST      s1, X, 0 * SIZE
+    ST      s2, X, 1 * SIZE
+    add.d   X, X, INCX
+    blt     $r0, I, .L996
+    b       .L999
+.L997:  // alpha_r != 0.0 && alpha_i == 0.0
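+    // alpha_i == 0.0: s1 = ALPHAR*x_r, s2 = ALPHAR*x_i.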
+    LD      a1, X, 0 * SIZE
+    LD      a2, X, 1 * SIZE
+    addi.d  I, I, -1
+    MUL     s1, ALPHAR, a1
+    MUL     s2, ALPHAR, a2
+    ST      s1, X, 0 * SIZE
+    ST      s2, X, 1 * SIZE
+    add.d   X, X, INCX
+    blt     $r0, I, .L997
+    b       .L999
+.L998:  // alpha_r != 0.0 && alpha_i != 0.0, one by one
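+    // Full scalar complex multiply; the elided lines presumably compute
+    // s1 = ALPHAR*x_r - ALPHAI*x_i and s2 = ALPHAR*x_i + ALPHAI*x_r.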
     LD      a1, X, 0 * SIZE
     LD      a2, X, 1 * SIZE
     addi.d  I, I, -1
@@ -490,7 +708,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ST      s2, X, 1 * SIZE
     add.d   X, X, INCX
     blt     $r0, I, .L998
-    .align 3
+    b       .L999

 .L999:
     move    $r4, $r12