@@ -52,17 +52,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5252/* Don't change following FR unless you know the effects. */
5353#define res1 $vr19
5454#define res2 $vr20
55+ #define RCP $f2
56+ #define VALPHA $vr3
57+
58+ // snrm2 cannot be made overflow-safe simply by widening the data
59+ // from float to double and summing the squares: LAPACK tests show
60+ // that even the double accumulator can still overflow.
61+ // Instead, find the maximum absolute value of the whole array
62+ // (via samax_k) and multiply every element by its reciprocal before
63+ // squaring; the final square root is then rescaled by that maximum.
64+ // This avoids overflow and does not require widening the data type.
5565
5666 PROLOGUE
5767
5868#ifdef F_INTERFACE
5969 LDINT N, 0 (N)
6070 LDINT INCX, 0 (INCX)
6171#endif
62- vxor.v res1, res1, res1
63- vxor.v res2, res2, res2
6472 bge $r0, N, .L999
6573 beq $r0, INCX, .L999
74+
75+ addi.d $sp, $sp, -64
76+ st.d $ra, $sp, 0
77+ st.d $s0, $sp, 8
78+ st.d $s1, $sp, 16
79+ st.d $s2, $sp, 24
80+ st.d $s3, $sp, 32
81+ st.d N, $sp, 40
82+ st.d X, $sp, 48
83+ st.d INCX, $sp, 56
84+ bl samax_k
85+ ld.d $ra, $sp, 0
86+ ld.d $s0, $sp, 8
87+ ld.d $s1, $sp, 16
88+ ld.d $s2, $sp, 24
89+ ld.d $s3, $sp, 32
90+ ld.d N, $sp, 40
91+ ld.d X, $sp, 48
92+ ld.d INCX, $sp, 56
93+ addi.d $sp, $sp, 64
94+
95+ frecip.s RCP, $f0
96+ vreplvei.w VALPHA, $vr2, 0
97+ vxor.v res1, res1, res1
98+ vxor.v res2, res2, res2
99+ fcmp.ceq.s $fcc0, $f0, $f19
100+ bcnez $fcc0, .L999
66101 li.d TEMP, SIZE
67102 slli.d INCX, INCX, BASE_SHIFT
68103 srai.d I, N, 3
@@ -75,14 +110,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
75110 vld VX5, X, 4 * SIZE
76111 addi.d I, I, -1
77112 addi.d X, X, 8 * SIZE
78- vfcvtl.d.s VX1, VX0
79- vfcvth.d.s VX2, VX0
80- vfcvtl.d.s VX3, VX5
81- vfcvth.d.s VX4, VX5
82- vfmadd.d res1, VX1, VX1, res1
83- vfmadd.d res2, VX2, VX2, res2
84- vfmadd.d res1, VX3, VX3, res1
85- vfmadd.d res2, VX4, VX4, res2
113+
114+ vfmul.s VX0, VX0, VALPHA
115+ vfmul.s VX5, VX5, VALPHA
116+
117+ vfmadd.s res1, VX0, VX0, res1
118+ vfmadd.s res2, VX5, VX5, res2
86119 blt $r0, I, .L10
87120 b .L996
88121 .align 3
@@ -104,10 +137,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
104137 vinsgr2vr.w VX0, t2, 1
105138 vinsgr2vr.w VX0, t3, 2
106139 vinsgr2vr.w VX0, t4, 3
107- vfcvtl.d.s VX1, VX0
108- vfcvth.d.s VX2, VX0
109- vfmadd.d res1, VX1, VX1, res1
110- vfmadd.d res2, VX2, VX2, res2
140+ vfmul.s VX0, VX0, VALPHA
141+ vfmadd.s res1, VX0, VX0, res1
142+
111143 ld.w t1, X, 0
112144 add .d X, X, INCX
113145 ld.w t2, X, 0
@@ -120,19 +152,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
120152 vinsgr2vr.w VX0, t2, 1
121153 vinsgr2vr.w VX0, t3, 2
122154 vinsgr2vr.w VX0, t4, 3
123- vfcvtl.d.s VX3, VX0
124- vfcvth.d.s VX4, VX0
125- vfmadd.d res1, VX3, VX3, res1
126- vfmadd.d res2, VX4, VX4, res2
155+ vfmul.s VX0, VX0, VALPHA
156+ vfmadd.s res2, VX0, VX0, res2
127157 addi.d I, I, -1
128158 blt $r0, I, .L21
129- b .L996
130159 .align 3
131160
132161.L996:
133- vfadd.d res1, res1, res2
134- vreplvei.d VX1, res1, 1
135- vfadd.d res1, VX1, res1
162+ vfadd.s res1, res1, res2
163+ vreplvei.w VX1, res1, 1
164+ vreplvei.w VX2, res1, 2
165+ vreplvei.w VX3, res1, 3
166+ vfadd.s res1, VX1, res1
167+ vfadd.s res1, VX2, res1
168+ vfadd.s res1, VX3, res1
136169 .align 3
137170
138171.L997:
@@ -143,16 +176,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
143176.L998:
144177 fld .s $f15, X, 0
145178 addi.d I, I, -1
146- fcvt.d.s $f15, $f15
147- fmadd.d $f19, $f15, $f15, $f19
179+ fmul .s $f15, $f15, RCP
180+ fmadd.s $f19, $f15, $f15, $f19
148181 add .d X, X, INCX
149182 blt $r0, I, .L998
150183 .align 3
151184
152185.L999:
153- fsqrt .d $f19, $f19
186+ fsqrt .s $f19, $f19
187+ fmul .s $f0, $f19, $f0
154188 move $r4, $r17
155- fcvt.s.d $f0, $f19
156189 jirl $r0, $r1, 0x0
157190 .align 3
158191
0 commit comments