Skip to content

Commit 75cb798

Browse files
committed
LoongArch64: Fixed snrm2_lsx.S
1 parent 3c61387 commit 75cb798

File tree

3 files changed

+81
-32
lines changed

3 files changed

+81
-32
lines changed

kernel/arm/nrm2.c

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5252
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
5353
{
5454
BLASLONG i=0;
55+
BLASLONG j=0;
5556
FLOAT scale = 0.0;
56-
FLOAT ssq = 1.0;
57+
FLOAT ssq = 0.0;
5758
FLOAT absxi = 0.0;
59+
FLOAT max = 0.0;
5860

5961

6062
if (n <= 0 || inc_x == 0) return(0.0);
6163
if ( n == 1 ) return( ABS(x[0]) );
6264

6365
n *= inc_x;
66+
// Find Max value
67+
while(abs(j) < abs(n))
68+
{
69+
if (max < ABS(x[j])) max = ABS(x[j]);
70+
71+
j += inc_x;
72+
}
73+
74+
if (max == 0.0) return (0.0);
75+
76+
6477
while(abs(i) < abs(n))
6578
{
6679

80+
#if 0
6781
if ( x[i] != 0.0 )
6882
{
6983
absxi = ABS( x[i] );
@@ -78,9 +92,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
7892
}
7993

8094
}
95+
#endif
96+
ssq += (x[i] / max) * (x[i] / max);
8197
i += inc_x;
8298
}
83-
scale = scale * sqrt( ssq );
99+
scale = max * sqrt( ssq );
84100
return(scale);
85101

86102
}

kernel/loongarch64/KERNEL.LA264

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,12 +130,12 @@ ZROTKERNEL = crot_lsx.S
130130
#CROTKERNEL = ../arm/zrot.c
131131
#ZROTKERNEL = ../arm/zrot.c
132132

133-
#SNRM2KERNEL = snrm2_lsx.S
134-
#DNRM2KERNEL = dnrm2_lsx.S
133+
SNRM2KERNEL = snrm2_lsx.S
134+
DNRM2KERNEL = dnrm2_lsx.S
135135
#CNRM2KERNEL = cnrm2_lsx.S
136136
#ZNRM2KERNEL = znrm2_lsx.S
137-
SNRM2KERNEL = ../arm/nrm2.c
138-
DNRM2KERNEL = ../arm/nrm2.c
137+
#SNRM2KERNEL = ../arm/nrm2.c
138+
#DNRM2KERNEL = ../arm/nrm2.c
139139
CNRM2KERNEL = ../arm/znrm2.c
140140
ZNRM2KERNEL = ../arm/znrm2.c
141141

kernel/loongarch64/snrm2_lsx.S

Lines changed: 59 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -52,17 +52,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5252
/* Don't change following FR unless you know the effects. */
5353
#define res1 $vr19
5454
#define res2 $vr20
55+
#define RCP $f2
56+
#define VALPHA $vr3
57+
58+
// Optimizing snrm2 is not as simple as
59+
// widening the data from float to double and
60+
// then summing the squares: LAPACK tests
61+
// have shown that this approach can still overflow.
62+
// Instead, we find the maximum absolute value of the entire
63+
// array and scale each element by it (multiplying by its reciprocal) before
64+
// accumulating the squares. This avoids overflow without widening the data type.
5565

5666
PROLOGUE
5767

5868
#ifdef F_INTERFACE
5969
LDINT N, 0(N)
6070
LDINT INCX, 0(INCX)
6171
#endif
62-
vxor.v res1, res1, res1
63-
vxor.v res2, res2, res2
6472
bge $r0, N, .L999
6573
beq $r0, INCX, .L999
74+
75+
addi.d $sp, $sp, -64
76+
st.d $ra, $sp, 0
77+
st.d $s0, $sp, 8
78+
st.d $s1, $sp, 16
79+
st.d $s2, $sp, 24
80+
st.d $s3, $sp, 32
81+
st.d N, $sp, 40
82+
st.d X, $sp, 48
83+
st.d INCX, $sp, 56
84+
bl samax_k
85+
ld.d $ra, $sp, 0
86+
ld.d $s0, $sp, 8
87+
ld.d $s1, $sp, 16
88+
ld.d $s2, $sp, 24
89+
ld.d $s3, $sp, 32
90+
ld.d N, $sp, 40
91+
ld.d X, $sp, 48
92+
ld.d INCX, $sp, 56
93+
addi.d $sp, $sp, 64
94+
95+
frecip.s RCP, $f0
96+
vreplvei.w VALPHA, $vr2, 0
97+
vxor.v res1, res1, res1
98+
vxor.v res2, res2, res2
99+
fcmp.ceq.s $fcc0, $f0, $f19
100+
bcnez $fcc0, .L999
66101
li.d TEMP, SIZE
67102
slli.d INCX, INCX, BASE_SHIFT
68103
srai.d I, N, 3
@@ -75,14 +110,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
75110
vld VX5, X, 4 * SIZE
76111
addi.d I, I, -1
77112
addi.d X, X, 8 * SIZE
78-
vfcvtl.d.s VX1, VX0
79-
vfcvth.d.s VX2, VX0
80-
vfcvtl.d.s VX3, VX5
81-
vfcvth.d.s VX4, VX5
82-
vfmadd.d res1, VX1, VX1, res1
83-
vfmadd.d res2, VX2, VX2, res2
84-
vfmadd.d res1, VX3, VX3, res1
85-
vfmadd.d res2, VX4, VX4, res2
113+
114+
vfmul.s VX0, VX0, VALPHA
115+
vfmul.s VX5, VX5, VALPHA
116+
117+
vfmadd.s res1, VX0, VX0, res1
118+
vfmadd.s res2, VX5, VX5, res2
86119
blt $r0, I, .L10
87120
b .L996
88121
.align 3
@@ -104,10 +137,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
104137
vinsgr2vr.w VX0, t2, 1
105138
vinsgr2vr.w VX0, t3, 2
106139
vinsgr2vr.w VX0, t4, 3
107-
vfcvtl.d.s VX1, VX0
108-
vfcvth.d.s VX2, VX0
109-
vfmadd.d res1, VX1, VX1, res1
110-
vfmadd.d res2, VX2, VX2, res2
140+
vfmul.s VX0, VX0, VALPHA
141+
vfmadd.s res1, VX0, VX0, res1
142+
111143
ld.w t1, X, 0
112144
add.d X, X, INCX
113145
ld.w t2, X, 0
@@ -120,19 +152,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
120152
vinsgr2vr.w VX0, t2, 1
121153
vinsgr2vr.w VX0, t3, 2
122154
vinsgr2vr.w VX0, t4, 3
123-
vfcvtl.d.s VX3, VX0
124-
vfcvth.d.s VX4, VX0
125-
vfmadd.d res1, VX3, VX3, res1
126-
vfmadd.d res2, VX4, VX4, res2
155+
vfmul.s VX0, VX0, VALPHA
156+
vfmadd.s res2, VX0, VX0, res2
127157
addi.d I, I, -1
128158
blt $r0, I, .L21
129-
b .L996
130159
.align 3
131160

132161
.L996:
133-
vfadd.d res1, res1, res2
134-
vreplvei.d VX1, res1, 1
135-
vfadd.d res1, VX1, res1
162+
vfadd.s res1, res1, res2
163+
vreplvei.w VX1, res1, 1
164+
vreplvei.w VX2, res1, 2
165+
vreplvei.w VX3, res1, 3
166+
vfadd.s res1, VX1, res1
167+
vfadd.s res1, VX2, res1
168+
vfadd.s res1, VX3, res1
136169
.align 3
137170

138171
.L997:
@@ -143,16 +176,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
143176
.L998:
144177
fld.s $f15, X, 0
145178
addi.d I, I, -1
146-
fcvt.d.s $f15, $f15
147-
fmadd.d $f19, $f15, $f15, $f19
179+
fmul.s $f15, $f15, RCP
180+
fmadd.s $f19, $f15, $f15, $f19
148181
add.d X, X, INCX
149182
blt $r0, I, .L998
150183
.align 3
151184

152185
.L999:
153-
fsqrt.d $f19, $f19
186+
fsqrt.s $f19, $f19
187+
fmul.s $f0, $f19, $f0
154188
move $r4, $r17
155-
fcvt.s.d $f0, $f19
156189
jirl $r0, $r1, 0x0
157190
.align 3
158191

0 commit comments

Comments
 (0)