Skip to content

Commit d3bf5a5

Browse files
committed
Combine the two reduction operations of zhe/symv into one by using tail-undisturbed (_tu) intrinsics for the remainder loop, so the tail elements of the accumulator vectors are left undisturbed.
1 parent 18d7afe commit d3bf5a5

File tree

4 files changed

+119
-119
lines changed

4 files changed

+119
-119
lines changed

kernel/riscv64/zhemv_LM_rvv.c

Lines changed: 34 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3636
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
3737
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
3838
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
39+
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
3940
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
4041
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
4142
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
4243
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
4344
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
4445
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
46+
#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
4547
#else
4648
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
4749
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
@@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5254
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
5355
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
5456
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
57+
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
5558
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
5659
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
5760
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
5861
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
5962
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
6063
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
64+
#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
6165
#endif
6266

6367
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@@ -143,49 +147,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
143147
iy += inc_yv;
144148
ia += inc_av;
145149
}
146-
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
147-
temp_r2 = VFMVFS_FLOAT(v_res);
148-
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
149-
temp_i2 = VFMVFS_FLOAT(v_res);
150+
150151
if(i < m){
151-
gvl = VSETVL(m-i);
152-
va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
153-
va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
154-
vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
155-
vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
152+
unsigned int gvl_rem = VSETVL(m-i);
153+
va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem);
154+
va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem);
155+
vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem);
156+
vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem);
156157
#ifndef HEMVREV
157-
vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
158-
vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
159-
vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
160-
vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
158+
vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
159+
vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
160+
vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
161+
vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
161162
#else
162-
vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
163-
vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
164-
vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
165-
vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
163+
vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
164+
vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
165+
vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
166+
vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
166167
#endif
167-
VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
168-
VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
168+
VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem);
169+
VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem);
169170

170-
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
171-
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
171+
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem);
172+
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem);
172173
#ifndef HEMVREV
173-
vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
174-
vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
175-
vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
176-
vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
174+
vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
175+
vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
176+
vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
177+
vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
177178
#else
178-
vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
179-
vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
180-
vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
181-
vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
179+
vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
180+
vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
181+
vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
182+
vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
182183
#endif
183-
184-
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
185-
temp_r2 += VFMVFS_FLOAT(v_res);
186-
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
187-
temp_i2 += VFMVFS_FLOAT(v_res);
188184
}
185+
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
186+
temp_r2 = VFMVFS_FLOAT(v_res);
187+
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
188+
temp_i2 = VFMVFS_FLOAT(v_res);
189189
}
190190
y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2;
191191
y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2;

kernel/riscv64/zhemv_UV_rvv.c

Lines changed: 34 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3636
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
3737
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
3838
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
39+
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
3940
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
4041
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
4142
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
4243
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
4344
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
4445
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
46+
#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
4547
#else
4648
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
4749
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
@@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5254
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
5355
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
5456
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
57+
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
5558
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
5659
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
5760
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
5861
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
5962
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
6063
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
64+
#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
6165
#endif
6266

6367
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@@ -142,49 +146,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
142146
iy += inc_yv;
143147
ia += inc_av;
144148
}
145-
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
146-
temp_r2 = VFMVFS_FLOAT(v_res);
147-
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
148-
temp_i2 = VFMVFS_FLOAT(v_res);
149+
149150
if(i < j){
150-
gvl = VSETVL(j-i);
151-
va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
152-
va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
153-
vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
154-
vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
151+
unsigned int gvl_rem = VSETVL(j-i);
152+
va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem);
153+
va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem);
154+
vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem);
155+
vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem);
155156
#ifndef HEMVREV
156-
vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
157-
vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
158-
vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
159-
vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
157+
vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
158+
vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
159+
vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
160+
vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
160161
#else
161-
vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
162-
vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
163-
vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
164-
vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
162+
vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
163+
vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
164+
vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
165+
vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
165166
#endif
166-
VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
167-
VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
167+
VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem);
168+
VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem);
168169

169-
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
170-
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
170+
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem);
171+
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem);
171172
#ifndef HEMVREV
172-
vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
173-
vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
174-
vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
175-
vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
173+
vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
174+
vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
175+
vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
176+
vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
176177
#else
177-
vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
178-
vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
179-
vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
180-
vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
178+
vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
179+
vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
180+
vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
181+
vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
181182
#endif
182-
183-
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
184-
temp_r2 += VFMVFS_FLOAT(v_res);
185-
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
186-
temp_i2 += VFMVFS_FLOAT(v_res);
187183
}
184+
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
185+
temp_r2 = VFMVFS_FLOAT(v_res);
186+
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
187+
temp_i2 = VFMVFS_FLOAT(v_res);
188188
}
189189
y[jy] += temp_r1 * a_ptr[ja];
190190
y[jy+1] += temp_i1 * a_ptr[ja];

kernel/riscv64/zsymv_L_rvv.c

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3838
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
3939
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
4040
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
41+
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
42+
#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
4143
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
4244
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
4345
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
@@ -57,6 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5759
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
5860
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
5961
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
62+
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
63+
#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
6064
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
6165
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
6266
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
@@ -133,38 +137,34 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
133137
ix += inc_xv;
134138
iy += inc_yv;
135139
}
136-
v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
137-
temp2[0] = VFMVFS_FLOAT_M1(v_res);
138-
v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
139-
temp2[1] = VFMVFS_FLOAT_M1(v_res);
140140

141141
if(i < m){
142-
gvl = VSETVL(m-i);
143-
vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl);
144-
vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl);
145-
va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl);
146-
va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl);
142+
unsigned int gvl_rem = VSETVL(m-i);
143+
vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem);
144+
vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem);
145+
va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem);
146+
va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem);
147147

148-
vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl);
149-
vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl);
150-
vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl);
151-
vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl);
148+
vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem);
149+
vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem);
150+
vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem);
151+
vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem);
152152

153-
VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl);
154-
VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl);
153+
VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem);
154+
VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem);
155155

156-
vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl);
157-
vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl);
158-
vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl);
159-
vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl);
160-
vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl);
161-
vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl);
156+
vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem);
157+
vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem);
158+
vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem);
159+
vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem);
160+
vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem);
161+
vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem);
162162

163-
v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
164-
temp2[0] += VFMVFS_FLOAT_M1(v_res);
165-
v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
166-
temp2[1] += VFMVFS_FLOAT_M1(v_res);
167163
}
164+
v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
165+
temp2[0] = VFMVFS_FLOAT_M1(v_res);
166+
v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
167+
temp2[1] = VFMVFS_FLOAT_M1(v_res);
168168
}
169169
y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1];
170170
y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0];

0 commit comments

Comments
 (0)