@@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
 #else
 #define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define VSETVL_MAX __riscv_vsetvlmax_e64m1()
@@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@@ -143,49 +147,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                 iy += inc_yv;
                 ia += inc_av;
             }
-            v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
-            temp_r2 = VFMVFS_FLOAT(v_res);
-            v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
-            temp_i2 = VFMVFS_FLOAT(v_res);
+
             if(i < m){
-                gvl = VSETVL(m - i);
-                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
-                va1 = VLSEV_FLOAT(&a_ptr[ia + 1], stride_a, gvl);
-                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
-                vy1 = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl);
+                unsigned int gvl_rem = VSETVL(m - i);
+                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem);
+                va1 = VLSEV_FLOAT(&a_ptr[ia + 1], stride_a, gvl_rem);
+                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem);
+                vy1 = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl_rem);
 #ifndef HEMVREV
-                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
-                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
-                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
-                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
 #else
-                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
-                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
-                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
-                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
 #endif
-                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
-                VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl);
+                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem);
+                VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl_rem);
 
-                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                vx1 = VLSEV_FLOAT(&x[ix + 1], stride_x, gvl);
+                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem);
+                vx1 = VLSEV_FLOAT(&x[ix + 1], stride_x, gvl_rem);
 #ifndef HEMVREV
-                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
-                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
-                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
-                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
 #else
-                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
-                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
-                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
-                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
 #endif
-
-                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
-                temp_r2 += VFMVFS_FLOAT(v_res);
-                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
-                temp_i2 += VFMVFS_FLOAT(v_res);
             }
+            v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
+            temp_r2 = VFMVFS_FLOAT(v_res);
+            v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
+            temp_i2 = VFMVFS_FLOAT(v_res);
         }
         y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2;
         y[jy + 1] += alpha_r * temp_i2 + alpha_i * temp_r2;
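
The idea behind this change: the tail-undisturbed (`_tu`) intrinsics leave lanes at index >= vl of the destination register untouched, so the remainder iteration can accumulate into the same `vr0`/`vr1` registers the main loop used, and the two `VFREDSUM_FLOAT` reductions run once after the `if(i < m)` block over the main-loop `gvl` instead of once per branch. Below is a minimal standalone sketch of that pattern, assuming the RVV v0.11+ intrinsics from `<riscv_vector.h>`; the `dot_f32_tu` helper is hypothetical and not part of the patch.

#include <riscv_vector.h>
#include <stddef.h>

/* Hypothetical helper illustrating the _tu accumulation pattern:
 * the tail iteration reuses the main loop's accumulator, and a
 * single reduction over the full register covers both loops. */
static float dot_f32_tu(const float *x, const float *y, size_t n)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    /* Zero every lane of the accumulator once, up front. */
    vfloat32m4_t vr = __riscv_vfmv_v_f_f32m4(0.0f, vlmax);

    size_t i = 0;
    for (; i + vlmax <= n; i += vlmax) {
        vfloat32m4_t vx = __riscv_vle32_v_f32m4(&x[i], vlmax);
        vfloat32m4_t vy = __riscv_vle32_v_f32m4(&y[i], vlmax);
        vr = __riscv_vfmacc_vv_f32m4(vr, vx, vy, vlmax);
    }
    if (i < n) {
        size_t vl_rem = __riscv_vsetvl_e32m4(n - i);
        vfloat32m4_t vx = __riscv_vle32_v_f32m4(&x[i], vl_rem);
        vfloat32m4_t vy = __riscv_vle32_v_f32m4(&y[i], vl_rem);
        /* _tu: lanes >= vl_rem keep the main loop's partial sums. */
        vr = __riscv_vfmacc_vv_f32m4_tu(vr, vx, vy, vl_rem);
    }
    /* One reduction over vlmax lanes replaces per-branch reductions. */
    vfloat32m1_t vz = __riscv_vfmv_v_f_f32m1(0.0f, __riscv_vsetvlmax_e32m1());
    vfloat32m1_t vs = __riscv_vfredusum_vs_f32m4_f32m1(vr, vz, vlmax);
    return __riscv_vfmv_f_s_f32m1_f32(vs);
}

The same reasoning explains the `gvl_rem` rename in the diff: the remainder's vector length must not clobber `gvl`, because the final reduction still needs the main loop's full vector length to sum the lanes the `_tu` operations preserved.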