@@ -37,114 +37,114 @@ double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
     vfloat32m2_t vx, vy;
     unsigned int gvl = 0;
     vfloat64m1_t v_res, v_z0;
-    gvl = vsetvlmax_e64m1();
-    v_res = vfmv_v_f_f64m1(0, gvl);
-    v_z0 = vfmv_v_f_f64m1(0, gvl);
+    gvl = __riscv_vsetvlmax_e64m1();
+    v_res = __riscv_vfmv_v_f_f64m1(0, gvl);
+    v_z0 = __riscv_vfmv_v_f_f64m1(0, gvl);
 
     if(inc_x == 1 && inc_y == 1){
-        gvl = vsetvl_e64m4(n);
-        vr = vfmv_v_f_f64m4(0, gvl);
+        gvl = __riscv_vsetvl_e64m4(n);
+        vr = __riscv_vfmv_v_f_f64m4(0, gvl);
         for(i=0,j=0; i<n/gvl; i++){
-            vx = vle32_v_f32m2(&x[j], gvl);
-            vy = vle32_v_f32m2(&y[j], gvl);
-            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+            vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+            vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+            vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
             j += gvl;
         }
         if(j > 0){
-            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+            v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+            dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
         }
         //tail
         if(j < n){
-            gvl = vsetvl_e64m4(n-j);
-            vx = vle32_v_f32m2(&x[j], gvl);
-            vy = vle32_v_f32m2(&y[j], gvl);
-            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-            //vr = vfdot_vv_f32m2(vx, vy, gvl);
-            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+            gvl = __riscv_vsetvl_e64m4(n-j);
+            vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+            vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+            vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+            //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+            vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+            v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+            dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
         }
     }else if(inc_y == 1){
-        gvl = vsetvl_e64m4(n);
-        vr = vfmv_v_f_f64m4(0, gvl);
+        gvl = __riscv_vsetvl_e64m4(n);
+        vr = __riscv_vfmv_v_f_f64m4(0, gvl);
         int stride_x = inc_x * sizeof(FLOAT);
         for(i=0,j=0; i<n/gvl; i++){
-            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-            vy = vle32_v_f32m2(&y[j], gvl);
-            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+            vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+            vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+            vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
             j += gvl;
         }
         if(j > 0){
-            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+            v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+            dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
         }
         //tail
         if(j < n){
-            gvl = vsetvl_e64m4(n-j);
-            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-            vy = vle32_v_f32m2(&y[j], gvl);
-            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-            //vr = vfdot_vv_f32m2(vx, vy, gvl);
-            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+            gvl = __riscv_vsetvl_e64m4(n-j);
+            vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+            vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+            vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+            //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+            vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+            v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+            dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
         }
     }else if(inc_x == 1){
-        gvl = vsetvl_e64m4(n);
-        vr = vfmv_v_f_f64m4(0, gvl);
+        gvl = __riscv_vsetvl_e64m4(n);
+        vr = __riscv_vfmv_v_f_f64m4(0, gvl);
         int stride_y = inc_y * sizeof(FLOAT);
         for(i=0,j=0; i<n/gvl; i++){
-            vx = vle32_v_f32m2(&x[j], gvl);
-            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+            vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+            vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+            vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
             j += gvl;
         }
         if(j > 0){
-            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+            v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+            dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
         }
         //tail
         if(j < n){
-            gvl = vsetvl_e64m4(n-j);
-            vx = vle32_v_f32m2(&x[j], gvl);
-            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-            //vr = vfdot_vv_f32m2(vx, vy, gvl);
-            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+            gvl = __riscv_vsetvl_e64m4(n-j);
+            vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+            vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+            vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+            //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+            vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+            v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+            dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
         }
     }else{
-        gvl = vsetvl_e64m4(n);
-        vr = vfmv_v_f_f64m4(0, gvl);
+        gvl = __riscv_vsetvl_e64m4(n);
+        vr = __riscv_vfmv_v_f_f64m4(0, gvl);
         int stride_x = inc_x * sizeof(FLOAT);
         int stride_y = inc_y * sizeof(FLOAT);
         for(i=0,j=0; i<n/gvl; i++){
-            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+            vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+            vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+            vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
             j += gvl;
         }
         if(j > 0){
-            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+            v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+            dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
         }
         //tail
         if(j < n){
-            gvl = vsetvl_e64m4(n-j);
-            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-            //vr = vfdot_vv_f32m2(vx, vy, gvl);
-            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+            gvl = __riscv_vsetvl_e64m4(n-j);
+            vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+            vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+            vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+            //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+            vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+            v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+            dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
         }
     }
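For reference, the change in this hunk is mechanical: every RVV intrinsic gains the `__riscv_` prefix required by the v1.0 intrinsics API, and the `vfredusum` reduction drops its explicit destination operand (the v1.0 form takes only the source vector, the scalar identity, and the vector length). The sketch below, which assumes `<riscv_vector.h>` and a v1.0-capable toolchain, shows the same widening dot-product pattern in isolation; the helper name `sdot_rvv_example` is hypothetical and not part of OpenBLAS.

```c
#include <riscv_vector.h>
#include <stddef.h>

/* Hypothetical standalone helper (not OpenBLAS code): single-precision dot
 * product accumulated in double, written against the __riscv_-prefixed
 * RVV intrinsics v1.0 that this patch migrates to. */
static double sdot_rvv_example(size_t n, const float *x, const float *y)
{
    double dot = 0.0;
    /* Identity element for the sum reduction, held in an f64m1 register. */
    vfloat64m1_t v_zero = __riscv_vfmv_v_f_f64m1(0.0, __riscv_vsetvlmax_e64m1());

    for (size_t i = 0; i < n; ) {
        size_t vl = __riscv_vsetvl_e32m2(n - i);          /* elements in this strip */
        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
        vfloat32m2_t vy = __riscv_vle32_v_f32m2(&y[i], vl);
        /* Widening FMA: f32m2 * f32m2 accumulated into a zeroed f64m4. */
        vfloat64m4_t vacc = __riscv_vfwmacc_vv_f64m4(
            __riscv_vfmv_v_f_f64m4(0.0, vl), vx, vy, vl);
        /* v1.0 reduction signature: no destination operand, only
         * (source vector, scalar identity, vl). */
        vfloat64m1_t v_res = __riscv_vfredusum_vs_f64m4_f64m1(vacc, v_zero, vl);
        dot += __riscv_vfmv_f_s_f64m1_f64(v_res);
        i += vl;
    }
    return dot;
}
```

Unlike the kernel above, this sketch reduces once per strip instead of carrying a vector accumulator across iterations, which keeps it correct under any tail policy at the cost of an extra reduction per loop.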