@@ -28,35 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828#include "common.h"
2929
3030#if !defined(DOUBLE )
31- #define VSETVL (n ) __riscv_vsetvl_e32m1(n)
32- #define FLOAT_V_T vfloat32m1_t
33- #define FLOAT_VX2_T vfloat32m1x2_t
34- #define FLOAT_VX4_T vfloat32m1x4_t
35- #define FLOAT_VX8_T vfloat32m1x8_t
36- #define VLEV_FLOAT __riscv_vle32_v_f32m1
37- #define VLSEV_FLOAT __riscv_vlse32_v_f32m1
38- #define VSEV_FLOAT __riscv_vse32_v_f32m1
39- #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2
40- #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
41- #define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4
42- #define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
43- #define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8
44- #define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
31+ #define FLOAT_V_T vfloat32m2_t
32+ #define FLOAT_V_T_HALF vfloat32m1_t
33+ #define VLEV_FLOAT __riscv_vle32_v_f32m2
34+ #define VLEV_FLOAT_HALF __riscv_vle32_v_f32m1
35+ #define VSEV_FLOAT __riscv_vse32_v_f32m2
36+ #define VSEV_FLOAT_HALF __riscv_vse32_v_f32m1
4537#else
46- #define VSETVL (n ) __riscv_vsetvl_e64m1(n)
47- #define FLOAT_V_T vfloat64m1_t
48- #define FLOAT_VX2_T vfloat64m1x2_t
49- #define FLOAT_VX4_T vfloat64m1x4_t
50- #define FLOAT_VX8_T vfloat64m1x8_t
51- #define VLEV_FLOAT __riscv_vle64_v_f64m1
52- #define VLSEV_FLOAT __riscv_vlse64_v_f64m1
53- #define VSEV_FLOAT __riscv_vse64_v_f64m1
54- #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2
55- #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
56- #define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4
57- #define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
58- #define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8
59- #define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
38+ #define FLOAT_V_T vfloat64m4_t
39+ #define FLOAT_V_T_HALF vfloat64m2_t
40+ #define VLEV_FLOAT __riscv_vle64_v_f64m4
41+ #define VLEV_FLOAT_HALF __riscv_vle64_v_f64m2
42+ #define VSEV_FLOAT __riscv_vse64_v_f64m4
43+ #define VSEV_FLOAT_HALF __riscv_vse64_v_f64m2
6044#endif
6145
6246int CNAME (BLASLONG m , BLASLONG n , IFLOAT * a , BLASLONG lda , IFLOAT * b )
@@ -69,9 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
6953 IFLOAT * boffset , * boffset1 , * boffset2 , * boffset3 , * boffset4 ;
7054
7155 FLOAT_V_T v0 ;
72- FLOAT_VX2_T vx2 ;
73- FLOAT_VX4_T vx4 ;
74- FLOAT_VX8_T vx8 ;
56+ FLOAT_V_T_HALF v1 ;
7557
7658 // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
7759
@@ -81,156 +63,12 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
8163 boffset3 = b + m * (n & ~3 );
8264 boffset4 = b + m * (n & ~1 );
8365
84- for (j = (m >> 3 ); j > 0 ; j -- ) {
85-
86- aoffset1 = aoffset ;
87- aoffset += 8 * lda ;
88-
89- boffset1 = boffset ;
90- boffset += 64 ;
91-
92- for (i = (n >> 3 ); i > 0 ; i -- ) {
93- size_t vl = 8 ;
94-
95- vx8 = VLSSEG8_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
96- VSSEG8_FLOAT (boffset1 , vx8 , vl );
97-
98- aoffset1 += 8 ;
99- boffset1 += m * 8 ;
100- }
101-
102- if (n & 4 ) {
103- size_t vl = 8 ;
104-
105- vx4 = VLSSEG4_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
106- VSSEG4_FLOAT (boffset2 , vx4 , vl );
107-
108- aoffset1 += 4 ;
109- boffset2 += 32 ;
110- }
111-
112- if (n & 2 ) {
113- size_t vl = 8 ;
114-
115- vx2 = VLSSEG2_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
116- VSSEG2_FLOAT (boffset3 , vx2 , vl );
117-
118- aoffset1 += 2 ;
119- boffset3 += 16 ;
120- }
121-
122- if (n & 1 ) {
123- size_t vl = 8 ;
124-
125- v0 = VLSEV_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
126- VSEV_FLOAT (boffset4 , v0 , vl );
127-
128- aoffset1 += 1 ;
129- boffset4 += 8 ;
130- }
131-
132- }
133-
134- if (m & 4 ) {
135-
136- aoffset1 = aoffset ;
137- aoffset += 4 * lda ;
138-
139- boffset1 = boffset ;
140- boffset += 32 ;
141-
142- for (i = (n >> 3 ); i > 0 ; i -- ) {
143- size_t vl = 4 ;
144-
145- vx8 = VLSSEG8_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
146- VSSEG8_FLOAT (boffset1 , vx8 , vl );
147-
148- aoffset1 += 8 ;
149- boffset1 += m * 8 ;
150- }
151-
152- if (n & 4 ) {
153- size_t vl = 4 ;
154-
155- vx4 = VLSSEG4_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
156- VSSEG4_FLOAT (boffset2 , vx4 , vl );
157-
158- aoffset1 += 4 ;
159- boffset2 += 16 ;
160- }
161-
162- if (n & 2 ) {
163- size_t vl = 4 ;
164-
165- vx2 = VLSSEG2_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
166- VSSEG2_FLOAT (boffset3 , vx2 , vl );
167-
168- aoffset1 += 2 ;
169- boffset3 += 8 ;
170- }
171-
172- if (n & 1 ) {
173- size_t vl = 4 ;
174-
175- v0 = VLSEV_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
176- VSEV_FLOAT (boffset4 , v0 , vl );
177-
178- aoffset1 += 1 ;
179- boffset4 += 4 ;
180- }
181- }
182-
183- if (m & 2 ) {
66+ for (j = m ; j > 0 ; j -- ) {
18467 aoffset1 = aoffset ;
185- aoffset += 2 * lda ;
186-
18768 boffset1 = boffset ;
188- boffset += 16 ;
189-
190- for (i = (n >> 3 ); i > 0 ; i -- ) {
191- size_t vl = 2 ;
19269
193- vx8 = VLSSEG8_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
194- VSSEG8_FLOAT (boffset1 , vx8 , vl );
195-
196- aoffset1 += 8 ;
197- boffset1 += m * 8 ;
198- }
199-
200- if (n & 4 ) {
201- size_t vl = 2 ;
202-
203- vx4 = VLSSEG4_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
204- VSSEG4_FLOAT (boffset2 , vx4 , vl );
205-
206- aoffset1 += 4 ;
207- boffset2 += 8 ;
208- }
209-
210- if (n & 2 ) {
211- size_t vl = 2 ;
212-
213- vx2 = VLSSEG2_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
214- VSSEG2_FLOAT (boffset3 , vx2 , vl );
215-
216- aoffset1 += 2 ;
217- boffset3 += 4 ;
218- }
219-
220- if (n & 1 ) {
221- size_t vl = 2 ;
222-
223- v0 = VLSEV_FLOAT (aoffset1 , lda * sizeof (FLOAT ), vl );
224- VSEV_FLOAT (boffset4 , v0 , vl );
225-
226- aoffset1 += 1 ;
227- boffset4 += 2 ;
228- }
229- }
230-
231- if (m & 1 ) {
232- aoffset1 = aoffset ;
233- boffset1 = boffset ;
70+ aoffset += lda ;
71+ boffset += 8 ;
23472
23573 for (i = (n >> 3 ); i > 0 ; i -- ) {
23674 size_t vl = 8 ;
@@ -245,27 +83,25 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
24583 if (n & 4 ) {
24684 size_t vl = 4 ;
24785
248- v0 = VLEV_FLOAT (aoffset1 , vl );
249- VSEV_FLOAT (boffset2 , v0 , vl );
86+ v1 = VLEV_FLOAT_HALF (aoffset1 , vl );
87+ VSEV_FLOAT_HALF (boffset2 , v1 , vl );
25088
25189 aoffset1 += 4 ;
252- // boffset2 += 4;
90+ boffset2 += 4 ;
25391 }
25492
25593 if (n & 2 ) {
256- size_t vl = 2 ;
257-
258- v0 = VLEV_FLOAT (aoffset1 , vl );
259- VSEV_FLOAT (boffset3 , v0 , vl );
94+ * (boffset3 ) = * (aoffset1 );
95+ * (boffset3 + 1 ) = * (aoffset1 + 1 );
26096
26197 aoffset1 += 2 ;
262- // boffset3 += 2;
98+ boffset3 += 2 ;
26399 }
264100
265101 if (n & 1 ) {
266- * (boffset4 ) = * (aoffset1 );
267- // aoffset1 ++;
268- // boffset4 ++;
102+ * (boffset4 ) = * (aoffset1 );
103+ aoffset1 ++ ;
104+ boffset4 ++ ;
269105 }
270106 }
271107
0 commit comments