25
25
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26
26
* POSSIBILITY OF SUCH DAMAGE.
27
27
* *****************************************************************************/
28
+ #include <arm_neon.h>
28
29
29
30
#include "common.h"
30
31
@@ -34,6 +35,9 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
34
35
a_offset = a ;
35
36
b_offset = b ;
36
37
38
+ uint16x8_t v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 ;
39
+ uint16x4_t v0_h , v1_h , v2_h , v3_h , v4_h , v5_h , v6_h , v7_h ;
40
+
37
41
for (BLASLONG j = 0 ; j < n / 8 ; j ++ ) {
38
42
a_offset0 = a_offset ;
39
43
a_offset1 = a_offset0 + lda ;
@@ -42,12 +46,29 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
42
46
a_offset += 8 ;
43
47
44
48
for (BLASLONG i = 0 ; i < m / 4 ; i ++ ) {
45
- for (BLASLONG line = 0 ; line < 8 ; line ++ ) {
46
- b_offset [line * 4 ] = a_offset0 [line ];
47
- b_offset [line * 4 + 1 ] = a_offset1 [line ];
48
- b_offset [line * 4 + 2 ] = a_offset2 [line ];
49
- b_offset [line * 4 + 3 ] = a_offset3 [line ];
50
- }
49
+ v0 = vld1q_u16 (a_offset0 );
50
+ v1 = vld1q_u16 (a_offset1 );
51
+ v2 = vld1q_u16 (a_offset2 );
52
+ v3 = vld1q_u16 (a_offset3 );
53
+
54
+ v4 = vtrn1q_u16 (v0 , v1 );
55
+ v5 = vtrn2q_u16 (v0 , v1 );
56
+ v6 = vtrn1q_u16 (v2 , v3 );
57
+ v7 = vtrn2q_u16 (v2 , v3 );
58
+
59
+ v0 = (uint16x8_t )vtrn1q_u32 ((uint32x4_t )v4 , (uint32x4_t )v6 );
60
+ v1 = (uint16x8_t )vtrn1q_u32 ((uint32x4_t )v5 , (uint32x4_t )v7 );
61
+ v2 = (uint16x8_t )vtrn2q_u32 ((uint32x4_t )v4 , (uint32x4_t )v6 );
62
+ v3 = (uint16x8_t )vtrn2q_u32 ((uint32x4_t )v5 , (uint32x4_t )v7 );
63
+
64
+ vst1_u16 (b_offset , vget_low_u16 (v0 ));
65
+ vst1_u16 (b_offset + 4 , vget_low_u16 (v1 ));
66
+ vst1_u16 (b_offset + 8 , vget_low_u16 (v2 ));
67
+ vst1_u16 (b_offset + 12 , vget_low_u16 (v3 ));
68
+ vst1_u16 (b_offset + 16 , vget_high_u16 (v0 ));
69
+ vst1_u16 (b_offset + 20 , vget_high_u16 (v1 ));
70
+ vst1_u16 (b_offset + 24 , vget_high_u16 (v2 ));
71
+ vst1_u16 (b_offset + 28 , vget_high_u16 (v3 ));
51
72
52
73
b_offset += 32 ;
53
74
a_offset0 += 4 * lda ;
@@ -76,12 +97,25 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
76
97
a_offset += 4 ;
77
98
78
99
for (BLASLONG i = 0 ; i < m / 4 ; i ++ ) {
79
- for (BLASLONG line = 0 ; line < 4 ; line ++ ) {
80
- b_offset [line * 4 ] = a_offset0 [line ];
81
- b_offset [line * 4 + 1 ] = a_offset1 [line ];
82
- b_offset [line * 4 + 2 ] = a_offset2 [line ];
83
- b_offset [line * 4 + 3 ] = a_offset3 [line ];
84
- }
100
+ v0_h = vld1_u16 (a_offset0 );
101
+ v1_h = vld1_u16 (a_offset1 );
102
+ v2_h = vld1_u16 (a_offset2 );
103
+ v3_h = vld1_u16 (a_offset3 );
104
+
105
+ v4_h = vtrn1_u16 (v0_h , v1_h );
106
+ v5_h = vtrn2_u16 (v0_h , v1_h );
107
+ v6_h = vtrn1_u16 (v2_h , v3_h );
108
+ v7_h = vtrn2_u16 (v2_h , v3_h );
109
+
110
+ v0_h = (uint16x4_t )vtrn1_u32 ((uint32x2_t )v4_h , (uint32x2_t )v6_h );
111
+ v1_h = (uint16x4_t )vtrn1_u32 ((uint32x2_t )v5_h , (uint32x2_t )v7_h );
112
+ v2_h = (uint16x4_t )vtrn2_u32 ((uint32x2_t )v4_h , (uint32x2_t )v6_h );
113
+ v3_h = (uint16x4_t )vtrn2_u32 ((uint32x2_t )v5_h , (uint32x2_t )v7_h );
114
+
115
+ vst1_u16 (b_offset , v0_h );
116
+ vst1_u16 (b_offset + 4 , v1_h );
117
+ vst1_u16 (b_offset + 8 , v2_h );
118
+ vst1_u16 (b_offset + 12 , v3_h );
85
119
86
120
b_offset += 16 ;
87
121
a_offset0 += 4 * lda ;
0 commit comments