Skip to content

Commit 135718e

Browse files
committed
Improve the performance of sbgemm_tcopy on neoversen2
1 parent e9a911f commit 135718e

File tree

1 file changed

+46
-12
lines changed

1 file changed

+46
-12
lines changed

kernel/arm64/sbgemm_tcopy_8_neoversen2.c

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
2626
* POSSIBILITY OF SUCH DAMAGE.
2727
* *****************************************************************************/
28+
#include <arm_neon.h>
2829

2930
#include "common.h"
3031

@@ -34,6 +35,9 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
3435
a_offset = a;
3536
b_offset = b;
3637

38+
uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7;
39+
uint16x4_t v0_h, v1_h, v2_h, v3_h, v4_h, v5_h, v6_h, v7_h;
40+
3741
for (BLASLONG j = 0; j < n / 8; j++) {
3842
a_offset0 = a_offset;
3943
a_offset1 = a_offset0 + lda;
@@ -42,12 +46,29 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
4246
a_offset += 8;
4347

4448
for (BLASLONG i = 0; i < m / 4; i++) {
45-
for (BLASLONG line = 0; line < 8; line++) {
46-
b_offset[line * 4] = a_offset0[line];
47-
b_offset[line * 4 + 1] = a_offset1[line];
48-
b_offset[line * 4 + 2] = a_offset2[line];
49-
b_offset[line * 4 + 3] = a_offset3[line];
50-
}
49+
v0 = vld1q_u16(a_offset0);
50+
v1 = vld1q_u16(a_offset1);
51+
v2 = vld1q_u16(a_offset2);
52+
v3 = vld1q_u16(a_offset3);
53+
54+
v4 = vtrn1q_u16(v0, v1);
55+
v5 = vtrn2q_u16(v0, v1);
56+
v6 = vtrn1q_u16(v2, v3);
57+
v7 = vtrn2q_u16(v2, v3);
58+
59+
v0 = (uint16x8_t)vtrn1q_u32((uint32x4_t)v4, (uint32x4_t)v6);
60+
v1 = (uint16x8_t)vtrn1q_u32((uint32x4_t)v5, (uint32x4_t)v7);
61+
v2 = (uint16x8_t)vtrn2q_u32((uint32x4_t)v4, (uint32x4_t)v6);
62+
v3 = (uint16x8_t)vtrn2q_u32((uint32x4_t)v5, (uint32x4_t)v7);
63+
64+
vst1_u16(b_offset, vget_low_u16(v0));
65+
vst1_u16(b_offset + 4, vget_low_u16(v1));
66+
vst1_u16(b_offset + 8, vget_low_u16(v2));
67+
vst1_u16(b_offset + 12, vget_low_u16(v3));
68+
vst1_u16(b_offset + 16, vget_high_u16(v0));
69+
vst1_u16(b_offset + 20, vget_high_u16(v1));
70+
vst1_u16(b_offset + 24, vget_high_u16(v2));
71+
vst1_u16(b_offset + 28, vget_high_u16(v3));
5172

5273
b_offset += 32;
5374
a_offset0 += 4 * lda;
@@ -76,12 +97,25 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
7697
a_offset += 4;
7798

7899
for (BLASLONG i = 0; i < m / 4; i++) {
79-
for (BLASLONG line = 0; line < 4; line++) {
80-
b_offset[line * 4] = a_offset0[line];
81-
b_offset[line * 4 + 1] = a_offset1[line];
82-
b_offset[line * 4 + 2] = a_offset2[line];
83-
b_offset[line * 4 + 3] = a_offset3[line];
84-
}
100+
v0_h = vld1_u16(a_offset0);
101+
v1_h = vld1_u16(a_offset1);
102+
v2_h = vld1_u16(a_offset2);
103+
v3_h = vld1_u16(a_offset3);
104+
105+
v4_h = vtrn1_u16(v0_h, v1_h);
106+
v5_h = vtrn2_u16(v0_h, v1_h);
107+
v6_h = vtrn1_u16(v2_h, v3_h);
108+
v7_h = vtrn2_u16(v2_h, v3_h);
109+
110+
v0_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h);
111+
v1_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h);
112+
v2_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h);
113+
v3_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h);
114+
115+
vst1_u16(b_offset, v0_h);
116+
vst1_u16(b_offset + 4, v1_h);
117+
vst1_u16(b_offset + 8, v2_h);
118+
vst1_u16(b_offset + 12, v3_h);
85119

86120
b_offset += 16;
87121
a_offset0 += 4 * lda;

0 commit comments

Comments
 (0)