Skip to content

Commit fdac8a9

Browse files
committed
Add sbgemm_ncopy_8 and sbgemm_tcopy_4
1 parent 135718e commit fdac8a9

File tree

3 files changed

+328
-2
lines changed

3 files changed

+328
-2
lines changed

kernel/arm64/KERNEL.NEOVERSEN2

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,10 +190,10 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
190190

191191
SBGEMM_BETA = sbgemm_beta_neoversen2.c
192192
SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c
193-
SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c
193+
SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversen2.c
194194
SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c
195195
SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c
196-
SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c
196+
SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversen2.c
197197
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
198198
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
199199
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
/***************************************************************************
2+
* Copyright (c) 2022, The OpenBLAS Project
3+
* All rights reserved.
4+
* Redistribution and use in source and binary forms, with or without
5+
* modification, are permitted provided that the following conditions are
6+
* met:
7+
* 1. Redistributions of source code must retain the above copyright
8+
* notice, this list of conditions and the following disclaimer.
9+
* 2. Redistributions in binary form must reproduce the above copyright
10+
* notice, this list of conditions and the following disclaimer in
11+
* the documentation and/or other materials provided with the
12+
* distribution.
13+
* 3. Neither the name of the OpenBLAS project nor the names of
14+
* its contributors may be used to endorse or promote products
15+
* derived from this software without specific prior written permission.
16+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26+
* POSSIBILITY OF SUCH DAMAGE.
27+
* *****************************************************************************/
28+
29+
#include <arm_sve.h>
30+
31+
#include "common.h"
32+
33+
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
34+
IFLOAT *a_offset;
35+
IFLOAT *a_offsetx[8];
36+
IFLOAT *b_offset;
37+
a_offset = a;
38+
b_offset = b;
39+
40+
svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0);
41+
svbfloat16_t v0, v1, v2, v3, v4, v5, v6, v7;
42+
43+
for (BLASLONG j = 0; j < n / 8; j++) {
44+
a_offsetx[0] = a_offset;
45+
a_offsetx[1] = a_offsetx[0] + lda;
46+
a_offsetx[2] = a_offsetx[1] + lda;
47+
a_offsetx[3] = a_offsetx[2] + lda;
48+
a_offsetx[4] = a_offsetx[3] + lda;
49+
a_offsetx[5] = a_offsetx[4] + lda;
50+
a_offsetx[6] = a_offsetx[5] + lda;
51+
a_offsetx[7] = a_offsetx[6] + lda;
52+
a_offset += 8 * lda;
53+
54+
for (BLASLONG i = 0; i < m / 4; i++) {
55+
v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]);
56+
v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]);
57+
v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]);
58+
v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]);
59+
v4 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[4]);
60+
v5 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[5]);
61+
v6 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[6]);
62+
v7 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[7]);
63+
64+
svst1_bf16(pg16, (bfloat16_t *)b_offset, v0);
65+
svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1);
66+
svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2);
67+
svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3);
68+
svst1_bf16(pg16, (bfloat16_t *)b_offset + 16, v4);
69+
svst1_bf16(pg16, (bfloat16_t *)b_offset + 20, v5);
70+
svst1_bf16(pg16, (bfloat16_t *)b_offset + 24, v6);
71+
svst1_bf16(pg16, (bfloat16_t *)b_offset + 28, v7);
72+
73+
b_offset += 32;
74+
a_offsetx[0] += 4;
75+
a_offsetx[1] += 4;
76+
a_offsetx[2] += 4;
77+
a_offsetx[3] += 4;
78+
a_offsetx[4] += 4;
79+
a_offsetx[5] += 4;
80+
a_offsetx[6] += 4;
81+
a_offsetx[7] += 4;
82+
}
83+
84+
if (m & 3) {
85+
BLASLONG rest = m & 3;
86+
for (BLASLONG col = 0; col < 4; col++) {
87+
b_offset[4 * col] = a_offsetx[col][0];
88+
b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1];
89+
b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2];
90+
b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3];
91+
}
92+
b_offset += 16;
93+
}
94+
}
95+
96+
if (n & 4) {
97+
a_offsetx[0] = a_offset;
98+
a_offsetx[1] = a_offsetx[0] + lda;
99+
a_offsetx[2] = a_offsetx[1] + lda;
100+
a_offsetx[3] = a_offsetx[2] + lda;
101+
a_offset += 4 * lda;
102+
103+
for (BLASLONG i = 0; i < m / 4; i++) {
104+
v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]);
105+
v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]);
106+
v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]);
107+
v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]);
108+
109+
svst1_bf16(pg16, (bfloat16_t *)b_offset, v0);
110+
svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1);
111+
svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2);
112+
svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3);
113+
114+
b_offset += 16;
115+
a_offsetx[0] += 4;
116+
a_offsetx[1] += 4;
117+
a_offsetx[2] += 4;
118+
a_offsetx[3] += 4;
119+
}
120+
121+
if (m & 3) {
122+
BLASLONG rest = m & 3;
123+
for (BLASLONG col = 0; col < 4; col++) {
124+
b_offset[4 * col] = a_offsetx[col][0];
125+
b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1];
126+
b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2];
127+
b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3];
128+
}
129+
b_offset += 16;
130+
}
131+
}
132+
133+
if (n & 2) {
134+
a_offsetx[0] = a_offset;
135+
a_offsetx[1] = a_offsetx[0] + lda;
136+
a_offset += 2 * lda;
137+
138+
for (BLASLONG i = 0; i < m / 4; i++) {
139+
v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]);
140+
v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]);
141+
svst1_bf16(pg16, (bfloat16_t *)b_offset, v0);
142+
svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1);
143+
144+
b_offset += 8;
145+
a_offsetx[0] += 4;
146+
a_offsetx[1] += 4;
147+
}
148+
149+
if (m & 3) {
150+
BLASLONG rest = m & 3;
151+
for (BLASLONG col = 0; col < 2; col++) {
152+
b_offset[4 * col] = a_offsetx[col][0];
153+
b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1];
154+
b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2];
155+
b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3];
156+
}
157+
b_offset += 8;
158+
}
159+
}
160+
161+
if (n & 1) {
162+
a_offsetx[0] = a_offset;
163+
for (BLASLONG i = 0; i < m / 4; i++) {
164+
v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]);
165+
svst1_bf16(pg16, (bfloat16_t *)b_offset, v0);
166+
b_offset += 4;
167+
a_offsetx[0] += 4;
168+
}
169+
if (m & 3) {
170+
BLASLONG rest = m & 3;
171+
b_offset[0] = a_offsetx[0][0];
172+
b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1];
173+
b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2];
174+
b_offset[3] = rest <= 3 ? 0 : a_offsetx[0][3];
175+
}
176+
}
177+
178+
return 0;
179+
}
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
/***************************************************************************
2+
* Copyright (c) 2022, The OpenBLAS Project
3+
* All rights reserved.
4+
* Redistribution and use in source and binary forms, with or without
5+
* modification, are permitted provided that the following conditions are
6+
* met:
7+
* 1. Redistributions of source code must retain the above copyright
8+
* notice, this list of conditions and the following disclaimer.
9+
* 2. Redistributions in binary form must reproduce the above copyright
10+
* notice, this list of conditions and the following disclaimer in
11+
* the documentation and/or other materials provided with the
12+
* distribution.
13+
* 3. Neither the name of the OpenBLAS project nor the names of
14+
* its contributors may be used to endorse or promote products
15+
* derived from this software without specific prior written permission.
16+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26+
* POSSIBILITY OF SUCH DAMAGE.
27+
* *****************************************************************************/
28+
#include <arm_neon.h>
29+
30+
#include "common.h"
31+
32+
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
33+
IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3;
34+
IFLOAT *b_offset;
35+
a_offset = a;
36+
b_offset = b;
37+
38+
uint16x4_t v0_h, v1_h, v2_h, v3_h, v4_h, v5_h, v6_h, v7_h;
39+
40+
for (BLASLONG j = 0; j < n / 4; j++) {
41+
a_offset0 = a_offset;
42+
a_offset1 = a_offset0 + lda;
43+
a_offset2 = a_offset1 + lda;
44+
a_offset3 = a_offset2 + lda;
45+
a_offset += 4;
46+
47+
for (BLASLONG i = 0; i < m / 4; i++) {
48+
v0_h = vld1_u16(a_offset0);
49+
v1_h = vld1_u16(a_offset1);
50+
v2_h = vld1_u16(a_offset2);
51+
v3_h = vld1_u16(a_offset3);
52+
53+
v4_h = vtrn1_u16(v0_h, v1_h);
54+
v5_h = vtrn2_u16(v0_h, v1_h);
55+
v6_h = vtrn1_u16(v2_h, v3_h);
56+
v7_h = vtrn2_u16(v2_h, v3_h);
57+
58+
v0_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h);
59+
v1_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h);
60+
v2_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h);
61+
v3_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h);
62+
63+
vst1_u16(b_offset, v0_h);
64+
vst1_u16(b_offset + 4, v1_h);
65+
vst1_u16(b_offset + 8, v2_h);
66+
vst1_u16(b_offset + 12, v3_h);
67+
68+
b_offset += 16;
69+
a_offset0 += 4 * lda;
70+
a_offset1 += 4 * lda;
71+
a_offset2 += 4 * lda;
72+
a_offset3 += 4 * lda;
73+
}
74+
75+
if (m & 3) {
76+
BLASLONG rest = m & 3;
77+
for (BLASLONG line = 0; line < 4; line++) {
78+
b_offset[line * 4] = a_offset0[line];
79+
b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line];
80+
b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line];
81+
b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line];
82+
}
83+
b_offset += 16;
84+
}
85+
}
86+
87+
if (n & 2) {
88+
a_offset0 = a_offset;
89+
a_offset1 = a_offset0 + lda;
90+
a_offset2 = a_offset1 + lda;
91+
a_offset3 = a_offset2 + lda;
92+
a_offset += 2;
93+
94+
for (BLASLONG i = 0; i < m / 4; i++) {
95+
for (BLASLONG line = 0; line < 2; line++) {
96+
b_offset[line * 4] = a_offset0[line];
97+
b_offset[line * 4 + 1] = a_offset1[line];
98+
b_offset[line * 4 + 2] = a_offset2[line];
99+
b_offset[line * 4 + 3] = a_offset3[line];
100+
}
101+
b_offset += 8;
102+
a_offset0 += 4 * lda;
103+
a_offset1 += 4 * lda;
104+
a_offset2 += 4 * lda;
105+
a_offset3 += 4 * lda;
106+
}
107+
108+
if (m & 3) {
109+
BLASLONG rest = m & 3;
110+
for (BLASLONG line = 0; line < 2; line++) {
111+
b_offset[line * 4] = a_offset0[line];
112+
b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line];
113+
b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line];
114+
b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line];
115+
}
116+
b_offset += 8;
117+
}
118+
}
119+
120+
if (n & 1) {
121+
a_offset0 = a_offset;
122+
a_offset1 = a_offset0 + lda;
123+
a_offset2 = a_offset1 + lda;
124+
a_offset3 = a_offset2 + lda;
125+
126+
for (BLASLONG i = 0; i < m / 4; i++) {
127+
b_offset[0] = *a_offset0;
128+
b_offset[1] = *a_offset1;
129+
b_offset[2] = *a_offset2;
130+
b_offset[3] = *a_offset3;
131+
b_offset += 4;
132+
a_offset0 += 4 * lda;
133+
a_offset1 += 4 * lda;
134+
a_offset2 += 4 * lda;
135+
a_offset3 += 4 * lda;
136+
}
137+
138+
if (m & 3) {
139+
BLASLONG rest = m & 3;
140+
b_offset[0] = *a_offset0;
141+
b_offset[1] = rest == 1 ? 0 : *a_offset1;
142+
b_offset[2] = rest <= 2 ? 0 : *a_offset2;
143+
b_offset[3] = rest <= 3 ? 0 : *a_offset3;
144+
}
145+
}
146+
return 0;
147+
}

0 commit comments

Comments
 (0)