Skip to content

Commit 7947970

Browse files
author
Chip Kerchner
committed
Move common code.
1 parent 72216d2 commit 7947970

File tree

4 files changed

+152
-134
lines changed

4 files changed

+152
-134
lines changed

kernel/power/gemm_common.c

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
#ifndef GEMM_COMMON_C
2+
#define GEMM_COMMON_C
3+
#include "common.h"
4+
5+
#include <altivec.h>
6+
7+
#define FORCEINLINE inline __attribute__((always_inline))
8+
9+
#ifdef __clang__
10+
#define uint16_t unsigned short
11+
#define uint32_t unsigned int
12+
#define uint64_t unsigned long long
13+
#endif
14+
15+
/* POWER10 only: set up the vector-pair builtins used by vec_load_pair() and vec_store_pair(). */
#ifdef _ARCH_PWR10
16+
/* When the compiler can be queried and lacks a __builtin_vsx_* pair builtin, alias it to the __builtin_mma_* name. */
#ifdef __has_builtin
17+
#if !__has_builtin(__builtin_vsx_assemble_pair)
18+
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
19+
#endif
20+
#if !__has_builtin(__builtin_vsx_disassemble_pair)
21+
#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
22+
#endif
23+
#endif
24+
25+
/* __builtin_vsx_assemble_pair2 forwards its two vector operands in swapped order on big-endian targets and unchanged otherwise. */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
26+
#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v1, v0)
27+
#else
28+
#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v0, v1)
29+
#endif
30+
31+
/* Selects the __vector_pair code paths in vec_load_pair() and vec_store_pair(). */
#define USE_VECTOR_PAIRS
32+
#endif
33+
34+
/*
 * Vector aliases used by the load/store helpers in this file: a vector of
 * IFLOAT, a vector of FLOAT, and a vector of raw bytes.
 * NOTE(review): IFLOAT and FLOAT are defined outside this file (presumably
 * via common.h); the names suggest bf16 and float -- confirm.
 */
typedef __vector IFLOAT vec_bf16;
35+
typedef __vector FLOAT vec_f32;
36+
typedef __vector unsigned char vec_uc8;
37+
38+
/*
 * Load one vector of unsigned char from src.
 *
 * src: address to read from; it is reinterpreted as a byte pointer and
 *      passed to vec_xl with a zero offset.
 *
 * Returns the loaded byte vector.
 */
FORCEINLINE vec_uc8 vec_load_vec(void *src)
{
  unsigned char *bytes = (unsigned char *)(src);

  return vec_xl(0, bytes);
}
42+
43+
/*
 * Copy two consecutive vec_f32 values from src into dst[0] and dst[1].
 *
 * dst: receives the two vectors.
 * src: address of the first of the two source vectors.
 *
 * With USE_VECTOR_PAIRS the source is read as a single __vector_pair and
 * split with __builtin_vsx_disassemble_pair; otherwise the two vectors are
 * copied one at a time, lowest index first.
 */
FORCEINLINE void vec_load_pair(vec_f32 *dst, vec_f32 *src)
{
#ifdef USE_VECTOR_PAIRS
  __vector_pair pair = *(__vector_pair *)(src);

  __builtin_vsx_disassemble_pair((void *)(dst), &pair);
#else
  for (int i = 0; i < 2; i++) {
    dst[i] = src[i];
  }
#endif
}
54+
55+
/*
 * Write the two vectors src[0] and src[1] to dst[0] and dst[1].
 *
 * dst: destination for the two vectors.
 * src: the two vectors to store.
 *
 * With USE_VECTOR_PAIRS the vectors are combined into one __vector_pair
 * (src[1] is passed first, src[0] second, to __builtin_vsx_assemble_pair2)
 * and stored with a single pair assignment; otherwise they are copied one
 * at a time, lowest index first.
 */
FORCEINLINE void vec_store_pair(vec_f32 *dst, vec_f32 *src)
{
#ifdef USE_VECTOR_PAIRS
  __vector_pair pair;

  __builtin_vsx_assemble_pair2(&pair, (vec_uc8)src[1], (vec_uc8)src[0]);
  *(__vector_pair *)(dst) = pair;
#else
  for (int i = 0; i < 2; i++) {
    dst[i] = src[i];
  }
#endif
}
66+
67+
/*
 * Load the first n IFLOAT elements from src into a vector.
 *
 * src: address of the first element to read.
 * n:   number of leading elements to load.
 *
 * Returns the loaded vector.  On POWER9 and later the load is done by
 * vec_xl_len with a byte length of n * sizeof(IFLOAT).  On older targets the
 * elements are staged through an aligned, zero-filled scratch buffer, so any
 * lane not copied from src is zero.
 *
 * The fallback decomposes n into its 8/4/2/1 bits.  The memcpy sizes (8 and
 * 4 bytes for 4 and 2 elements) assume a 2-byte IFLOAT -- TODO confirm
 * against the IFLOAT definition, which lives outside this file.
 */
FORCEINLINE vec_bf16 vec_loadN(void *src, BLASLONG n)
{
  IFLOAT *in = (IFLOAT *)(src);
#ifdef _ARCH_PWR9
  return vec_xl_len(in, n * sizeof(IFLOAT));
#else
  /*
   * Full vector requested: load it directly.  Without this branch the
   * fallback returned all zeros for n == 8, unlike the vec_xl_len path and
   * unlike the matching (n & 8) branch in vec_storeN().
   */
  if (n & 8) {
    return (vec_bf16)vec_load_vec(in);
  }

  __attribute__((aligned(16))) IFLOAT staged[sizeof(vec_bf16) / sizeof(IFLOAT)];
  BLASLONG pos = 0;  /* number of elements copied so far */

  memset(staged, 0, sizeof(vec_bf16));
  if (n & 4) {
    memcpy(staged, in, sizeof(uint64_t));
    pos = 4;
  }
  if (n & 2) {
    memcpy(staged + pos, in + pos, sizeof(uint32_t));
    pos += 2;
  }
  if (n & 1) {
    staged[pos] = in[pos];
  }
  return (vec_bf16)vec_load_vec(staged);
#endif
}
89+
90+
/*
 * Load the first n FLOAT elements from src into a vec_f32.
 *
 * src: address of the first element to read.
 * n:   number of leading FLOAT elements to load.
 *
 * Returns the loaded vector.  On targets without _ARCH_PWR9, a count with
 * bit 2 set is served by one whole-vector load.  Every other case is
 * delegated to vec_loadN() with the count rescaled from FLOAT units to
 * IFLOAT units.
 */
FORCEINLINE vec_f32 vec_loadN_f32(void *src, BLASLONG n)
{
#ifndef _ARCH_PWR9
  if (n & 4) {
    return (vec_f32)vec_load_vec(src);
  }
#endif
  const BLASLONG count = n * (sizeof(FLOAT) / sizeof(IFLOAT));

  return (vec_f32)vec_loadN(src, count);
}
99+
100+
/*
 * Load one whole vector followed by a partial vector.
 *
 * data: receives the result; data[0] is a copy of src[0] and data[1] holds
 *       the first n FLOAT elements of src[1] as returned by vec_loadN_f32().
 * src:  address of the first of the two source vectors.
 * n:    number of FLOAT elements to load from the second vector.
 */
FORCEINLINE void vec_loadN2_f32(vec_f32 *data, vec_f32 *src, BLASLONG n)
{
  vec_f32 *tail = src + 1;

  data[0] = *src;
  data[1] = vec_loadN_f32(tail, n);
}
105+
106+
/*
 * Store the first n IFLOAT elements of data to dst.
 *
 * data: vector holding the elements to write.
 * dst:  address of the first element to write.
 * n:    number of leading elements to store.
 *
 * On POWER9 and later the store is done by vec_xst_len with a byte length of
 * n * sizeof(IFLOAT).  On older targets a count with bit 3 set stores the
 * whole vector.  Any other count spills the vector to an aligned scratch
 * buffer and copies out the 4-, 2- and 1-element pieces selected by the bits
 * of n.  The memcpy sizes (8 and 4 bytes) assume a 2-byte IFLOAT -- TODO
 * confirm.
 */
FORCEINLINE void vec_storeN(vec_bf16 data, void *dst, BLASLONG n)
{
  IFLOAT *out = (IFLOAT *)(dst);
#ifdef _ARCH_PWR9
  vec_xst_len(data, out, n * sizeof(IFLOAT));
#else
  if (n & 8) {
    vec_xst(data, 0, out);
    return;
  }

  __attribute__((aligned(16))) IFLOAT spill[sizeof(vec_f32) / sizeof(IFLOAT)];
  BLASLONG pos = 0;  /* number of elements written so far */

  vec_xst(data, 0, spill);
  if (n & 4) {
    memcpy(out, spill, sizeof(uint64_t));
    pos = 4;
  }
  if (n & 2) {
    memcpy(out + pos, spill + pos, sizeof(uint32_t));
    pos += 2;
  }
  if (n & 1) {
    out[pos] = spill[pos];
  }
#endif
}
131+
132+
/*
 * Store the first n FLOAT elements of data to dst.
 *
 * data: vector holding the elements to write.
 * dst:  address of the first element to write.
 * n:    number of leading FLOAT elements to store.
 *
 * On targets without _ARCH_PWR9, a count with bit 2 set stores the whole
 * vector with vec_xst.  Every other case is delegated to vec_storeN() with
 * the count rescaled from FLOAT units to IFLOAT units.
 */
FORCEINLINE void vec_storeN_f32(vec_f32 data, void *dst, BLASLONG n)
{
#ifndef _ARCH_PWR9
  if (n & 4) {
    vec_xst(data, 0, (FLOAT *)dst);
    return;
  }
#endif
  /* Plain call: ISO C does not allow `return expr;` in a void function. */
  vec_storeN((vec_bf16)data, dst, n * (sizeof(FLOAT) / sizeof(IFLOAT)));
}
142+
143+
/*
 * Store one whole vector followed by a partial vector.
 *
 * data: the two vectors to write; data[0] is stored in full and the first n
 *       FLOAT elements of data[1] are stored through vec_storeN_f32().
 * dst:  address of the first of the two destination vectors.
 * n:    number of FLOAT elements to store from the second vector.
 */
FORCEINLINE void vec_storeN2_f32(vec_f32 *data, vec_f32 *dst, BLASLONG n)
{
  vec_f32 *tail = dst + 1;

  *dst = data[0];
  vec_storeN_f32(data[1], tail, n);
}
148+
#endif

kernel/power/sbgemv_common.c

Lines changed: 2 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -27,40 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727

2828
#ifndef SBGEMV_COMMON_C
2929
#define SBGEMV_COMMON_C
30-
#include "common.h"
31-
32-
#include <altivec.h>
33-
34-
#define FORCEINLINE inline __attribute__((always_inline))
35-
36-
#ifdef __clang__
37-
#define uint16_t unsigned short
38-
#define uint32_t unsigned int
39-
#define uint64_t unsigned long long
40-
#endif
41-
42-
#ifdef _ARCH_PWR10
43-
#ifdef __has_builtin
44-
#if !__has_builtin(__builtin_vsx_assemble_pair)
45-
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
46-
#endif
47-
#if !__has_builtin(__builtin_vsx_disassemble_pair)
48-
#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
49-
#endif
50-
#endif
51-
52-
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
53-
#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v1, v0)
54-
#else
55-
#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v0, v1)
56-
#endif
57-
58-
#define USE_VECTOR_PAIRS
59-
#endif
60-
61-
typedef __vector IFLOAT vec_bf16;
62-
typedef __vector FLOAT vec_f32;
63-
typedef __vector unsigned char vec_uc8;
30+
#include "gemm_common.c"
6431

6532
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
6633
#define BF16_HI(data, zero) (vec_f32)vec_mergeh(data, zero)
@@ -70,108 +37,12 @@ typedef __vector unsigned char vec_uc8;
7037
#define BF16_LO(data, zero) (vec_f32)vec_mergel(zero, data)
7138
#endif
7239

73-
FORCEINLINE vec_uc8 vec_load_vec(void *src)
74-
{
75-
return vec_xl(0, (unsigned char *)(src));
76-
}
77-
78-
FORCEINLINE void vec_load_pair(vec_f32 *dst, vec_f32 *src)
79-
{
80-
#ifdef USE_VECTOR_PAIRS
81-
__vector_pair vy0p;
82-
vy0p = *(__vector_pair *)(src);
83-
__builtin_vsx_disassemble_pair((void *)(dst), &vy0p);
84-
#else
85-
dst[0] = src[0];
86-
dst[1] = src[1];
87-
#endif
88-
}
89-
90-
FORCEINLINE void vec_store_pair(vec_f32 *dst, vec_f32 *src)
91-
{
92-
#ifdef USE_VECTOR_PAIRS
93-
__vector_pair vy0p;
94-
__builtin_vsx_assemble_pair2(&vy0p, (vec_uc8)src[1], (vec_uc8)src[0]);
95-
*(__vector_pair *)(dst) = vy0p;
96-
#else
97-
dst[0] = src[0];
98-
dst[1] = src[1];
99-
#endif
100-
}
101-
102-
FORCEINLINE vec_bf16 vec_loadN(void *src, BLASLONG n)
103-
{
104-
IFLOAT *src2 = (IFLOAT *)(src);
105-
#ifdef _ARCH_PWR9
106-
return vec_xl_len(src2, n * sizeof(IFLOAT));
107-
#else
108-
__attribute__((aligned(16))) IFLOAT data[sizeof(vec_bf16) / sizeof(IFLOAT)];
109-
memset(data, 0, sizeof(vec_bf16));
110-
if (n & 4) {
111-
memcpy(data, src2, sizeof(uint64_t));
112-
}
113-
if (n & 2) {
114-
BLASLONG n4 = n & 4;
115-
memcpy(data + n4, src2 + n4, sizeof(uint32_t));
116-
}
117-
if (n & 1) {
118-
BLASLONG n6 = n & 6;
119-
data[n6] = src2[n6];
120-
}
121-
return (vec_bf16)vec_load_vec(data);
122-
#endif
123-
}
124-
12540
FORCEINLINE vec_f32 vec_loadNHi(void *src, BLASLONG n, vec_bf16 zero)
12641
{
12742
vec_bf16 data = vec_loadN(src, n);
12843
return BF16_HI(data, zero);
12944
}
13045

131-
FORCEINLINE vec_f32 vec_loadN_f32(void *src, BLASLONG n)
132-
{
133-
#ifndef _ARCH_PWR9
134-
if (n & 4) {
135-
return (vec_f32)vec_load_vec(src);
136-
}
137-
#endif
138-
return (vec_f32)vec_loadN(src, n * (sizeof(FLOAT) / sizeof(IFLOAT)));
139-
}
140-
141-
FORCEINLINE void vec_loadN2_f32(vec_f32 *data, vec_f32 *src, BLASLONG n)
142-
{
143-
data[0] = src[0];
144-
data[1] = vec_loadN_f32(&src[1], n);
145-
}
146-
147-
FORCEINLINE void vec_storeN_f32(vec_f32 data, void *dst, BLASLONG n)
148-
{
149-
FLOAT *dst2 = (FLOAT *)(dst);
150-
#ifdef _ARCH_PWR9
151-
vec_xst_len(data, dst2, n * sizeof(FLOAT));
152-
#else
153-
if (n & 4) {
154-
vec_xst(data, 0, dst2);
155-
return;
156-
}
157-
__attribute__((aligned(16))) FLOAT data2[sizeof(vec_f32) / sizeof(FLOAT)];
158-
vec_xst(data, 0, data2);
159-
if (n & 2) {
160-
memcpy(dst2, data2, sizeof(uint64_t));
161-
}
162-
if (n & 1) {
163-
BLASLONG n2 = n & 2;
164-
dst2[n2] = data2[n2];
165-
}
166-
#endif
167-
}
168-
169-
FORCEINLINE void vec_storeN2_f32(vec_f32 *data, vec_f32 *dst, BLASLONG n)
170-
{
171-
dst[0] = data[0];
172-
vec_storeN_f32(data[1], &dst[1], n);
173-
}
174-
17546
FORCEINLINE vec_f32 vec_mult(vec_f32 *inp, vec_bf16 in0, vec_bf16 zero)
17647
{
17748
vec_f32 v_in00 = BF16_HI(in0, zero);
@@ -297,7 +168,7 @@ FORCEINLINE void copy_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, F
297168
}
298169
}
299170

300-
FORCEINLINE void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
171+
FORCEINLINE void move_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
301172
{
302173
for (BLASLONG i = 0; i < n; i++) {
303174
*dest = *src++;

kernel/power/sbgemv_n.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *
179179

180180
a += NB;
181181
if (inc_y != 1) {
182-
add_y(NB, ybuffer, y_ptr, inc_y);
182+
move_y(NB, ybuffer, y_ptr, inc_y);
183183
y_ptr += (NB * inc_y);
184184
} else {
185185
y_ptr += NB;

kernel/power/sbgemv_n_vsx.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -269,8 +269,7 @@ static void BF16GEMV_N_VSX_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS
269269
vec_loadN_mult2(v_x7, &vb3[i], n, zero, vy0);
270270

271271
vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3);
272-
} else
273-
if (n) {
272+
} else if (n) {
274273
vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n);
275274

276275
vy0 += vec_loadNHi_multi2(v_x0, &va0[i], n, zero);

0 commit comments

Comments
 (0)