Skip to content

Commit 1b0b434

Browse files
committed
s390x/Z14: Change register blocking for SGEMM to 16x4
Change register blocking for SGEMM (and STRMM) on z14 from 8x4 to 16x4 by adjusting SGEMM_DEFAULT_UNROLL_M and choosing the appropriate copy implementations. Actually make KERNEL.Z14 more flexible, so that the change in param.h suffices. As a result, performance for SGEMM improves by around 30% on z15.

On z14, FP SIMD instructions can operate on float-sized scalars in vector registers, while z13 could do that for double-sized scalars only. Thus, we can double the number of elements of C that are held in registers in an SGEMM kernel.

Signed-off-by: Marius Hillenbrand <[email protected]>
1 parent 71b6eaf commit 1b0b434

File tree

3 files changed

+22
-5
lines changed

3 files changed

+22
-5
lines changed

kernel/zarch/KERNEL.Z14

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,14 @@ CTRMMKERNEL = ctrmm4x4V.S
9292
ZTRMMKERNEL = ztrmm4x4V.S
9393

9494
SGEMMKERNEL = gemm_vec.c
95-
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
96-
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
97-
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
98-
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
95+
ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N))
96+
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
97+
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
9998
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
10099
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
100+
endif
101+
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
102+
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
101103
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
102104
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
103105

kernel/zarch/gemm_vec.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,15 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
220220
}
221221

222222

223+
#if UNROLL_M == 16
224+
VECTOR_BLOCK(16, 4)
225+
VECTOR_BLOCK(16, 2)
226+
VECTOR_BLOCK(16, 1)
227+
#endif
228+
#if UNROLL_N == 8
229+
VECTOR_BLOCK(8, 8)
230+
VECTOR_BLOCK(4, 8)
231+
#endif
223232
VECTOR_BLOCK(8, 4)
224233
VECTOR_BLOCK(8, 2)
225234
VECTOR_BLOCK(8, 1)
@@ -284,6 +293,12 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
284293
return; \
285294
}
286295

296+
#if UNROLL_M == 16
297+
BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1);
298+
#endif
299+
#if UNROLL_N == 8
300+
BLOCK(8, 8); BLOCK(4, 8);
301+
#endif
287302
BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
288303
BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
289304

param.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
29992999
#define GEMM_DEFAULT_OFFSET_B 0
30003000
#define GEMM_DEFAULT_ALIGN 0x03fffUL
30013001

3002-
#define SGEMM_DEFAULT_UNROLL_M 8
3002+
#define SGEMM_DEFAULT_UNROLL_M 16
30033003
#define SGEMM_DEFAULT_UNROLL_N 4
30043004

30053005
#define DGEMM_DEFAULT_UNROLL_M 8

0 commit comments

Comments
 (0)