Skip to content

Commit ea78106

Browse files
authored
Merge pull request #2614 from mhillenibm/gemm_vec_z14
s390x: Improve performance of SGEMM and STRMM on z14 and newer
2 parents f94c53e + cb9dc36 commit ea78106

File tree

7 files changed

+571
-56
lines changed

7 files changed

+571
-56
lines changed

CONTRIBUTORS.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,4 +183,7 @@ In chronological order:
183183

184184
* Rajalakshmi Srinivasaraghavan <https://github.com/RajalakshmiSR>
185185
* [2020-04-15] Half-precision GEMM for bfloat16
186-
186+
187+
* Marius Hillenbrand <https://github.com/mhillenibm>
188+
* [2020-05-12] Revise dynamic architecture detection for IBM z
189+
* [2020-05-12] Add new sgemm and strmm kernel for IBM z14

Makefile.system

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -563,8 +563,27 @@ DYNAMIC_CORE += EMAG8180
563563
endif
564564

565565
ifeq ($(ARCH), zarch)
566-
DYNAMIC_CORE = Z13
566+
DYNAMIC_CORE = ZARCH_GENERIC
567+
568+
# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer
569+
# Compare major/minor numerically. `expr` falls back to a string comparison
# for dotted versions, which would rank e.g. "10.2.0" below "5.2".
# Yields "1" when the compiler is gcc 5.2 or newer, and an empty string otherwise.
GCC_GE_52 := $(shell $(CC) -dumpversion | awk -F. '{ if ($$1 > 5 || ($$1 == 5 && $$2 >= 2)) print 1 }')
570+
571+
ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release)
572+
RHEL_WITH_Z13 := $(subst 0,,$(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"))
573+
endif
574+
575+
ifeq ($(or $(GCC_GE_52),$(RHEL_WITH_Z13)), 1)
576+
DYNAMIC_CORE += Z13
577+
else
578+
$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x)
579+
endif
580+
581+
GCC_MAJOR_GE_7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
582+
ifeq ($(GCC_MAJOR_GE_7), 1)
567583
DYNAMIC_CORE += Z14
584+
else
585+
$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x)
586+
endif
568587
endif
569588

570589
ifeq ($(ARCH), power)

Makefile.zarch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector
55
endif
66

77
ifeq ($(CORE), Z14)
8-
CCOMMON_OPT += -march=z14 -mzvector
8+
CCOMMON_OPT += -march=z14 -mzvector -O3
99
FCOMMON_OPT += -march=z14 -mzvector
1010
endif

driver/others/dynamic_zarch.c

Lines changed: 90 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,58 @@
1-
21
#include "common.h"
2+
#include <stdbool.h>
3+
4+
// Gate kernels for z13 and z14 on gcc version
5+
#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \
6+
/* RHEL 7 since 7.3: */ \
7+
(__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \
8+
__GNUC_RH_RELEASE__ >= 11)
9+
#define HAVE_Z13_SUPPORT
10+
#endif
11+
12+
#if __GNUC__ >= 7
13+
#define HAVE_Z14_SUPPORT
14+
#endif
15+
16+
// Guard the use of getauxval() on glibc version >= 2.16
17+
#ifdef __GLIBC__
18+
#include <features.h>
19+
#if __GLIBC_PREREQ(2, 16)
20+
#include <sys/auxv.h>
21+
#define HAVE_GETAUXVAL 1
22+
23+
/**
 * Return the hardware-capability bits reported for this process.
 *
 * Reads AT_HWCAP through getauxval(). When the LD_HWCAP_MASK environment
 * variable is set, the value is ANDed with that mask; the mask is parsed by
 * strtoul() with base 0, so decimal, octal and 0x-prefixed hex are accepted.
 *
 * @return the (possibly masked) AT_HWCAP value.
 */
static unsigned long get_hwcap(void)
24+
{
25+
unsigned long hwcap = getauxval(AT_HWCAP);
26+
char *maskenv;
27+
28+
// honor requests for not using specific CPU features in LD_HWCAP_MASK
29+
maskenv = getenv("LD_HWCAP_MASK");
30+
if (maskenv)
31+
hwcap &= strtoul(maskenv, NULL, 0);
32+
33+
return hwcap;
34+
// note that a missing auxval is interpreted as no capabilities
35+
// available, which is safe.
36+
}
337

38+
#else // __GLIBC_PREREQ(2, 16)
39+
// Emit a build-time diagnostic only; "#warn" is not a valid directive and
// would abort compilation whenever this branch is active.
#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
40+
41+
static unsigned long get_hwcap(void) {
42+
// treat missing support for getauxval() as no capabilities available,
43+
// which is safe.
44+
return 0;
45+
}
46+
#endif // __GLIBC_PREREQ(2, 16)
47+
#endif // __GLIBC__
48+
49+
extern gotoblas_t gotoblas_ZARCH_GENERIC;
50+
#ifdef HAVE_Z13_SUPPORT
451
extern gotoblas_t gotoblas_Z13;
52+
#endif
53+
#ifdef HAVE_Z14_SUPPORT
554
extern gotoblas_t gotoblas_Z14;
6-
//extern gotoblas_t gotoblas_Z15;
7-
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
8-
//extern gotoblas_t gotoblas_Z14;
9-
//#endif
55+
#endif
1056

1157
#define NUM_CORETYPES 4
1258

@@ -16,47 +62,50 @@ static char* corename[] = {
1662
"unknown",
1763
"Z13",
1864
"Z14",
19-
// "Z15",
2065
"ZARCH_GENERIC",
2166
};
2267

2368
char* gotoblas_corename(void) {
69+
#ifdef HAVE_Z13_SUPPORT
2470
if (gotoblas == &gotoblas_Z13) return corename[1];
71+
#endif
72+
#ifdef HAVE_Z14_SUPPORT
2573
if (gotoblas == &gotoblas_Z14) return corename[2];
26-
// if (gotoblas == &gotoblas_Z15) return corename[3];
27-
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
28-
// if (gotoblas == &gotoblas_POWER9) return corename[3];
29-
//#endif
30-
return corename[0]; // try generic?
74+
#endif
75+
if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3];
76+
77+
return corename[0];
3178
}
3279

33-
// __builtin_cpu_is is not supported by zarch
80+
/**
81+
* Detect the fitting set of kernels by retrieving the CPU features supported by
82+
* the OS from the auxiliary value AT_HWCAP and choosing the set of kernels
83+
* ("coretype") that exploits most of the features and can be compiled with the
84+
* available gcc version.
85+
* Note that we cannot use vector registers on a z13 or newer unless supported
86+
* by the OS kernel (which needs to handle them properly during context switch).
87+
*/
3488
static gotoblas_t* get_coretype(void) {
35-
FILE* infile;
36-
char buffer[512], * p;
37-
38-
p = (char*)NULL;
39-
infile = fopen("/proc/sysinfo", "r");
40-
while (fgets(buffer, sizeof(buffer), infile)) {
41-
if (!strncmp("Type", buffer, 4)) {
42-
p = strchr(buffer, ':') + 2;
43-
#if 0
44-
fprintf(stderr, "%s\n", p);
45-
#endif
46-
break;
47-
}
48-
}
4989

50-
fclose(infile);
90+
unsigned long hwcap __attribute__((unused)) = get_hwcap();
5191

52-
if (strstr(p, "2964")) return &gotoblas_Z13;
53-
if (strstr(p, "2965")) return &gotoblas_Z13;
54-
if (strstr(p, "3906")) return &gotoblas_Z14;
55-
if (strstr(p, "3907")) return &gotoblas_Z14;
56-
if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14
57-
if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14
92+
// z14 and z15 systems: exploit Vector Facility (SIMD) and
93+
// Vector-Enhancements Facility 1 (float SIMD instructions), if present.
94+
#ifdef HAVE_Z14_SUPPORT
95+
if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
96+
return &gotoblas_Z14;
97+
#endif
98+
99+
// z13: Vector Facility (SIMD for double)
100+
#ifdef HAVE_Z13_SUPPORT
101+
if (hwcap & HWCAP_S390_VX)
102+
return &gotoblas_Z13;
103+
#endif
58104

59-
return NULL; // should be ZARCH_GENERIC
105+
// fallback in case of missing compiler support, systems before z13, or
106+
// when the OS does not advertise support for the Vector Facility (e.g.,
107+
// missing support in the OS kernel)
108+
return &gotoblas_ZARCH_GENERIC;
60109
}
61110

62111
static gotoblas_t* force_coretype(char* coretype) {
@@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) {
76125

77126
switch (found)
78127
{
128+
#ifdef HAVE_Z13_SUPPORT
79129
case 1: return (&gotoblas_Z13);
130+
#endif
131+
#ifdef HAVE_Z14_SUPPORT
80132
case 2: return (&gotoblas_Z14);
81-
// case 3: return (&gotoblas_Z15);
82-
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
83-
// case 3: return (&gotoblas_POWER9);
84-
//#endif
133+
#endif
134+
case 3: return (&gotoblas_ZARCH_GENERIC);
85135
default: return NULL;
86136
}
87137
snprintf(message, 128, "Core not found: %s\n", coretype);
@@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) {
109159

110160
if (gotoblas == NULL)
111161
{
112-
snprintf(coremsg, 128, "Falling back to Z14 core\n");
162+
snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n");
113163
openblas_warning(1, coremsg);
114-
gotoblas = &gotoblas_Z14;
164+
gotoblas = &gotoblas_ZARCH_GENERIC;
115165
}
116166

117167
if (gotoblas && gotoblas->init) {

kernel/zarch/KERNEL.Z14

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -86,23 +86,23 @@ DGEMVTKERNEL = dgemv_t_4.c
8686
CGEMVTKERNEL = cgemv_t_4.c
8787
ZGEMVTKERNEL = zgemv_t_4.c
8888

89-
STRMMKERNEL = strmm8x4V.S
89+
STRMMKERNEL = gemm_vec.c
9090
DTRMMKERNEL = trmm8x4V.S
9191
CTRMMKERNEL = ctrmm4x4V.S
9292
ZTRMMKERNEL = ztrmm4x4V.S
9393

94-
SGEMMKERNEL = strmm8x4V.S
95-
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
96-
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
97-
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
98-
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
94+
SGEMMKERNEL = gemm_vec.c
95+
ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N))
96+
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
97+
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
9998
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
10099
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
100+
endif
101+
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
102+
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
101103
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
102104
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
103105

104-
105-
106106
DGEMMKERNEL = gemm8x4V.S
107107
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
108108
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
@@ -145,7 +145,3 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
145145
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
146146
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
147147

148-
149-
150-
151-

0 commit comments

Comments
 (0)