Skip to content

Commit e8ca5a5

Browse files
authored
Merge pull request #1919 from fenrus75/haswelltuning
(sgemm) Apply some of the SKYLAKEX optimizations also to HASWELL
2 parents 78d877b + c4e23dd commit e8ca5a5

File tree

4 files changed

+39
-10
lines changed

4 files changed

+39
-10
lines changed

kernel/Makefile

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,27 @@ endif
55
TOPDIR = ..
66
include $(TOPDIR)/Makefile.system
77

8+
AVX2OPT =
9+
ifeq ($(C_COMPILER), GCC)
10+
# AVX2 support was added in 4.7.0
11+
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
12+
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
13+
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
14+
AVX2OPT = -mavx2
15+
endif
16+
endif
17+
ifeq ($(C_COMPILER), CLANG)
18+
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
19+
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
20+
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
21+
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11)
22+
AVX2OPT = -mavx2
23+
endif
24+
endif
25+
ifdef NO_AVX2
26+
AVX2OPT=
27+
endif
28+
829
ifdef TARGET_CORE
930
ifeq ($(TARGET_CORE), SKYLAKEX)
1031
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
@@ -16,8 +37,10 @@ ifeq ($(TARGET_CORE), SKYLAKEX)
1637
override CFLAGS += -fno-asynchronous-unwind-tables
1738
endif
1839
endif
40+
else ifeq ($(TARGET_CORE), HASWELL)
41+
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
1942
else
20-
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
43+
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
2144
endif
2245
BUILD_KERNEL = 1
2346
KDIR =

kernel/x86_64/KERNEL.HASWELL

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,10 @@ ZAXPYKERNEL = zaxpy.c
3333

3434
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
3535
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
36+
SGEMM_BETA = sgemm_beta_skylakex.c
3637
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
3738
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
38-
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
39+
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
3940
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
4041
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
4142
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)

kernel/x86_64/sgemm_beta_skylakex.c

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,30 +61,36 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
6161
c_offset = c;
6262

6363
if (beta == ZERO){
64-
__m512 z_zero;
65-
__m256 y_zero;
6664

67-
z_zero = _mm512_setzero_ps();
68-
y_zero = _mm256_setzero_ps();
6965
j = n;
7066
do {
7167
c_offset1 = c_offset;
7268
c_offset += ldc;
7369

7470
i = m;
75-
71+
#ifdef __AVX2__
7672
while (i >= 32) {
73+
#ifdef __AVX512CD__
74+
__m512 z_zero = _mm512_setzero_ps();
7775
_mm512_storeu_ps(c_offset1, z_zero);
7876
_mm512_storeu_ps(c_offset1 + 16, z_zero);
77+
#else
78+
__m256 y_zero = _mm256_setzero_ps();
79+
_mm256_storeu_ps(c_offset1, y_zero);
80+
_mm256_storeu_ps(c_offset1 + 8, y_zero);
81+
_mm256_storeu_ps(c_offset1 + 16, y_zero);
82+
_mm256_storeu_ps(c_offset1 + 24, y_zero);
83+
#endif
7984
c_offset1 += 32;
8085
i -= 32;
8186
}
8287
while (i >= 8) {
88+
__m256 y_zero = _mm256_setzero_ps();
8389
_mm256_storeu_ps(c_offset1, y_zero);
8490
c_offset1 += 8;
8591
i -= 8;
8692
}
87-
93+
#endif
8894
while (i > 0) {
8995
*c_offset1 = ZERO;
9096
c_offset1 ++;

kernel/x86_64/sgemm_ncopy_4_skylakex.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __
4949
FLOAT *b_offset;
5050
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
5151
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
52-
FLOAT ctemp9, ctemp10, ctemp11, ctemp12;
53-
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
52+
FLOAT ctemp9, ctemp13;
5453

5554
a_offset = a;
5655
b_offset = b;

0 commit comments

Comments
 (0)