Skip to content

Commit dd7a650

Browse files
authored
Merge pull request #59 from xianyi/develop
rebase
2 parents 2931feb + 4a4c50a commit dd7a650

File tree

11 files changed

+5403
-15
lines changed

11 files changed

+5403
-15
lines changed

CONTRIBUTORS.md

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -187,3 +187,6 @@ In chronological order:
187187
* Marius Hillenbrand <https://github.com/mhillenibm>
188188
* [2020-05-12] Revise dynamic architecture detection for IBM z
189189
* [2020-05-12] Add new sgemm and strmm kernel for IBM z14
190+
191+
* Danfeng Zhang <https://github.com/craft-zhang>
192+
* [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53

Makefile.system

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64)
2121
override ARCH=x86_64
2222
else ifeq ($(ARCH), powerpc64)
2323
override ARCH=power
24+
else ifeq ($(ARCH), powerpc)
25+
override ARCH=power
2426
else ifeq ($(ARCH), i386)
2527
override ARCH=x86
2628
else ifeq ($(ARCH), aarch64)
@@ -277,6 +279,15 @@ NO_LAPACK = 1
277279
override FEXTRALIB =
278280
endif
279281

282+
# GCC version probes, assigned only when C_COMPILER is GCC.
# Each variable is set with := so the shell command runs once, when this line is parsed.
# `cut -f1 -d.` picks the major component of `$(CC) -dumpversion`; `cut -f2 -d.` picks the minor one.
# `expr` prints 1 when the comparison holds and 0 otherwise, so later sections test these against 1.
ifeq ($(C_COMPILER), GCC)
283+
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
284+
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
285+
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
286+
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
287+
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
288+
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
289+
endif
290+
280291
#
281292
# OS dependent settings
282293
#
@@ -323,13 +334,7 @@ ifeq ($(C_COMPILER), CLANG)
323334
CCOMMON_OPT += -DMS_ABI
324335
endif
325336

326-
ifeq ($(C_COMPILER), GCC)
327337
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
328-
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
329-
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
330-
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
331-
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
332-
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
333338
ifeq ($(GCCVERSIONGT4), 1)
334339
# GCC Major version > 4
335340
# It is compatible with MSVC ABI.
@@ -343,7 +348,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1)
343348
CCOMMON_OPT += -DMS_ABI
344349
endif
345350
endif
346-
endif
347351

348352
# Ensure the correct stack alignment on Win32
349353
# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97

c_check

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -310,6 +310,7 @@ $linker_a = "";
310310
&& ($flags !~ /advapi32/)
311311
&& ($flags !~ /shell32/)
312312
&& ($flags !~ /omp/)
313+
&& ($flags !~ /[0-9]+/)
313314
) {
314315
$linker_l .= $flags . " "
315316
}

f_check

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -335,6 +335,7 @@ if ($link ne "") {
335335
&& ($flags !~ /advapi32/)
336336
&& ($flags !~ /shell32/)
337337
&& ($flags !~ /omp/)
338+
&& ($flags !~ /[0-9]+/)
338339
&& ($flags !~ /^\-l$/)
339340
) {
340341
$linker_l .= $flags . " ";

kernel/arm64/KERNEL.CORTEXA53

Lines changed: 192 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,194 @@
1-
include $(KERNELDIR)/KERNEL.ARMV8
1+
SAMINKERNEL = ../arm/amin.c
2+
DAMINKERNEL = ../arm/amin.c
3+
CAMINKERNEL = ../arm/zamin.c
4+
ZAMINKERNEL = ../arm/zamin.c
25

6+
SMAXKERNEL = ../arm/max.c
7+
DMAXKERNEL = ../arm/max.c
38

9+
SMINKERNEL = ../arm/min.c
10+
DMINKERNEL = ../arm/min.c
11+
12+
ISAMINKERNEL = ../arm/iamin.c
13+
IDAMINKERNEL = ../arm/iamin.c
14+
ICAMINKERNEL = ../arm/izamin.c
15+
IZAMINKERNEL = ../arm/izamin.c
16+
17+
ISMAXKERNEL = ../arm/imax.c
18+
IDMAXKERNEL = ../arm/imax.c
19+
20+
ISMINKERNEL = ../arm/imin.c
21+
IDMINKERNEL = ../arm/imin.c
22+
23+
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
24+
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
25+
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
26+
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
27+
28+
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
29+
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
30+
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
31+
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
32+
33+
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
34+
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
35+
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
36+
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
37+
38+
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
39+
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
40+
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
41+
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
42+
43+
SAMAXKERNEL = amax.S
44+
DAMAXKERNEL = amax.S
45+
CAMAXKERNEL = zamax.S
46+
ZAMAXKERNEL = zamax.S
47+
48+
SAXPYKERNEL = axpy.S
49+
DAXPYKERNEL = axpy.S
50+
CAXPYKERNEL = zaxpy.S
51+
ZAXPYKERNEL = zaxpy.S
52+
53+
SROTKERNEL = rot.S
54+
DROTKERNEL = rot.S
55+
CROTKERNEL = zrot.S
56+
ZROTKERNEL = zrot.S
57+
58+
SSCALKERNEL = scal.S
59+
DSCALKERNEL = scal.S
60+
CSCALKERNEL = zscal.S
61+
ZSCALKERNEL = zscal.S
62+
63+
SGEMVNKERNEL = gemv_n.S
64+
DGEMVNKERNEL = gemv_n.S
65+
CGEMVNKERNEL = zgemv_n.S
66+
ZGEMVNKERNEL = zgemv_n.S
67+
68+
SGEMVTKERNEL = gemv_t.S
69+
DGEMVTKERNEL = gemv_t.S
70+
CGEMVTKERNEL = zgemv_t.S
71+
ZGEMVTKERNEL = zgemv_t.S
72+
73+
74+
SASUMKERNEL = asum.S
75+
DASUMKERNEL = asum.S
76+
CASUMKERNEL = casum.S
77+
ZASUMKERNEL = zasum.S
78+
79+
SCOPYKERNEL = copy.S
80+
DCOPYKERNEL = copy.S
81+
CCOPYKERNEL = copy.S
82+
ZCOPYKERNEL = copy.S
83+
84+
SSWAPKERNEL = swap.S
85+
DSWAPKERNEL = swap.S
86+
CSWAPKERNEL = swap.S
87+
ZSWAPKERNEL = swap.S
88+
89+
ISAMAXKERNEL = iamax.S
90+
IDAMAXKERNEL = iamax.S
91+
ICAMAXKERNEL = izamax.S
92+
IZAMAXKERNEL = izamax.S
93+
94+
SNRM2KERNEL = nrm2.S
95+
DNRM2KERNEL = nrm2.S
96+
CNRM2KERNEL = znrm2.S
97+
ZNRM2KERNEL = znrm2.S
98+
99+
DDOTKERNEL = dot.S
100+
SDOTKERNEL = dot.S
101+
CDOTKERNEL = zdot.S
102+
ZDOTKERNEL = zdot.S
103+
DSDOTKERNEL = dot.S
104+
105+
DGEMM_BETA = dgemm_beta.S
106+
SGEMM_BETA = sgemm_beta.S
107+
108+
# Select the SGEMM and STRMM kernel sources from the single-precision unroll factors.
# When SGEMM_UNROLL_M x SGEMM_UNROLL_N is exactly 8x8, use the files carrying the
# `_cortexa53` suffix; for any other combination use the file named only by the unroll factors.
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
109+
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
110+
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
111+
else
112+
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
113+
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
114+
endif
115+
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
116+
ifeq ($(SGEMM_UNROLL_M), 16)
117+
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
118+
else
119+
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
120+
endif
121+
ifeq ($(SGEMM_UNROLL_M), 4)
122+
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
123+
else
124+
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
125+
endif
126+
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
127+
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
128+
endif
129+
ifeq ($(SGEMM_UNROLL_N), 16)
130+
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
131+
else
132+
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
133+
endif
134+
ifeq ($(SGEMM_UNROLL_N), 4)
135+
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
136+
else
137+
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
138+
endif
139+
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
140+
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
141+
142+
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
143+
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
144+
145+
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
146+
147+
ifeq ($(DGEMM_UNROLL_M), 8)
148+
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
149+
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
150+
else
151+
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
152+
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
153+
endif
154+
155+
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
156+
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
157+
endif
158+
159+
ifeq ($(DGEMM_UNROLL_N), 4)
160+
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
161+
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
162+
else
163+
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
164+
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
165+
endif
166+
167+
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
168+
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
169+
170+
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
171+
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
172+
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
173+
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
174+
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
175+
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
176+
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
177+
endif
178+
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
179+
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
180+
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
181+
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
182+
183+
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
184+
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
185+
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
186+
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
187+
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
188+
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
189+
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
190+
endif
191+
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
192+
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
193+
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
194+
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

0 commit comments

Comments
 (0)