Skip to content

Commit 71c6dee

Browse files
authored
Merge pull request #1821 from ashwinyes/develop_aarch64_armv8neonkernels
Use ThunderX2 Neon Kernels for ARMV8 Target
2 parents: a719235 + 21f46a1; commit: 71c6dee

File tree

8 files changed: 643 additions (+643) and 85 deletions (-85)

driver/others/parameter.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -730,15 +730,15 @@ void blas_set_parameter(void){
730730

731731
#if defined(ARCH_ARM64)
732732

733-
#if defined(VULCAN) || defined(THUNDERX2T99)
733+
#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8)
734734
unsigned long dgemm_prefetch_size_a;
735735
unsigned long dgemm_prefetch_size_b;
736736
unsigned long dgemm_prefetch_size_c;
737737
#endif
738738

739739
void blas_set_parameter(void)
740740
{
741-
#if defined(VULCAN) || defined(THUNDERX2T99)
741+
#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8)
742742
dgemm_p = 160;
743743
dgemm_q = 128;
744744
dgemm_r = 4096;

interface/swap.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@
4242
#include "functable.h"
4343
#endif
4444

45-
#if defined(THUNDERX2T99) || defined(VULCAN)
45+
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
4646
// Multithreaded swap gives performance benefits in ThunderX2T99
4747
#else
4848
// Disable multi-threading as it does not show any performance

kernel/arm64/KERNEL.ARMV8

Lines changed: 152 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,3 @@
1-
SAMAXKERNEL = amax.S
2-
DAMAXKERNEL = amax.S
3-
CAMAXKERNEL = zamax.S
4-
ZAMAXKERNEL = zamax.S
5-
61
SAMINKERNEL = ../arm/amin.c
72
DAMINKERNEL = ../arm/amin.c
83
CAMINKERNEL = ../arm/zamin.c
@@ -14,11 +9,6 @@ DMAXKERNEL = ../arm/max.c
149
SMINKERNEL = ../arm/min.c
1510
DMINKERNEL = ../arm/min.c
1611

17-
ISAMAXKERNEL = iamax.S
18-
IDAMAXKERNEL = iamax.S
19-
ICAMAXKERNEL = izamax.S
20-
IZAMAXKERNEL = izamax.S
21-
2212
ISAMINKERNEL = ../arm/iamin.c
2313
IDAMINKERNEL = ../arm/iamin.c
2414
ICAMINKERNEL = ../arm/izamin.c
@@ -30,33 +20,35 @@ IDMAXKERNEL = ../arm/imax.c
3020
ISMINKERNEL = ../arm/imin.c
3121
IDMINKERNEL = ../arm/imin.c
3222

33-
SASUMKERNEL = asum.S
34-
DASUMKERNEL = asum.S
35-
CASUMKERNEL = casum.S
36-
ZASUMKERNEL = zasum.S
23+
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
24+
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
25+
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
26+
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
3727

38-
SAXPYKERNEL = axpy.S
39-
DAXPYKERNEL = axpy.S
40-
CAXPYKERNEL = zaxpy.S
41-
ZAXPYKERNEL = zaxpy.S
28+
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
29+
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
30+
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
31+
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
4232

43-
SCOPYKERNEL = copy.S
44-
DCOPYKERNEL = copy.S
45-
CCOPYKERNEL = copy.S
46-
ZCOPYKERNEL = copy.S
33+
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
34+
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
35+
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
36+
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
4737

48-
SDOTKERNEL = dot.S
49-
DDOTKERNEL = dot.S
50-
CDOTKERNEL = zdot.S
51-
ZDOTKERNEL = zdot.S
52-
DSDOTKERNEL = dot.S
38+
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
39+
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
40+
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
41+
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
5342

54-
ifneq ($(OS_DARWIN)$(CROSS),11)
55-
SNRM2KERNEL = nrm2.S
56-
DNRM2KERNEL = nrm2.S
57-
CNRM2KERNEL = znrm2.S
58-
ZNRM2KERNEL = znrm2.S
59-
endif
43+
SAMAXKERNEL = amax.S
44+
DAMAXKERNEL = amax.S
45+
CAMAXKERNEL = zamax.S
46+
ZAMAXKERNEL = zamax.S
47+
48+
SAXPYKERNEL = axpy.S
49+
DAXPYKERNEL = daxpy_thunderx2t99.S
50+
CAXPYKERNEL = zaxpy.S
51+
ZAXPYKERNEL = zaxpy.S
6052

6153
SROTKERNEL = rot.S
6254
DROTKERNEL = rot.S
@@ -68,11 +60,6 @@ DSCALKERNEL = scal.S
6860
CSCALKERNEL = zscal.S
6961
ZSCALKERNEL = zscal.S
7062

71-
SSWAPKERNEL = swap.S
72-
DSWAPKERNEL = swap.S
73-
CSWAPKERNEL = swap.S
74-
ZSWAPKERNEL = swap.S
75-
7663
SGEMVNKERNEL = gemv_n.S
7764
DGEMVNKERNEL = gemv_n.S
7865
CGEMVNKERNEL = zgemv_n.S
@@ -83,18 +70,137 @@ DGEMVTKERNEL = gemv_t.S
8370
CGEMVTKERNEL = zgemv_t.S
8471
ZGEMVTKERNEL = zgemv_t.S
8572

86-
STRMMKERNEL = ../generic/trmmkernel_4x4.c
73+
74+
SASUMKERNEL = sasum_thunderx2t99.c
75+
DASUMKERNEL = dasum_thunderx2t99.c
76+
CASUMKERNEL = casum_thunderx2t99.c
77+
ZASUMKERNEL = zasum_thunderx2t99.c
78+
79+
SCOPYKERNEL = copy_thunderx2t99.c
80+
DCOPYKERNEL = copy_thunderx2t99.c
81+
CCOPYKERNEL = copy_thunderx2t99.c
82+
ZCOPYKERNEL = copy_thunderx2t99.c
83+
84+
SSWAPKERNEL = swap_thunderx2t99.S
85+
DSWAPKERNEL = swap_thunderx2t99.S
86+
CSWAPKERNEL = swap_thunderx2t99.S
87+
ZSWAPKERNEL = swap_thunderx2t99.S
88+
89+
ISAMAXKERNEL = iamax_thunderx2t99.c
90+
IDAMAXKERNEL = iamax_thunderx2t99.c
91+
ICAMAXKERNEL = izamax_thunderx2t99.c
92+
IZAMAXKERNEL = izamax_thunderx2t99.c
93+
94+
ifneq ($(OS_DARWIN)$(CROSS),11)
95+
SNRM2KERNEL = scnrm2_thunderx2t99.c
96+
CNRM2KERNEL = scnrm2_thunderx2t99.c
97+
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
98+
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
99+
DNRM2KERNEL = dznrm2_thunderx2t99.c
100+
ZNRM2KERNEL = dznrm2_thunderx2t99.c
101+
endif
102+
103+
DDOTKERNEL = dot_thunderx2t99.c
104+
SDOTKERNEL = dot_thunderx2t99.c
105+
CDOTKERNEL = zdot_thunderx2t99.c
106+
ZDOTKERNEL = zdot_thunderx2t99.c
107+
DSDOTKERNEL = dot.S
108+
109+
ifneq ($(OS_DARWIN)$(CROSS),11)
110+
111+
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
112+
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
113+
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
114+
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
115+
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
116+
SGEMMINCOPYOBJ = sgemm_incopy.o
117+
SGEMMITCOPYOBJ = sgemm_itcopy.o
118+
endif
119+
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
120+
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
121+
SGEMMONCOPYOBJ = sgemm_oncopy.o
122+
SGEMMOTCOPYOBJ = sgemm_otcopy.o
123+
124+
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
125+
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
126+
127+
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
128+
129+
ifeq ($(DGEMM_UNROLL_M), 8)
130+
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
131+
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
132+
else
133+
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
134+
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
135+
endif
136+
137+
DGEMMINCOPYOBJ = dgemm_incopy.o
138+
DGEMMITCOPYOBJ = dgemm_itcopy.o
139+
endif
140+
141+
ifeq ($(DGEMM_UNROLL_N), 4)
142+
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
143+
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
144+
else
145+
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
146+
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
147+
endif
148+
149+
DGEMMONCOPYOBJ = dgemm_oncopy.o
150+
DGEMMOTCOPYOBJ = dgemm_otcopy.o
151+
152+
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
153+
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
154+
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
155+
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
156+
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
157+
CGEMMINCOPYOBJ = cgemm_incopy.o
158+
CGEMMITCOPYOBJ = cgemm_itcopy.o
159+
endif
160+
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
161+
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
162+
CGEMMONCOPYOBJ = cgemm_oncopy.o
163+
CGEMMOTCOPYOBJ = cgemm_otcopy.o
164+
165+
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
166+
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
167+
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
168+
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
169+
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
170+
ZGEMMINCOPYOBJ = zgemm_incopy.o
171+
ZGEMMITCOPYOBJ = zgemm_itcopy.o
172+
endif
173+
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
174+
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
175+
ZGEMMONCOPYOBJ = zgemm_oncopy.o
176+
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
177+
178+
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
179+
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
180+
endif
181+
182+
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
183+
SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
184+
endif
185+
186+
ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
187+
CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
188+
endif
189+
190+
ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
191+
ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
192+
endif
193+
194+
else
195+
196+
STRMMKERNEL = ../generic/trmmkernel_2x2.c
87197
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
88198
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
89199
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
90200

91-
ifneq ($(OS_DARWIN)$(CROSS),11)
92-
SGEMMKERNEL = sgemm_kernel_4x4.S
93-
else
94201
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
95-
endif
96-
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
97-
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
202+
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
203+
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
98204
SGEMMONCOPYOBJ = sgemm_oncopy.o
99205
SGEMMOTCOPYOBJ = sgemm_otcopy.o
100206

@@ -116,26 +222,4 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
116222
ZGEMMONCOPYOBJ = zgemm_oncopy.o
117223
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
118224

119-
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
120-
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
121-
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
122-
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
123-
124-
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
125-
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
126-
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
127-
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
128-
129-
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
130-
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
131-
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
132-
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
133-
134-
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
135-
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
136-
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
137-
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
138-
139-
140-
141-
225+
endif

kernel/arm64/KERNEL.CORTEXA57

Lines changed: 46 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,49 @@
1-
include $(KERNELDIR)/KERNEL.ARMV8
1+
SAMINKERNEL = ../arm/amin.c
2+
DAMINKERNEL = ../arm/amin.c
3+
CAMINKERNEL = ../arm/zamin.c
4+
ZAMINKERNEL = ../arm/zamin.c
5+
6+
SMAXKERNEL = ../arm/max.c
7+
DMAXKERNEL = ../arm/max.c
8+
9+
SMINKERNEL = ../arm/min.c
10+
DMINKERNEL = ../arm/min.c
11+
12+
ISAMINKERNEL = ../arm/iamin.c
13+
IDAMINKERNEL = ../arm/iamin.c
14+
ICAMINKERNEL = ../arm/izamin.c
15+
IZAMINKERNEL = ../arm/izamin.c
16+
17+
ISMAXKERNEL = ../arm/imax.c
18+
IDMAXKERNEL = ../arm/imax.c
19+
20+
ISMINKERNEL = ../arm/imin.c
21+
IDMINKERNEL = ../arm/imin.c
22+
23+
STRMMKERNEL = ../generic/trmmkernel_4x4.c
24+
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
25+
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
26+
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
27+
28+
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
29+
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
30+
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
31+
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
32+
33+
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
34+
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
35+
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
36+
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
37+
38+
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
39+
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
40+
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
41+
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
42+
43+
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
44+
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
45+
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
46+
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
247

348
SAMAXKERNEL = amax.S
449
DAMAXKERNEL = amax.S

0 commit comments

Comments
 (0)