Skip to content

Commit 310ea55

Browse files
committed
Simplifying ARMv8 build parameters
ARMv8 builds were a bit mixed up, with ThunderX2 code in ARMv8 mode (which is not right because TX2 is ARMv8.1) as well as requiring a few redundancies in the defines, making it harder to maintain and understand what core has what. A few other minor issues were also fixed. Tests were made on the following cores: A53, A57, A72, Falkor, ThunderX, ThunderX2, and XGene. Tests were: OpenBLAS/test, OpenBLAS/benchmark, BLAS-Tester. A summary: * Removed TX2 code from ARMv8 build, to make sure it is compatible with all ARMv8 cores, not just v8.1. Also, the TX2 code has actually harmed performance on big cores. * Commoned up ARMv8 architectures' defines in params.h, to make sure that all will benefit from ARMv8 settings, in addition to their own. * Adding a few more cores, using ARMv8's include strategy, to benefit from compiler optimisations using mtune. Also updated cache information from the manuals, making sure we set good conservative values by default. Removed Vulcan, as it's an alias to TX2. * Auto-detecting most of those cores, but also updating the forced compilation in getarch.c, to make sure the parameters are the same whether compiled natively or forced arch. Benefits: * ARMv8 build is now guaranteed to work on all ARMv8 cores * Improved performance for ARMv8 builds on some cores (A72, Falkor, ThunderX1 and 2: up to 11%) over current develop * Improved performance for *all* cores compared to develop branch before TX2's patch (9% ~ 36%) * ThunderX1 builds are 14% faster than ARMv8 on TX1, 9% faster than current develop's branch and 8% faster than develop before tx2 patches Issues: * Regression from current develop branch for A53 (-12%) and A57 (-3%) with ARMv8 builds, but still faster than before TX2's commit (+15% and +24% respectively). This can be improved with a simplification of TX2's code, to be done in future patches. At least the code is guaranteed to be ARMv8.0 now. 
Comments: * CortexA57 builds are unchanged on A57 hardware from develop's branch, which makes sense, as it's untouched. * CortexA72 builds improve over A57 on A72 hardware, even if they're using the same includes due to new compiler tuning in the makefile.
1 parent 368d14f commit 310ea55

File tree

11 files changed

+249
-200
lines changed

11 files changed

+249
-200
lines changed

Makefile.arm64

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,37 @@ CCOMMON_OPT += -march=armv8-a
44
FCOMMON_OPT += -march=armv8-a
55
endif
66

7+
ifeq ($(CORE), CORTEXA53)
8+
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
9+
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
10+
endif
11+
712
ifeq ($(CORE), CORTEXA57)
8-
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
9-
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
13+
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
14+
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
15+
endif
16+
17+
ifeq ($(CORE), CORTEXA72)
18+
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
19+
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
1020
endif
1121

12-
ifeq ($(CORE), VULCAN)
13-
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
14-
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
22+
ifeq ($(CORE), CORTEXA73)
23+
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
24+
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
1525
endif
1626

1727
ifeq ($(CORE), THUNDERX)
18-
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
19-
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
28+
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
29+
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
30+
endif
31+
32+
ifeq ($(CORE), FALKOR)
33+
CCOMMON_OPT += -march=armv8.1-a -mtune=falkor
34+
FCOMMON_OPT += -march=armv8.1-a -mtune=falkor
2035
endif
2136

2237
ifeq ($(CORE), THUNDERX2T99)
23-
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
24-
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
38+
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
39+
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
2540
endif

TargetList.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,11 @@ ARMV5
8383

8484
8.ARM 64-bit CPU:
8585
ARMV8
86+
CORTEXA53
8687
CORTEXA57
87-
VULCAN
88+
CORTEXA72
89+
CORTEXA73
90+
FALKOR
8891
THUNDERX
8992
THUNDERX2T99
9093

cpuid_arm64.c

Lines changed: 72 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -29,25 +29,37 @@
2929

3030
#define CPU_UNKNOWN 0
3131
#define CPU_ARMV8 1
32-
#define CPU_CORTEXA57 2
33-
#define CPU_VULCAN 3
34-
#define CPU_THUNDERX 4
35-
#define CPU_THUNDERX2T99 5
32+
// Arm
33+
#define CPU_CORTEXA53 2
34+
#define CPU_CORTEXA57 3
35+
#define CPU_CORTEXA72 4
36+
#define CPU_CORTEXA73 5
37+
// Qualcomm
38+
#define CPU_FALKOR 6
39+
// Cavium
40+
#define CPU_THUNDERX 7
41+
#define CPU_THUNDERX2T99 8
3642

3743
static char *cpuname[] = {
3844
"UNKNOWN",
3945
"ARMV8" ,
46+
"CORTEXA53",
4047
"CORTEXA57",
41-
"VULCAN",
48+
"CORTEXA72",
49+
"CORTEXA73",
50+
"FALKOR",
4251
"THUNDERX",
4352
"THUNDERX2T99"
4453
};
4554

4655
static char *cpuname_lower[] = {
4756
"unknown",
48-
"armv8" ,
57+
"armv8",
58+
"cortexa53",
4959
"cortexa57",
50-
"vulcan",
60+
"cortexa72",
61+
"cortexa73",
62+
"falkor",
5163
"thunderx",
5264
"thunderx2t99"
5365
};
@@ -114,14 +126,24 @@ int detect(void)
114126

115127
fclose(infile);
116128
if(cpu_part != NULL && cpu_implementer != NULL) {
117-
if (strstr(cpu_implementer, "0x41") &&
118-
(strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08")))
119-
return CPU_CORTEXA57; //or compatible, ex. A72
120-
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
121-
return CPU_VULCAN;
122-
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
129+
// Arm
130+
if (strstr(cpu_implementer, "0x41")) {
131+
if (strstr(cpu_part, "0xd03"))
132+
return CPU_CORTEXA53;
133+
else if (strstr(cpu_part, "0xd07"))
134+
return CPU_CORTEXA57;
135+
else if (strstr(cpu_part, "0xd08"))
136+
return CPU_CORTEXA72;
137+
else if (strstr(cpu_part, "0xd09"))
138+
return CPU_CORTEXA73;
139+
}
140+
// Qualcomm
141+
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
142+
return CPU_FALKOR;
143+
// Cavium
144+
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1"))
123145
return CPU_THUNDERX;
124-
else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
146+
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
125147
return CPU_THUNDERX2T99;
126148
}
127149

@@ -180,62 +202,62 @@ void get_subdirname(void)
180202
void get_cpuconfig(void)
181203
{
182204

205+
// All arches should define ARMv8
206+
printf("#define ARMV8\n");
207+
printf("#define HAVE_NEON\n"); // This shouldn't be necessary
208+
printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary
209+
183210
int d = detect();
184211
switch (d)
185212
{
186213

214+
case CPU_CORTEXA53:
215+
printf("#define %s\n", cpuname[d]);
216+
// Fall-through
187217
case CPU_ARMV8:
188-
printf("#define ARMV8\n");
189-
printf("#define L1_DATA_SIZE 32768\n");
190-
printf("#define L1_DATA_LINESIZE 64\n");
191-
printf("#define L2_SIZE 262144\n");
192-
printf("#define L2_LINESIZE 64\n");
193-
printf("#define DTB_DEFAULT_ENTRIES 64\n");
194-
printf("#define DTB_SIZE 4096\n");
195-
printf("#define L2_ASSOCIATIVE 4\n");
196-
break;
197-
198-
case CPU_VULCAN:
199-
printf("#define VULCAN \n");
200-
printf("#define HAVE_VFP \n");
201-
printf("#define HAVE_VFPV3 \n");
202-
printf("#define HAVE_NEON \n");
203-
printf("#define HAVE_VFPV4 \n");
204-
printf("#define L1_CODE_SIZE 32768 \n");
205-
printf("#define L1_CODE_LINESIZE 64 \n");
206-
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
207-
printf("#define L1_DATA_SIZE 32768 \n");
208-
printf("#define L1_DATA_LINESIZE 64 \n");
209-
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
210-
printf("#define L2_SIZE 262144 \n");
211-
printf("#define L2_LINESIZE 64 \n");
212-
printf("#define L2_ASSOCIATIVE 8 \n");
213-
printf("#define L3_SIZE 33554432 \n");
214-
printf("#define L3_LINESIZE 64 \n");
215-
printf("#define L3_ASSOCIATIVE 32 \n");
216-
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
217-
printf("#define DTB_SIZE 4096 \n");
218+
// Minimum parameters for ARMv8 (based on A53)
219+
printf("#define L1_DATA_SIZE 32768\n");
220+
printf("#define L1_DATA_LINESIZE 64\n");
221+
printf("#define L2_SIZE 262144\n");
222+
printf("#define L2_LINESIZE 64\n");
223+
printf("#define DTB_DEFAULT_ENTRIES 64\n");
224+
printf("#define DTB_SIZE 4096\n");
225+
printf("#define L2_ASSOCIATIVE 4\n");
218226
break;
219227

220228
case CPU_CORTEXA57:
221-
printf("#define CORTEXA57\n");
222-
printf("#define HAVE_VFP\n");
223-
printf("#define HAVE_VFPV3\n");
224-
printf("#define HAVE_NEON\n");
225-
printf("#define HAVE_VFPV4\n");
229+
case CPU_CORTEXA72:
230+
case CPU_CORTEXA73:
231+
// Common minimum settings for these Arm cores
232+
// Can change a lot, but we need to be conservative
233+
// TODO: detect info from /sys if possible
234+
printf("#define %s\n", cpuname[d]);
226235
printf("#define L1_CODE_SIZE 49152\n");
227236
printf("#define L1_CODE_LINESIZE 64\n");
228237
printf("#define L1_CODE_ASSOCIATIVE 3\n");
229238
printf("#define L1_DATA_SIZE 32768\n");
230239
printf("#define L1_DATA_LINESIZE 64\n");
231240
printf("#define L1_DATA_ASSOCIATIVE 2\n");
232-
printf("#define L2_SIZE 2097152\n");
241+
printf("#define L2_SIZE 524288\n");
233242
printf("#define L2_LINESIZE 64\n");
234243
printf("#define L2_ASSOCIATIVE 16\n");
235244
printf("#define DTB_DEFAULT_ENTRIES 64\n");
236245
printf("#define DTB_SIZE 4096\n");
237246
break;
238247

248+
case CPU_FALKOR:
249+
printf("#define FALKOR\n");
250+
printf("#define L1_CODE_SIZE 65536\n");
251+
printf("#define L1_CODE_LINESIZE 64\n");
252+
printf("#define L1_DATA_SIZE 32768\n");
253+
printf("#define L1_DATA_LINESIZE 128\n");
254+
printf("#define L2_SIZE 524288\n");
255+
printf("#define L2_LINESIZE 64\n");
256+
printf("#define DTB_DEFAULT_ENTRIES 64\n");
257+
printf("#define DTB_SIZE 4096\n");
258+
printf("#define L2_ASSOCIATIVE 16\n");
259+
break;
260+
239261
case CPU_THUNDERX:
240262
printf("#define THUNDERX\n");
241263
printf("#define L1_DATA_SIZE 32768\n");
@@ -249,10 +271,6 @@ void get_cpuconfig(void)
249271

250272
case CPU_THUNDERX2T99:
251273
printf("#define VULCAN \n");
252-
printf("#define HAVE_VFP \n");
253-
printf("#define HAVE_VFPV3 \n");
254-
printf("#define HAVE_NEON \n");
255-
printf("#define HAVE_VFPV4 \n");
256274
printf("#define L1_CODE_SIZE 32768 \n");
257275
printf("#define L1_CODE_LINESIZE 64 \n");
258276
printf("#define L1_CODE_ASSOCIATIVE 8 \n");

getarch.c

Lines changed: 64 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -927,11 +927,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
927927
#define ARCHCONFIG "-DARMV8 " \
928928
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
929929
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
930-
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
930+
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
931+
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
931932
#define LIBNAME "armv8"
932933
#define CORENAME "ARMV8"
933934
#endif
934935

936+
#ifdef FORCE_CORTEXA53
937+
#define FORCE
938+
#define ARCHITECTURE "ARM64"
939+
#define SUBARCHITECTURE "CORTEXA53"
940+
#define SUBDIRNAME "arm64"
941+
#define ARCHCONFIG "-DCORTEXA53 " \
942+
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
943+
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
944+
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
945+
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
946+
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
947+
#define LIBNAME "cortexa53"
948+
#define CORENAME "CORTEXA53"
949+
#else
950+
#endif
951+
935952
#ifdef FORCE_CORTEXA57
936953
#define FORCE
937954
#define ARCHITECTURE "ARM64"
@@ -942,26 +959,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
942959
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
943960
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
944961
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
945-
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
962+
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
946963
#define LIBNAME "cortexa57"
947964
#define CORENAME "CORTEXA57"
948965
#else
949966
#endif
950967

951-
#ifdef FORCE_VULCAN
968+
#ifdef FORCE_CORTEXA72
952969
#define FORCE
953970
#define ARCHITECTURE "ARM64"
954-
#define SUBARCHITECTURE "VULCAN"
971+
#define SUBARCHITECTURE "CORTEXA72"
955972
#define SUBDIRNAME "arm64"
956-
#define ARCHCONFIG "-DVULCAN " \
957-
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
958-
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
959-
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
960-
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
973+
#define ARCHCONFIG "-DCORTEXA72 " \
974+
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
975+
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
976+
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
977+
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
978+
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
979+
#define LIBNAME "cortexa72"
980+
#define CORENAME "CORTEXA72"
981+
#else
982+
#endif
983+
984+
#ifdef FORCE_CORTEXA73
985+
#define FORCE
986+
#define ARCHITECTURE "ARM64"
987+
#define SUBARCHITECTURE "CORTEXA73"
988+
#define SUBDIRNAME "arm64"
989+
#define ARCHCONFIG "-DCORTEXA73 " \
990+
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
991+
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
992+
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
993+
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
994+
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
995+
#define LIBNAME "cortexa73"
996+
#define CORENAME "CORTEXA73"
997+
#else
998+
#endif
999+
1000+
#ifdef FORCE_FALKOR
1001+
#define FORCE
1002+
#define ARCHITECTURE "ARM64"
1003+
#define SUBARCHITECTURE "FALKOR"
1004+
#define SUBDIRNAME "arm64"
1005+
#define ARCHCONFIG "-DFALKOR " \
1006+
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
1007+
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
1008+
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
9611009
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
962-
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
963-
#define LIBNAME "vulcan"
964-
#define CORENAME "VULCAN"
1010+
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
1011+
#define LIBNAME "falkor"
1012+
#define CORENAME "FALKOR"
9651013
#else
9661014
#endif
9671015

@@ -973,13 +1021,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
9731021
#define ARCHCONFIG "-DTHUNDERX " \
9741022
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
9751023
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \
976-
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 "
1024+
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
1025+
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
9771026
#define LIBNAME "thunderx"
9781027
#define CORENAME "THUNDERX"
9791028
#else
9801029
#endif
9811030

9821031
#ifdef FORCE_THUNDERX2T99
1032+
#define ARMV8
9831033
#define FORCE
9841034
#define ARCHITECTURE "ARM64"
9851035
#define SUBARCHITECTURE "THUNDERX2T99"
@@ -990,7 +1040,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
9901040
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
9911041
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
9921042
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
993-
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
1043+
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
9941044
#define LIBNAME "thunderx2t99"
9951045
#define CORENAME "THUNDERX2T99"
9961046
#else

0 commit comments

Comments
 (0)