Skip to content

Commit f71ad19

Browse files
committed
[AArch64] Add a target feature for AArch64StorePairSuppress
The AArch64StorePairSuppress pass prevents the creation of STP instructions under some heuristics. Unfortunately, it often prevents the creation of STP in cases where it is obviously beneficial, and suppressing STP in this way doesn't match my understanding of scheduling or CPU pipelining. From some benchmarking, even on an in-order CPU, where scheduling matters most, I don't see it giving better results. In general, the lower instruction count from using STP would be expected to give a slightly better cycle count. As the pass specifically mentions the Cyclone CPU, this patch adds a target feature, FeatureStorePairSuppress, which is enabled for all the non-Arm CPUs. This has the effect of disabling the pass for all Arm CPUs. Differential Revision: https://reviews.llvm.org/D134646
1 parent f99bd29 commit f71ad19

File tree

8 files changed

+219
-192
lines changed

8 files changed

+219
-192
lines changed

llvm/lib/Target/AArch64/AArch64.td

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,10 @@ def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
295295
"disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
296296
"Disable latency scheduling heuristic">;
297297

298+
def FeatureStorePairSuppress : SubtargetFeature<
299+
"store-pair-suppress", "EnableStorePairSuppress", "true",
300+
"Enable Store Pair Suppression heuristics">;
301+
298302
def FeatureForce32BitJumpTables
299303
: SubtargetFeature<"force-32bit-jump-tables", "Force32BitJumpTables", "true",
300304
"Force jump table entries to be 32-bits wide except at MinSize">;
@@ -952,8 +956,8 @@ def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
952956
FeaturePostRAScheduler,
953957
FeatureAggressiveFMA,
954958
FeatureArithmeticBccFusion,
955-
FeaturePredictableSelectIsExpensive
956-
]>;
959+
FeatureStorePairSuppress,
960+
FeaturePredictableSelectIsExpensive]>;
957961

958962
def TuneCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel",
959963
"Nvidia Carmel processors">;
@@ -967,10 +971,10 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
967971
FeatureArithmeticCbzFusion,
968972
FeatureDisableLatencySchedHeuristic,
969973
FeatureFuseAES, FeatureFuseCryptoEOR,
974+
FeatureStorePairSuppress,
970975
FeatureZCRegMove,
971976
FeatureZCZeroing,
972-
FeatureZCZeroingFPWorkaround]
973-
>;
977+
FeatureZCZeroingFPWorkaround]>;
974978

975979
def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
976980
"Apple A10", [
@@ -980,9 +984,9 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
980984
FeatureDisableLatencySchedHeuristic,
981985
FeatureFuseAES,
982986
FeatureFuseCryptoEOR,
987+
FeatureStorePairSuppress,
983988
FeatureZCRegMove,
984-
FeatureZCZeroing]
985-
>;
989+
FeatureZCZeroing]>;
986990

987991
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
988992
"Apple A11", [
@@ -992,9 +996,9 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
992996
FeatureDisableLatencySchedHeuristic,
993997
FeatureFuseAES,
994998
FeatureFuseCryptoEOR,
999+
FeatureStorePairSuppress,
9951000
FeatureZCRegMove,
996-
FeatureZCZeroing]
997-
>;
1001+
FeatureZCZeroing]>;
9981002

9991003
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
10001004
"Apple A12", [
@@ -1004,9 +1008,9 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
10041008
FeatureDisableLatencySchedHeuristic,
10051009
FeatureFuseAES,
10061010
FeatureFuseCryptoEOR,
1011+
FeatureStorePairSuppress,
10071012
FeatureZCRegMove,
1008-
FeatureZCZeroing]
1009-
>;
1013+
FeatureZCZeroing]>;
10101014

10111015
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
10121016
"Apple A13", [
@@ -1016,9 +1020,9 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
10161020
FeatureDisableLatencySchedHeuristic,
10171021
FeatureFuseAES,
10181022
FeatureFuseCryptoEOR,
1023+
FeatureStorePairSuppress,
10191024
FeatureZCRegMove,
1020-
FeatureZCZeroing]
1021-
>;
1025+
FeatureZCZeroing]>;
10221026

10231027
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
10241028
"Apple A14", [
@@ -1034,6 +1038,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
10341038
FeatureFuseCryptoEOR,
10351039
FeatureFuseAdrpAdd,
10361040
FeatureFuseLiterals,
1041+
FeatureStorePairSuppress,
10371042
FeatureZCRegMove,
10381043
FeatureZCZeroing]>;
10391044

@@ -1049,9 +1054,9 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
10491054
FeatureFuseCCSelect,
10501055
FeatureFuseCryptoEOR,
10511056
FeatureFuseLiterals,
1057+
FeatureStorePairSuppress,
10521058
FeatureZCRegMove,
1053-
FeatureZCZeroing
1054-
]>;
1059+
FeatureZCZeroing]>;
10551060

10561061
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
10571062
"Apple A16", [
@@ -1065,9 +1070,9 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
10651070
FeatureFuseCCSelect,
10661071
FeatureFuseCryptoEOR,
10671072
FeatureFuseLiterals,
1073+
FeatureStorePairSuppress,
10681074
FeatureZCRegMove,
1069-
FeatureZCZeroing
1070-
]>;
1075+
FeatureZCZeroing]>;
10711076

10721077
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
10731078
"Samsung Exynos-M3 processors",
@@ -1078,6 +1083,7 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
10781083
FeatureFuseCCSelect,
10791084
FeatureFuseAdrpAdd,
10801085
FeatureFuseLiterals,
1086+
FeatureStorePairSuppress,
10811087
FeatureAddrLSLFast,
10821088
FeatureALULSLFast,
10831089
FeaturePostRAScheduler,
@@ -1096,6 +1102,7 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
10961102
FeatureFuseCCSelect,
10971103
FeatureFuseAdrpAdd,
10981104
FeatureFuseLiterals,
1105+
FeatureStorePairSuppress,
10991106
FeatureAddrLSLFast,
11001107
FeatureALULSLFast,
11011108
FeaturePostRAScheduler,
@@ -1107,18 +1114,18 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
11071114
FeaturePredictableSelectIsExpensive,
11081115
FeatureZCZeroing,
11091116
FeatureAddrLSLFast,
1110-
FeatureALULSLFast]
1111-
>;
1117+
FeatureALULSLFast,
1118+
FeatureStorePairSuppress]>;
11121119

11131120
def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
11141121
"Qualcomm Falkor processors", [
11151122
FeaturePostRAScheduler,
11161123
FeaturePredictableSelectIsExpensive,
11171124
FeatureZCZeroing,
1125+
FeatureStorePairSuppress,
11181126
FeatureAddrLSLFast,
11191127
FeatureALULSLFast,
1120-
FeatureSlowSTRQro
1121-
]>;
1128+
FeatureSlowSTRQro]>;
11221129

11231130
def TuneNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", "NeoverseE1",
11241131
"Neoverse E1 ARM processors", [
@@ -1182,6 +1189,7 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
11821189
FeaturePostRAScheduler,
11831190
FeaturePredictableSelectIsExpensive,
11841191
FeatureZCZeroing,
1192+
FeatureStorePairSuppress,
11851193
FeatureAddrLSLFast,
11861194
FeatureALULSLFast]>;
11871195

@@ -1190,6 +1198,7 @@ def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "Thund
11901198
FeatureAggressiveFMA,
11911199
FeatureArithmeticBccFusion,
11921200
FeaturePostRAScheduler,
1201+
FeatureStorePairSuppress,
11931202
FeaturePredictableSelectIsExpensive]>;
11941203

11951204
def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
@@ -1200,34 +1209,40 @@ def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
12001209
FeaturePostRAScheduler,
12011210
FeaturePredictableSelectIsExpensive,
12021211
FeatureBalanceFPOps,
1212+
FeatureStorePairSuppress,
12031213
FeatureStrictAlign]>;
12041214

12051215
def TuneThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
12061216
"Cavium ThunderX processors", [
12071217
FeaturePostRAScheduler,
1218+
FeatureStorePairSuppress,
12081219
FeaturePredictableSelectIsExpensive]>;
12091220

12101221
def TuneThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily",
12111222
"ThunderXT88",
12121223
"Cavium ThunderX processors", [
12131224
FeaturePostRAScheduler,
1225+
FeatureStorePairSuppress,
12141226
FeaturePredictableSelectIsExpensive]>;
12151227

12161228
def TuneThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily",
12171229
"ThunderXT81",
12181230
"Cavium ThunderX processors", [
12191231
FeaturePostRAScheduler,
1232+
FeatureStorePairSuppress,
12201233
FeaturePredictableSelectIsExpensive]>;
12211234

12221235
def TuneThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
12231236
"ThunderXT83",
12241237
"Cavium ThunderX processors", [
12251238
FeaturePostRAScheduler,
1239+
FeatureStorePairSuppress,
12261240
FeaturePredictableSelectIsExpensive]>;
12271241

12281242
def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
12291243
"HiSilicon TS-V110 processors", [
12301244
FeatureFuseAES,
1245+
FeatureStorePairSuppress,
12311246
FeaturePostRAScheduler]>;
12321247

12331248
def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
@@ -1241,7 +1256,8 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
12411256
FeatureCmpBccFusion,
12421257
FeatureFuseAddress,
12431258
FeatureFuseLiterals,
1244-
FeatureLdpAlignedOnly,
1259+
FeatureStorePairSuppress,
1260+
FeatureLdpAlignedOnly,
12451261
FeatureStpAlignedOnly]>;
12461262

12471263
def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
@@ -1256,6 +1272,7 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
12561272
FeatureFuseAddress,
12571273
FeatureFuseLiterals,
12581274
FeatureFuseLiterals,
1275+
FeatureStorePairSuppress,
12591276
FeatureLdpAlignedOnly,
12601277
FeatureStpAlignedOnly]>;
12611278

llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
// ===---------------------------------------------------------------------===//
1212

1313
#include "AArch64InstrInfo.h"
14+
#include "AArch64Subtarget.h"
1415
#include "llvm/CodeGen/MachineFunction.h"
1516
#include "llvm/CodeGen/MachineFunctionPass.h"
1617
#include "llvm/CodeGen/MachineInstr.h"
@@ -122,7 +123,10 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
122123
if (skipFunction(MF.getFunction()) || MF.getFunction().hasOptSize())
123124
return false;
124125

125-
const TargetSubtargetInfo &ST = MF.getSubtarget();
126+
const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
127+
if (!ST.enableStorePairSuppress())
128+
return false;
129+
126130
TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
127131
TRI = ST.getRegisterInfo();
128132
MRI = &MF.getRegInfo();

llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-BALFP --check-prefix CHECK-EVEN
2-
; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-BALFP --check-prefix CHECK-ODD
3-
; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-EVEN
4-
; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-ODD
1+
; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
2+
; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
3+
; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
4+
; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
55

66
; The following tests use the balance-fp-ops feature, and should be independent of
77
; the target cpu.
88

9-
; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN --check-prefix CHECK-BALFP
10-
; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD --check-prefix CHECK-BALFP
9+
; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
10+
; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
1111

1212
; Test the AArch64A57FPLoadBalancing pass. This pass relies heavily on register allocation, so
1313
; our test strategy is to:
@@ -81,9 +81,7 @@ entry:
8181
; CHECK: fmsub [[x]]
8282
; CHECK: fmadd [[y]]
8383
; CHECK: fmadd [[x]]
84-
; CHECK-BALFP: stp [[x]], [[y]]
85-
; CHECK-A53-DAG: str [[x]]
86-
; CHECK-A53-DAG: str [[y]]
84+
; CHECK: stp [[x]], [[y]]
8785

8886
define void @f2(ptr nocapture readonly %p, ptr nocapture %q) #0 {
8987
entry:
@@ -176,9 +174,7 @@ declare void @g(...) #1
176174
; CHECK: fmsub [[x]]
177175
; CHECK: fmadd [[y]]
178176
; CHECK: fmadd [[x]]
179-
; CHECK-BALFP: stp [[x]], [[y]]
180-
; CHECK-A53-DAG: str [[x]]
181-
; CHECK-A53-DAG: str [[y]]
177+
; CHECK: stp [[x]], [[y]]
182178

183179
define void @f4(ptr nocapture readonly %p, ptr nocapture %q) #0 {
184180
entry:

llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -385,12 +385,9 @@ define void @caller_in_block() {
385385
; CHECK-NEXT: bl return_in_block
386386
; CHECK-NEXT: adrp x8, in_block_store
387387
; CHECK-NEXT: add x8, x8, :lo12:in_block_store
388-
; CHECK-NEXT: str d0, [x8]
389-
; CHECK-NEXT: str d1, [x8, #8]
390-
; CHECK-NEXT: str d2, [x8, #16]
391-
; CHECK-NEXT: str d3, [x8, #24]
392-
; CHECK-NEXT: str d4, [x8, #32]
393-
; CHECK-NEXT: str d5, [x8, #40]
388+
; CHECK-NEXT: stp d0, d1, [x8]
389+
; CHECK-NEXT: stp d2, d3, [x8, #16]
390+
; CHECK-NEXT: stp d4, d5, [x8, #32]
394391
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
395392
; CHECK-NEXT: ret
396393
%1 = call %T_IN_BLOCK @return_in_block()
@@ -403,12 +400,9 @@ define void @callee_in_block(%T_IN_BLOCK %a) {
403400
; CHECK: // %bb.0:
404401
; CHECK-NEXT: adrp x8, in_block_store
405402
; CHECK-NEXT: add x8, x8, :lo12:in_block_store
406-
; CHECK-NEXT: str d5, [x8, #40]
407-
; CHECK-NEXT: str d4, [x8, #32]
408-
; CHECK-NEXT: str d3, [x8, #24]
409-
; CHECK-NEXT: str d2, [x8, #16]
410-
; CHECK-NEXT: str d1, [x8, #8]
411-
; CHECK-NEXT: str d0, [x8]
403+
; CHECK-NEXT: stp d4, d5, [x8, #32]
404+
; CHECK-NEXT: stp d2, d3, [x8, #16]
405+
; CHECK-NEXT: stp d0, d1, [x8]
412406
; CHECK-NEXT: ret
413407
store %T_IN_BLOCK %a, ptr @in_block_store
414408
ret void

llvm/test/CodeGen/AArch64/arm64-windows-calls.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,7 @@ define void @call_copy_pod() {
152152
; CHECK-NEXT: add x19, x19, :lo12:Pod
153153
; CHECK-NEXT: mov x0, x19
154154
; CHECK-NEXT: bl copy_pod
155-
; CHECK-NEXT: str d0, [x19]
156-
; CHECK-NEXT: str d1, [x19, #8]
155+
; CHECK-NEXT: stp d0, d1, [x19]
157156
; CHECK-NEXT: .seh_startepilogue
158157
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
159158
; CHECK-NEXT: .seh_save_reg x30, 8

0 commit comments

Comments
 (0)