joker-eph
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
Lines changed: 38 additions & 21 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
Lines changed: 5 additions & 1 deletion
diff --git a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
Lines changed: 8 additions & 12 deletions
diff --git a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
Lines changed: 6 additions & 12 deletions
diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
Lines changed: 1 addition & 2 deletions
@@ -295,6 +295,10 @@ def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
     "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
     "Disable latency scheduling heuristic">;
 
+def FeatureStorePairSuppress : SubtargetFeature<
+    "store-pair-suppress", "EnableStorePairSuppress", "true",
+    "Enable Store Pair Suppression heuristics">;
+
 def FeatureForce32BitJumpTables
    : SubtargetFeature<"force-32bit-jump-tables", "Force32BitJumpTables", "true",
                       "Force jump table entries to be 32-bits wide except at MinSize">;
@@ -952,8 +956,8 @@ def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
                                  FeaturePostRAScheduler,
                                  FeatureAggressiveFMA,
                                  FeatureArithmeticBccFusion,
-                                 FeaturePredictableSelectIsExpensive
-                                 ]>;
+                                 FeatureStorePairSuppress,
+                                 FeaturePredictableSelectIsExpensive]>;
 
 def TuneCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel",
                                   "Nvidia Carmel processors">;
@@ -967,10 +971,10 @@ def TuneAppleA7  : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
                                     FeatureArithmeticCbzFusion,
                                     FeatureDisableLatencySchedHeuristic,
                                     FeatureFuseAES, FeatureFuseCryptoEOR,
+                                    FeatureStorePairSuppress,
                                     FeatureZCRegMove,
                                     FeatureZCZeroing,
-                                    FeatureZCZeroingFPWorkaround]
-                                    >;
+                                    FeatureZCZeroingFPWorkaround]>;
 
 def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
                                     "Apple A10", [
@@ -980,9 +984,9 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
                                     FeatureDisableLatencySchedHeuristic,
                                     FeatureFuseAES,
                                     FeatureFuseCryptoEOR,
+                                    FeatureStorePairSuppress,
                                     FeatureZCRegMove,
-                                    FeatureZCZeroing]
-                                    >;
+                                    FeatureZCZeroing]>;
 
 def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
                                     "Apple A11", [
@@ -992,9 +996,9 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
                                     FeatureDisableLatencySchedHeuristic,
                                     FeatureFuseAES,
                                     FeatureFuseCryptoEOR,
+                                    FeatureStorePairSuppress,
                                     FeatureZCRegMove,
-                                    FeatureZCZeroing]
-                                    >;
+                                    FeatureZCZeroing]>;
 
 def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
                                     "Apple A12", [
@@ -1004,9 +1008,9 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
                                     FeatureDisableLatencySchedHeuristic,
                                     FeatureFuseAES,
                                     FeatureFuseCryptoEOR,
+                                    FeatureStorePairSuppress,
                                     FeatureZCRegMove,
-                                    FeatureZCZeroing]
-                                    >;
+                                    FeatureZCZeroing]>;
 
 def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
                                     "Apple A13", [
@@ -1016,9 +1020,9 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
                                     FeatureDisableLatencySchedHeuristic,
                                     FeatureFuseAES,
                                     FeatureFuseCryptoEOR,
+                                    FeatureStorePairSuppress,
                                     FeatureZCRegMove,
-                                    FeatureZCZeroing]
-                                    >;
+                                    FeatureZCZeroing]>;
 
 def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
                                     "Apple A14", [
@@ -1034,6 +1038,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseAdrpAdd,
                                     FeatureFuseLiterals,
+                                    FeatureStorePairSuppress,
                                     FeatureZCRegMove,
                                     FeatureZCZeroing]>;
 
@@ -1049,9 +1054,9 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
                                     FeatureFuseCCSelect,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
+                                    FeatureStorePairSuppress,
                                     FeatureZCRegMove,
-                                    FeatureZCZeroing
-                                    ]>;
+                                    FeatureZCZeroing]>;
 
 def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
                                     "Apple A16", [
@@ -1065,9 +1070,9 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
                                     FeatureFuseCCSelect,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
+                                    FeatureStorePairSuppress,
                                     FeatureZCRegMove,
-                                    FeatureZCZeroing
-                                    ]>;
+                                    FeatureZCZeroing]>;
 
 def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                     "Samsung Exynos-M3 processors",
@@ -1078,6 +1083,7 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                      FeatureFuseCCSelect,
                                      FeatureFuseAdrpAdd,
                                      FeatureFuseLiterals,
+                                     FeatureStorePairSuppress,
                                      FeatureAddrLSLFast,
                                      FeatureALULSLFast,
                                      FeaturePostRAScheduler,
@@ -1096,6 +1102,7 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
                                      FeatureFuseCCSelect,
                                      FeatureFuseAdrpAdd,
                                      FeatureFuseLiterals,
+                                     FeatureStorePairSuppress,
                                      FeatureAddrLSLFast,
                                      FeatureALULSLFast,
                                      FeaturePostRAScheduler,
@@ -1107,18 +1114,18 @@ def TuneKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureZCZeroing,
                                    FeatureAddrLSLFast,
-                                   FeatureALULSLFast]
-                                   >;
+                                   FeatureALULSLFast,
+                                   FeatureStorePairSuppress]>;
 
 def TuneFalkor  : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
                                    "Qualcomm Falkor processors", [
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureZCZeroing,
+                                   FeatureStorePairSuppress,
                                    FeatureAddrLSLFast,
                                    FeatureALULSLFast,
-                                   FeatureSlowSTRQro
-                                   ]>;
+                                   FeatureSlowSTRQro]>;
 
 def TuneNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", "NeoverseE1",
                                       "Neoverse E1 ARM processors", [
@@ -1182,6 +1189,7 @@ def TuneSaphira  : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureZCZeroing,
+                                   FeatureStorePairSuppress,
                                    FeatureAddrLSLFast,
                                    FeatureALULSLFast]>;
 
@@ -1190,6 +1198,7 @@ def TuneThunderX2T99  : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "Thund
                                           FeatureAggressiveFMA,
                                           FeatureArithmeticBccFusion,
                                           FeaturePostRAScheduler,
+                                          FeatureStorePairSuppress,
                                           FeaturePredictableSelectIsExpensive]>;
 
 def TuneThunderX3T110  : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
@@ -1200,34 +1209,40 @@ def TuneThunderX3T110  : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
                                            FeaturePostRAScheduler,
                                            FeaturePredictableSelectIsExpensive,
                                            FeatureBalanceFPOps,
+                                           FeatureStorePairSuppress,
                                            FeatureStrictAlign]>;
 
 def TuneThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
                                     "Cavium ThunderX processors", [
                                     FeaturePostRAScheduler,
+                                    FeatureStorePairSuppress,
                                     FeaturePredictableSelectIsExpensive]>;
 
 def TuneThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily",
                                        "ThunderXT88",
                                        "Cavium ThunderX processors", [
                                        FeaturePostRAScheduler,
+                                       FeatureStorePairSuppress,
                                        FeaturePredictableSelectIsExpensive]>;
 
 def TuneThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily",
                                        "ThunderXT81",
                                        "Cavium ThunderX processors", [
                                        FeaturePostRAScheduler,
+                                       FeatureStorePairSuppress,
                                        FeaturePredictableSelectIsExpensive]>;
 
 def TuneThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
                                        "ThunderXT83",
                                        "Cavium ThunderX processors", [
                                        FeaturePostRAScheduler,
+                                       FeatureStorePairSuppress,
                                        FeaturePredictableSelectIsExpensive]>;
 
 def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
                                   "HiSilicon TS-V110 processors", [
                                   FeatureFuseAES,
+                                  FeatureStorePairSuppress,
                                   FeaturePostRAScheduler]>;
 
 def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
@@ -1241,7 +1256,8 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
                                    FeatureCmpBccFusion,
                                    FeatureFuseAddress,
                                    FeatureFuseLiterals,
-			           FeatureLdpAlignedOnly,
+                                   FeatureStorePairSuppress,
+                                   FeatureLdpAlignedOnly,
                                    FeatureStpAlignedOnly]>;
 
 def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
@@ -1256,6 +1272,7 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
                                     FeatureFuseAddress,
                                     FeatureFuseLiterals,
                                     FeatureFuseLiterals,
+                                    FeatureStorePairSuppress,
                                     FeatureLdpAlignedOnly,
                                     FeatureStpAlignedOnly]>;
 
 
@@ -11,6 +11,7 @@
 // ===---------------------------------------------------------------------===//
 
 #include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -122,7 +123,10 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()) || MF.getFunction().hasOptSize())
     return false;
 
-  const TargetSubtargetInfo &ST = MF.getSubtarget();
+  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
+  if (!ST.enableStorePairSuppress())
+    return false;
+
   TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
   TRI = ST.getRegisterInfo();
   MRI = &MF.getRegInfo();
 
@@ -1,13 +1,13 @@
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-BALFP --check-prefix CHECK-EVEN
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-BALFP --check-prefix CHECK-ODD
-; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-EVEN
-; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
 
 ; The following tests use the balance-fp-ops feature, and should be independent of
 ; the target cpu.
 
-; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN --check-prefix CHECK-BALFP
-; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops  -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD --check-prefix CHECK-BALFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops  -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
 
 ; Test the AArch64A57FPLoadBalancing pass. This pass relies heavily on register allocation, so
 ; our test strategy is to:
@@ -81,9 +81,7 @@ entry:
 ; CHECK: fmsub [[x]]
 ; CHECK: fmadd [[y]]
 ; CHECK: fmadd [[x]]
-; CHECK-BALFP: stp [[x]], [[y]]
-; CHECK-A53-DAG: str [[x]]
-; CHECK-A53-DAG: str [[y]]
+; CHECK: stp [[x]], [[y]]
 
 define void @f2(ptr nocapture readonly %p, ptr nocapture %q) #0 {
 entry:
@@ -176,9 +174,7 @@ declare void @g(...) #1
 ; CHECK: fmsub [[x]]
 ; CHECK: fmadd [[y]]
 ; CHECK: fmadd [[x]]
-; CHECK-BALFP: stp [[x]], [[y]]
-; CHECK-A53-DAG: str [[x]]
-; CHECK-A53-DAG: str [[y]]
+; CHECK: stp [[x]], [[y]]
 
 define void @f4(ptr nocapture readonly %p, ptr nocapture %q) #0 {
 entry:
 
@@ -385,12 +385,9 @@ define void @caller_in_block() {
 ; CHECK-NEXT:    bl return_in_block
 ; CHECK-NEXT:    adrp x8, in_block_store
 ; CHECK-NEXT:    add x8, x8, :lo12:in_block_store
-; CHECK-NEXT:    str d0, [x8]
-; CHECK-NEXT:    str d1, [x8, #8]
-; CHECK-NEXT:    str d2, [x8, #16]
-; CHECK-NEXT:    str d3, [x8, #24]
-; CHECK-NEXT:    str d4, [x8, #32]
-; CHECK-NEXT:    str d5, [x8, #40]
+; CHECK-NEXT:    stp d0, d1, [x8]
+; CHECK-NEXT:    stp d2, d3, [x8, #16]
+; CHECK-NEXT:    stp d4, d5, [x8, #32]
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %1 = call %T_IN_BLOCK @return_in_block()
@@ -403,12 +400,9 @@ define void @callee_in_block(%T_IN_BLOCK %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, in_block_store
 ; CHECK-NEXT:    add x8, x8, :lo12:in_block_store
-; CHECK-NEXT:    str d5, [x8, #40]
-; CHECK-NEXT:    str d4, [x8, #32]
-; CHECK-NEXT:    str d3, [x8, #24]
-; CHECK-NEXT:    str d2, [x8, #16]
-; CHECK-NEXT:    str d1, [x8, #8]
-; CHECK-NEXT:    str d0, [x8]
+; CHECK-NEXT:    stp d4, d5, [x8, #32]
+; CHECK-NEXT:    stp d2, d3, [x8, #16]
+; CHECK-NEXT:    stp d0, d1, [x8]
 ; CHECK-NEXT:    ret
   store %T_IN_BLOCK %a, ptr @in_block_store
   ret void
 
@@ -152,8 +152,7 @@ define void @call_copy_pod() {
 ; CHECK-NEXT:    add x19, x19, :lo12:Pod
 ; CHECK-NEXT:    mov x0, x19
 ; CHECK-NEXT:    bl copy_pod
-; CHECK-NEXT:    str d0, [x19]
-; CHECK-NEXT:    str d1, [x19, #8]
+; CHECK-NEXT:    stp d0, d1, [x19]
 ; CHECK-NEXT:    .seh_startepilogue
 ; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_reg x30, 8