llvm
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPU.td‎
Lines changed: 14 additions & 4 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPU.td‎
Lines changed: 14 additions & 4 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp‎
Lines changed: 1 addition & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h‎
Lines changed: 3 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp‎
Lines changed: 40 additions & 21 deletions b/‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp‎
Lines changed: 40 additions & 21 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll‎
Lines changed: 6 additions & 6 deletions b/‎llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll‎
Lines changed: 6 additions & 6 deletions b/‎llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll‎
Lines changed: 1 addition & 1 deletion b/‎llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll‎
Lines changed: 1 addition & 1 deletion b/‎llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll‎
Lines changed: 1 addition & 1 deletion b/‎llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll‎
Lines changed: 1 addition & 1 deletion b/‎llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll‎
Lines changed: 1 addition & 1 deletion
@@ -1431,6 +1431,14 @@ def FeatureDisable : SubtargetFeature<"",
   "Dummy feature to disable assembler instructions"
 >;
 
+// GFX9 and later targets have a 16-dword instruction buffer and a per-SQ
+// instruction store which can supply 4 dwords to each of the 2 waves per
+// cycle. Prefer 32-byte (8-dword) loop alignment; the value is log2(bytes).
+def FeaturePrefLoopAlign32B : SubtargetFeature<"loop-align",
+  "PrefLoopAlignmentLog2",
+  "5",
+  "Prefer 32-byte alignment for loops">;
+
 //===----------------------------------------------------------------------===//
 
 class GCNSubtargetFeatureGeneration <string Value,
@@ -1495,7 +1503,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
    FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
    FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS,
-   FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad
+   FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad,
+   FeaturePrefLoopAlign32B
   ]
 >;
 
@@ -1519,7 +1528,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
    FeatureDefaultComponentZero, FeatureMaxHardClauseLength63,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
    FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
-   FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad
+   FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, FeaturePrefLoopAlign32B
   ]
 >;
 
@@ -1542,7 +1551,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
    FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS,
    FeatureDefaultComponentZero, FeatureMaxHardClauseLength32,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
-   FeatureVmemWriteVgprInOrder
+   FeatureVmemWriteVgprInOrder, FeaturePrefLoopAlign32B
   ]
 >;
 
@@ -1566,7 +1575,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
    FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
    FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32,
-   FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeaturePrefLoopAlign32B
   ]
 >;
 
 
@@ -628,6 +628,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setMaxAtomicSizeInBitsSupported(64);
   setMaxDivRemBitWidthSupported(64);
   setMaxLargeFPConvertBitWidthSupported(64);
+  setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopAlignment()));
 }
 
 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
 
@@ -80,6 +80,7 @@ class AMDGPUSubtarget {
   unsigned LocalMemorySize = 0;
   unsigned AddressableLocalMemorySize = 0;
   char WavefrontSizeLog2 = 0;
+  unsigned PrefLoopAlignmentLog2 = 0;
 
 public:
   AMDGPUSubtarget(Triple TT);
@@ -377,6 +378,8 @@ class AMDGPUSubtarget {
   uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
   unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
 
+  unsigned getPrefLoopAlignment() const { return PrefLoopAlignmentLog2; }
+
   /// \returns Corresponding DWARF register number mapping flavour for the
   /// \p WavefrontSize.
   AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
 
@@ -53,10 +53,14 @@ using namespace llvm::SDPatternMatch;
 
 STATISTIC(NumTailCalls, "Number of tail calls");
 
+static cl::opt<bool> DisableLoopAlignment("amdgpu-disable-loop-alignment",
+                                          cl::desc("Do not align loops"),
+                                          cl::init(false));
+
 static cl::opt<bool>
-    DisableLoopAlignment("amdgpu-disable-loop-alignment",
-                         cl::desc("Do not align and prefetch loops"),
-                         cl::init(false));
+    DisableLoopAlignmentPrefetch("amdgpu-disable-loop-alignment-prefetch",
+                                 cl::desc("Do not align and prefetch loops"),
+                                 cl::init(false));
 
 static cl::opt<bool> UseDivergentRegisterIndexing(
     "amdgpu-use-divergent-register-indexing", cl::Hidden,
@@ -17434,25 +17438,9 @@ Align SITargetLowering::computeKnownAlignForTargetInstr(
 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
   const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
   const Align CacheLineAlign = Align(64);
-
-  // Pre-GFX10 target did not benefit from loop alignment
-  if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
-      getSubtarget()->hasInstFwdPrefetchBug())
-    return PrefAlign;
-
-  // On GFX10 I$ is 4 x 64 bytes cache lines.
-  // By default prefetcher keeps one cache line behind and reads two ahead.
-  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
-  // behind and one ahead.
-  // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
-  // If loop fits 64 bytes it always spans no more than two cache lines and
-  // does not need an alignment.
-  // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
-  // Else if loop is less or equal 192 bytes we need two lines behind.
-
   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
   const MachineBasicBlock *Header = ML->getHeader();
-  if (Header->getAlignment() != PrefAlign)
+  if (DisableLoopAlignment || Header->getAlignment() > PrefAlign)
     return Header->getAlignment(); // Already processed.
 
   unsigned LoopSize = 0;
@@ -17465,10 +17453,41 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
     for (const MachineInstr &MI : *MBB) {
       LoopSize += TII->getInstSizeInBytes(MI);
       if (LoopSize > 192)
-        return PrefAlign;
+        break;
     }
   }
 
+  // Pre-GFX10 targets do not benefit from prefetch-driven loop alignment.
+  // NOTE(review): ML is dereferenced above, so the !ML test comes too late.
+  if (!ML || DisableLoopAlignmentPrefetch ||
+      !getSubtarget()->hasInstPrefetch() ||
+      getSubtarget()->hasInstFwdPrefetchBug()) {
+    // Align loops of at most 32 bytes aggressively
+    if (LoopSize <= 32)
+      return PrefAlign;
+    // Align larger loops less aggressively
+    if (!ML->isInnermost())
+      return Header->getAlignment();
+    return (PrefAlign.value() > 1) ? Align(PrefAlign.value() >> 1) : PrefAlign;
+  }
+
+  // On GFX10 the I$ has 4 x 64-byte cache lines.
+  // By default the prefetcher keeps one cache line behind and reads two ahead.
+  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
+  // behind and one ahead.
+  // Therefore we can benefit from aligning loop headers if a loop fits in 192
+  // bytes. If a loop fits in 64 bytes it always spans no more than two cache
+  // lines and does not need an alignment.
+  // Else if the loop is at most 128 bytes we do not need to modify prefetch,
+  // else if the loop is at most 192 bytes we need two lines behind.
+
+  // Align larger loops less aggressively
+  if (LoopSize > 192) {
+    if (!ML->isInnermost())
+      return Header->getAlignment();
+    return (PrefAlign.value() > 1) ? Align(PrefAlign.value() >> 1) : PrefAlign;
+  }
+
   if (LoopSize <= 64)
     return PrefAlign;
 
 
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
 
 
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
 
 
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
 
 ; Divergent phis that don't require lowering using lane mask merging
 
 
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 
 ; This file contains various tests that have divergent i1s used outside of
 ; the loop. These are lane masks is sgpr and need to have correct value in
 
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
 
 ; Simples case, if - then, that requires lane mask merging,
 ; %phi lane mask will hold %val_A at %A. Lanes that are active in %B
 
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
 
 define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
 ; GFX10-LABEL: temporal_divergent_i1_phi:
Original file line number	Diff line number	Diff line change
`@@ -628,6 +628,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,`
`628`	`628`	`setMaxAtomicSizeInBitsSupported(64);`
`629`	`629`	`setMaxDivRemBitWidthSupported(64);`
`630`	`630`	`setMaxLargeFPConvertBitWidthSupported(64);`
	`631`	`+ setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopAlignment()));`
`631`	`632`	`}`
`632`	`633`
`633`	`634`	`bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {`