Skip to content

Commit 74c5549

Browse files
committed
[AMDGPU] Change default loop alignment for GFX9 and higher targets
Align small loops aggressively to 32 bytes and larger loops to 16 bytes
1 parent fb1035c commit 74c5549

File tree

118 files changed

+511
-1351
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

118 files changed

+511
-1351
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1431,6 +1431,14 @@ def FeatureDisable : SubtargetFeature<"",
14311431
"Dummy feature to disable assembler instructions"
14321432
>;
14331433

1434+
// GFX-9 & higher targets have a 16-dword Instruction Buffer and per-SQ
1435+
// instruction store which can supply 4 dwords to each of the 2 waves per
1436+
// cycle. Prefer 32-byte (8-dword) alignment for small loops; larger loops are
// aligned less aggressively (typically 4 dwords or 16 bytes).
1437+
def FeaturePrefLoopAlign32B : SubtargetFeature<"loop-align",
1438+
"PrefLoopAlignmentLog2",
1439+
"5",
1440+
"Prefer 32-byte alignment for loops">;
1441+
14341442
//===----------------------------------------------------------------------===//
14351443

14361444
class GCNSubtargetFeatureGeneration <string Value,
@@ -1495,7 +1503,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
14951503
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
14961504
FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
14971505
FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS,
1498-
FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad
1506+
FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad,
1507+
FeaturePrefLoopAlign32B
14991508
]
15001509
>;
15011510

@@ -1519,7 +1528,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
15191528
FeatureDefaultComponentZero, FeatureMaxHardClauseLength63,
15201529
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
15211530
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
1522-
FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad
1531+
FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, FeaturePrefLoopAlign32B
15231532
]
15241533
>;
15251534

@@ -1542,7 +1551,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
15421551
FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS,
15431552
FeatureDefaultComponentZero, FeatureMaxHardClauseLength32,
15441553
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
1545-
FeatureVmemWriteVgprInOrder
1554+
FeatureVmemWriteVgprInOrder, FeaturePrefLoopAlign32B
15461555
]
15471556
>;
15481557

@@ -1566,7 +1575,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
15661575
FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32,
15671576
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
15681577
FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32,
1569-
FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics
1578+
FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
1579+
FeaturePrefLoopAlign32B
15701580
]
15711581
>;
15721582

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -628,6 +628,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
628628
setMaxAtomicSizeInBitsSupported(64);
629629
setMaxDivRemBitWidthSupported(64);
630630
setMaxLargeFPConvertBitWidthSupported(64);
631+
setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopAlignment()));
631632
}
632633

633634
bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ class AMDGPUSubtarget {
8080
unsigned LocalMemorySize = 0;
8181
unsigned AddressableLocalMemorySize = 0;
8282
char WavefrontSizeLog2 = 0;
83+
unsigned PrefLoopAlignmentLog2 = 0;
8384

8485
public:
8586
AMDGPUSubtarget(Triple TT);
@@ -377,6 +378,8 @@ class AMDGPUSubtarget {
377378
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
378379
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
379380

381+
unsigned getPrefLoopAlignment() const { return PrefLoopAlignmentLog2; }
382+
380383
/// \returns Corresponding DWARF register number mapping flavour for the
381384
/// \p WavefrontSize.
382385
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,14 @@ using namespace llvm::SDPatternMatch;
5353

5454
STATISTIC(NumTailCalls, "Number of tail calls");
5555

56+
static cl::opt<bool> DisableLoopAlignment("amdgpu-disable-loop-alignment",
57+
cl::desc("Do not align loops"),
58+
cl::init(false));
59+
5660
static cl::opt<bool>
57-
DisableLoopAlignment("amdgpu-disable-loop-alignment",
58-
cl::desc("Do not align and prefetch loops"),
59-
cl::init(false));
61+
DisableLoopAlignmentPrefetch("amdgpu-disable-loop-alignment-prefetch",
62+
cl::desc("Do not align and prefetch loops"),
63+
cl::init(false));
6064

6165
static cl::opt<bool> UseDivergentRegisterIndexing(
6266
"amdgpu-use-divergent-register-indexing", cl::Hidden,
@@ -17434,25 +17438,9 @@ Align SITargetLowering::computeKnownAlignForTargetInstr(
1743417438
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
1743517439
const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
1743617440
const Align CacheLineAlign = Align(64);
17437-
17438-
// Pre-GFX10 target did not benefit from loop alignment
17439-
if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
17440-
getSubtarget()->hasInstFwdPrefetchBug())
17441-
return PrefAlign;
17442-
17443-
// On GFX10 I$ is 4 x 64 bytes cache lines.
17444-
// By default prefetcher keeps one cache line behind and reads two ahead.
17445-
// We can modify it with S_INST_PREFETCH for larger loops to have two lines
17446-
// behind and one ahead.
17447-
// Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
17448-
// If loop fits 64 bytes it always spans no more than two cache lines and
17449-
// does not need an alignment.
17450-
// Else if loop is less or equal 128 bytes we do not need to modify prefetch,
17451-
// Else if loop is less or equal 192 bytes we need two lines behind.
17452-
1745317441
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1745417442
const MachineBasicBlock *Header = ML->getHeader();
17455-
if (Header->getAlignment() != PrefAlign)
17443+
if (DisableLoopAlignment || Header->getAlignment() > PrefAlign)
1745617444
return Header->getAlignment(); // Already processed.
1745717445

1745817446
unsigned LoopSize = 0;
@@ -17465,10 +17453,41 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
1746517453
for (const MachineInstr &MI : *MBB) {
1746617454
LoopSize += TII->getInstSizeInBytes(MI);
1746717455
if (LoopSize > 192)
17468-
return PrefAlign;
17456+
break;
1746917457
}
1747017458
}
1747117459

17460+
// Pre-GFX10 targets did not benefit from loop alignment driven by prefetch
17461+
// considerations
17462+
if (!ML || DisableLoopAlignmentPrefetch ||
17463+
!getSubtarget()->hasInstPrefetch() ||
17464+
getSubtarget()->hasInstFwdPrefetchBug()) {
17465+
// Align loops <= 32 bytes aggressively
17466+
if (LoopSize <= 32)
17467+
return PrefAlign;
17468+
// Align larger loops less aggressively
17469+
if (!ML->isInnermost())
17470+
return Header->getAlignment();
17471+
return (PrefAlign.value() > 1) ? Align(PrefAlign.value() >> 1) : PrefAlign;
17472+
}
17473+
17474+
// On GFX10 I$ is 4 x 64 bytes cache lines.
17475+
// By default prefetcher keeps one cache line behind and reads two ahead.
17476+
// We can modify it with S_INST_PREFETCH for larger loops to have two lines
17477+
// behind and one ahead.
17478+
// Therefore we can benefit from aligning loop headers if loop fits 192 bytes.
17479+
// If loop fits 64 bytes it always spans no more than two cache lines and
17480+
// does not need an alignment.
17481+
// Else if loop is less or equal 128 bytes we do not need to modify prefetch,
17482+
// Else if loop is less or equal 192 bytes we need two lines behind.
17483+
17484+
// Align larger loops less aggressively
17485+
if (LoopSize > 192) {
17486+
if (!ML->isInnermost())
17487+
return Header->getAlignment();
17488+
return (PrefAlign.value() > 1) ? Align(PrefAlign.value() >> 1) : PrefAlign;
17489+
}
17490+
1747217491
if (LoopSize <= 64)
1747317492
return PrefAlign;
1747417493

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
5-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
7-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
2+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
4+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
5+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
7+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
88
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
99
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
1010

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
5-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
7-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
2+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
4+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
5+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
7+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
88
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
99
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
1010

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
2+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
33

44
; Divergent phis that don't require lowering using lane mask merging
55

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
2+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
33

44
; This file contains various tests that have divergent i1s used outside of
55
; the loop. These are lane masks in sgpr and need to have correct value in

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
2+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
33

44
; Simplest case, if - then, that requires lane mask merging,
55
; %phi lane mask will hold %val_A at %A. Lanes that are active in %B

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
2+
; RUN: llc -amdgpu-disable-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
33

44
define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
55
; GFX10-LABEL: temporal_divergent_i1_phi:

0 commit comments

Comments
 (0)