-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[AMDGPU] Change default loop alignment for GFX9 and higher targets #153065
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
74c5549
cfa627e
676038c
525ccd4
82b6998
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1431,6 +1431,14 @@ def FeatureDisable : SubtargetFeature<"", | |
| "Dummy feature to disable assembler instructions" | ||
| >; | ||
|
|
||
| // GFX-9 & higher targets have a 16-dword Instruction Buffer and per-SQ | ||
| // instruction store which can supply 4 dwords to each of the 2 waves per | ||
| // cycle. Change default alignment to 4 dwords or 16 bytes. | ||
| def FeaturePrefLoopAlign32B : SubtargetFeature<"loop-align", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Include the value in the feature name |
||
| "PrefLoopAlignmentLog2", | ||
| "5", | ||
| "Prefer 32-byte alignment for loops">; | ||
|
|
||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| class GCNSubtargetFeatureGeneration <string Value, | ||
|
|
@@ -1495,7 +1503,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", | |
| FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, | ||
| FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, | ||
| FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, | ||
| FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad | ||
| FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, | ||
| FeaturePrefLoopAlign32B | ||
| ] | ||
| >; | ||
|
|
||
|
|
@@ -1519,7 +1528,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", | |
| FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, | ||
| FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, | ||
| FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, | ||
| FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad | ||
| FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, FeaturePrefLoopAlign32B | ||
| ] | ||
| >; | ||
|
|
||
|
|
@@ -1542,7 +1551,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", | |
| FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, | ||
| FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, | ||
| FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, | ||
| FeatureVmemWriteVgprInOrder | ||
| FeatureVmemWriteVgprInOrder, FeaturePrefLoopAlign32B | ||
| ] | ||
| >; | ||
|
|
||
|
|
@@ -1566,7 +1575,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", | |
| FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32, | ||
| FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, | ||
| FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32, | ||
| FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics | ||
| FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics, | ||
| FeaturePrefLoopAlign32B | ||
| ] | ||
| >; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -80,6 +80,7 @@ class AMDGPUSubtarget { | |
| unsigned LocalMemorySize = 0; | ||
| unsigned AddressableLocalMemorySize = 0; | ||
| char WavefrontSizeLog2 = 0; | ||
| unsigned PrefLoopAlignmentLog2 = 0; | ||
|
|
||
| public: | ||
| AMDGPUSubtarget(Triple TT); | ||
|
|
@@ -377,6 +378,8 @@ class AMDGPUSubtarget { | |
| uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; | ||
| unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; | ||
|
|
||
| unsigned getPrefLoopAlignment() const { return PrefLoopAlignmentLog2; } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we really need a target feature for this? Can we just have the getter return the correct value based on gfx version directly below?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. prefer target feature over a check for every gfx version esp as new versions show up
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm more wondering if this is a universal property that should have always been done. The prior comment says there's no benefit pre-gfx10, so why is this now needed for gfx9?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Didn't see sufficient documentation to confirm this would benefit older targets, but another amdgpu compiler is doing this for all targets. So will make it universal.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, same doubt. |
||
|
|
||
| /// \returns Corresponding DWARF register number mapping flavour for the | ||
| /// \p WavefrontSize. | ||
| AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -53,9 +53,13 @@ using namespace llvm::SDPatternMatch; | |
|
|
||
| STATISTIC(NumTailCalls, "Number of tail calls"); | ||
|
|
||
| static cl::opt<bool> | ||
| DisableAllLoopAlignment("amdgpu-disable-all-loop-alignment", | ||
| cl::desc("Do not align loops"), cl::init(false)); | ||
|
|
||
| static cl::opt<bool> | ||
| DisableLoopAlignment("amdgpu-disable-loop-alignment", | ||
| cl::desc("Do not align and prefetch loops"), | ||
| cl::desc("Do not align loops for prefetch"), | ||
| cl::init(false)); | ||
|
Comment on lines
+56
to
63
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we not add new flags? There is a cost to them and we should be removing many of the ones we already have. Clang already has an explicit metadata driven flag to align or disable alignment of loops. |
||
|
|
||
| static cl::opt<bool> UseDivergentRegisterIndexing( | ||
|
|
@@ -17434,25 +17438,9 @@ Align SITargetLowering::computeKnownAlignForTargetInstr( | |
| Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { | ||
| const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML); | ||
| const Align CacheLineAlign = Align(64); | ||
|
|
||
| // Pre-GFX10 target did not benefit from loop alignment | ||
| if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() || | ||
| getSubtarget()->hasInstFwdPrefetchBug()) | ||
| return PrefAlign; | ||
|
|
||
| // On GFX10 I$ is 4 x 64 bytes cache lines. | ||
| // By default prefetcher keeps one cache line behind and reads two ahead. | ||
| // We can modify it with S_INST_PREFETCH for larger loops to have two lines | ||
| // behind and one ahead. | ||
| // Therefor we can benefit from aligning loop headers if loop fits 192 bytes. | ||
| // If loop fits 64 bytes it always spans no more than two cache lines and | ||
| // does not need an alignment. | ||
| // Else if loop is less or equal 128 bytes we do not need to modify prefetch, | ||
| // Else if loop is less or equal 192 bytes we need two lines behind. | ||
|
|
||
| const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); | ||
| const MachineBasicBlock *Header = ML->getHeader(); | ||
| if (Header->getAlignment() != PrefAlign) | ||
| if (DisableAllLoopAlignment || Header->getAlignment() > PrefAlign) | ||
| return Header->getAlignment(); // Already processed. | ||
|
|
||
| unsigned LoopSize = 0; | ||
|
|
@@ -17465,10 +17453,40 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { | |
| for (const MachineInstr &MI : *MBB) { | ||
| LoopSize += TII->getInstSizeInBytes(MI); | ||
| if (LoopSize > 192) | ||
| return PrefAlign; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| // Pre-GFX10 targets did not benefit from loop alignment driven by prefetch | ||
| // considerations | ||
| if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() || | ||
| getSubtarget()->hasInstFwdPrefetchBug()) { | ||
| // Align loops < 32 bytes agrressively | ||
| if (LoopSize <= 32) | ||
| return PrefAlign; | ||
| // Align larger loops less aggressively | ||
| if (!ML->isInnermost()) | ||
| return Header->getAlignment(); | ||
| return (PrefAlign.value() > 1) ? Align(PrefAlign.value() >> 1) : PrefAlign; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you rewrite this to use Align more directly, this is unnecessarily confusing |
||
| } | ||
|
|
||
| // On GFX10 I$ is 4 x 64 bytes cache lines. | ||
| // By default prefetcher keeps one cache line behind and reads two ahead. | ||
| // We can modify it with S_INST_PREFETCH for larger loops to have two lines | ||
| // behind and one ahead. | ||
| // Therefor we can benefit from aligning loop headers if loop fits 192 bytes. | ||
| // If loop fits 64 bytes it always spans no more than two cache lines and | ||
| // does not need an alignment. | ||
| // Else if loop is less or equal 128 bytes we do not need to modify prefetch, | ||
| // Else if loop is less or equal 192 bytes we need two lines behind. | ||
|
|
||
| // Align larger loops less aggressively | ||
| if (LoopSize > 192) { | ||
| if (!ML->isInnermost()) | ||
| return Header->getAlignment(); | ||
| return (PrefAlign.value() > 1) ? Align(PrefAlign.value() >> 1) : PrefAlign; | ||
| } | ||
|
|
||
| if (LoopSize <= 64) | ||
| return PrefAlign; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,5 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 | ||
| ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s | ||
| ; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Definitely should not be bulk adding this flag to tests
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Following tests have been changed and don't use the flag and this should be sufficient testing for this change. llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll Will remove the backend cl::opts and use -align-loops=1 for the remaining 110 tests. I think its an overkill to modify 110 additional tests for alignment. |
||
|
|
||
| define void @temporal_divergent_i32(float %val, ptr %addr) { | ||
| ; GFX10-LABEL: temporal_divergent_i32: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.