Skip to content

Commit beb4047

Browse files
Implement compiler option -mamdgpu-expand-waitcnt-profiling to expand waitcnt instruction
1 parent ac047f2 commit beb4047

File tree

7 files changed

+427
-1
lines changed

7 files changed

+427
-1
lines changed

clang/include/clang/Driver/Options.td

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5497,7 +5497,10 @@ defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64",
54975497
" mode (AMDGPU only)">;
54985498
defm amdgpu_precise_memory_op
54995499
: SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable",
5500-
" precise memory mode (AMDGPU only)">;
5500+
" precise memory mode (AMDGPU only)", m_amdgpu_Features_Group>;
5501+
// Deliberately left in the default -m group rather than
// m_amdgpu_Features_Group: options in that group are turned into target
// features by spelling ("+amdgpu-expand-waitcnt-profiling"), which does not
// match the backend feature name "expand-waitcnt-profiling". The AMDGPU
// toolchain translates this flag to the correct feature explicitly.
defm amdgpu_expand_waitcnt_profiling
    : SimpleMFlag<"amdgpu-expand-waitcnt-profiling", "Enable", "Disable",
                  " waitcnt expansion for profiling (AMDGPU only)">;
55015504

55025505
def munsafe_fp_atomics : Flag<["-"], "munsafe-fp-atomics">,
55035506
Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, Alias<fatomic_ignore_denormal_mode>;

clang/lib/Driver/ToolChains/AMDGPU.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -700,6 +700,10 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
700700
options::OPT_mno_amdgpu_precise_memory_op, false))
701701
Features.push_back("+precise-memory");
702702

703+
if (Args.hasFlag(options::OPT_mamdgpu_expand_waitcnt_profiling,
704+
options::OPT_mno_amdgpu_expand_waitcnt_profiling, false))
705+
Features.push_back("+expand-waitcnt-profiling");
706+
703707
handleTargetFeaturesGroup(D, Triple, Args, Features,
704708
options::OPT_m_amdgpu_Features_Group);
705709
}

clang/test/Driver/amdgpu-features.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,9 @@
3838

3939
// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s
4040
// NO-PREC-MEM-NOT: {{".*precise-memory"}}
41+
42+
// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mamdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=EXPAND-WAITCNT %s
43+
// EXPAND-WAITCNT: "-target-feature" "+expand-waitcnt-profiling"
44+
45+
// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mno-amdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=NO-EXPAND-WAITCNT %s
46+
// NO-EXPAND-WAITCNT-NOT: "{{[+]}}expand-waitcnt-profiling"

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,10 @@ def FeaturePreciseMemory
223223
: SubtargetFeature<"precise-memory", "EnablePreciseMemory",
224224
"true", "Enable precise memory mode">;
225225

226+
// Subtarget feature "expand-waitcnt-profiling". When enabled it sets the
// EnableExpandWaitcntProfiling subtarget field to true; the field defaults
// to false in GCNSubtarget.
def FeatureExpandWaitcntProfiling
227+
: SubtargetFeature<"expand-waitcnt-profiling", "EnableExpandWaitcntProfiling",
228+
"true", "Expand waitcnt instructions for profiling">;
229+
226230
def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
227231
"SGPRInitBug",
228232
"true",

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
9090
bool EnableCuMode = false;
9191
bool TrapHandler = false;
9292
bool EnablePreciseMemory = false;
93+
bool EnableExpandWaitcntProfiling = false;
9394

9495
// Used as options.
9596
bool EnableLoadStoreOpt = false;
@@ -674,6 +675,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
674675

675676
bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
676677

678+
/// \returns the value of EnableExpandWaitcntProfiling, i.e. whether the
/// "expand-waitcnt-profiling" subtarget feature is enabled. Queried by
/// SIInsertWaitcnts to decide whether to expand wait instructions.
bool isExpandWaitcntProfilingEnabled() const {
679+
return EnableExpandWaitcntProfiling;
680+
}
681+
677682
bool hasFlatAddressSpace() const {
678683
return FlatAddressSpace;
679684
}

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,16 @@ class SIInsertWaitcnts {
494494
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
495495
bool run(MachineFunction &MF);
496496

497+
// Methods for expanding waitcnt instructions for profiling
498+
bool expandWaitcntsForProfiling(MachineFunction &MF);
499+
bool expandSingleWaitcnt(MachineInstr &MI, MachineBasicBlock &MBB);
500+
bool expandSingleCounterWait(MachineInstr &MI, MachineBasicBlock &MBB,
501+
InstCounterType CT);
502+
bool expandCounterSequence(MachineBasicBlock &MBB,
503+
MachineBasicBlock::iterator InsertPos,
504+
InstCounterType CT, unsigned CountValue,
505+
DebugLoc DL);
506+
497507
void setForceEmitWaitcnt() {
498508
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
499509
// For debug builds, get the debug counter info and adjust if need be
@@ -2725,6 +2735,156 @@ SIInsertWaitcntsPass::run(MachineFunction &MF,
27252735
.preserve<AAManager>();
27262736
}
27272737

2738+
/// Expand waitcnt instructions for profiling by inserting a sequence of
2739+
/// decreasing counter values. This helps identify which specific memory
2740+
/// operation is a bottleneck during PC sampling.
2741+
bool SIInsertWaitcnts::expandWaitcntsForProfiling(MachineFunction &MF) {
2742+
if (!ST->isExpandWaitcntProfilingEnabled())
2743+
return false;
2744+
2745+
bool Modified = false;
2746+
2747+
// Iterate through all basic blocks
2748+
for (MachineBasicBlock &MBB : MF) {
2749+
for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
2750+
MachineInstr &MI = *I;
2751+
++I; // Advance iterator before potential expansion
2752+
2753+
if (ST->hasExtendedWaitCounts()) {
2754+
// GFX12+: Handle separate wait instructions
2755+
if (auto CT = counterTypeForInstr(MI.getOpcode())) {
2756+
Modified |= expandSingleCounterWait(MI, MBB, *CT);
2757+
}
2758+
} else {
2759+
// Pre-GFX12: Handle combined S_WAITCNT
2760+
if (MI.getOpcode() == AMDGPU::S_WAITCNT) {
2761+
Modified |= expandSingleWaitcnt(MI, MBB);
2762+
}
2763+
}
2764+
}
2765+
}
2766+
2767+
return Modified;
2768+
}
2769+
2770+
/// Expand a single S_WAITCNT instruction (pre-GFX12)
2771+
bool SIInsertWaitcnts::expandSingleWaitcnt(MachineInstr &MI,
2772+
MachineBasicBlock &MBB) {
2773+
assert(MI.getOpcode() == AMDGPU::S_WAITCNT);
2774+
2775+
// Decode the waitcnt immediate
2776+
unsigned Imm = MI.getOperand(0).getImm();
2777+
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
2778+
AMDGPU::Waitcnt Wait = AMDGPU::decodeWaitcnt(IV, Imm);
2779+
2780+
// Insert expanded waitcnts BEFORE the original instruction
2781+
auto InsertPos = MI.getIterator();
2782+
DebugLoc DL = MI.getDebugLoc();
2783+
2784+
bool Modified = false;
2785+
2786+
// Expand each counter independently
2787+
// For independent counters (Case 2 from requirements):
2788+
// vmcnt and lgkmcnt can be separated
2789+
Modified |= expandCounterSequence(MBB, InsertPos, LOAD_CNT, Wait.LoadCnt, DL);
2790+
Modified |= expandCounterSequence(MBB, InsertPos, DS_CNT, Wait.DsCnt, DL);
2791+
Modified |= expandCounterSequence(MBB, InsertPos, EXP_CNT, Wait.ExpCnt, DL);
2792+
Modified |=
2793+
expandCounterSequence(MBB, InsertPos, STORE_CNT, Wait.StoreCnt, DL);
2794+
2795+
// If we expanded anything, remove the original waitcnt
2796+
if (Modified) {
2797+
MI.eraseFromParent();
2798+
}
2799+
2800+
return Modified;
2801+
}
2802+
2803+
/// Expand a single counter wait instruction (GFX12+)
2804+
bool SIInsertWaitcnts::expandSingleCounterWait(MachineInstr &MI,
2805+
MachineBasicBlock &MBB,
2806+
InstCounterType CT) {
2807+
// Get the counter value from the instruction
2808+
unsigned CountValue = MI.getOperand(0).getImm();
2809+
2810+
// Insert expanded waitcnts BEFORE the original instruction
2811+
auto InsertPos = MI.getIterator();
2812+
DebugLoc DL = MI.getDebugLoc();
2813+
2814+
bool Modified = expandCounterSequence(MBB, InsertPos, CT, CountValue, DL);
2815+
2816+
// If we expanded, remove the original instruction
2817+
if (Modified) {
2818+
MI.eraseFromParent();
2819+
}
2820+
2821+
return Modified;
2822+
}
2823+
2824+
/// Insert a sequence of wait instructions with decreasing counter values
2825+
bool SIInsertWaitcnts::expandCounterSequence(
2826+
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPos,
2827+
InstCounterType CT, unsigned CountValue, DebugLoc DL) {
2828+
// Skip if counter is already at zero, not active, or at max (wait not needed)
2829+
if (CountValue == 0 || CountValue == ~0u)
2830+
return false;
2831+
2832+
unsigned MaxCount = getWaitCountMax(CT);
2833+
if (CountValue >= MaxCount)
2834+
return false;
2835+
2836+
bool Modified = false;
2837+
2838+
// Generate decreasing sequence: CountValue-1, CountValue-2, ..., 1, 0
2839+
// We start from CountValue-1 because the original waitcnt already handles
2840+
// CountValue
2841+
for (int i = CountValue - 1; i >= 0; --i) {
2842+
if (ST->hasExtendedWaitCounts()) {
2843+
// GFX12+: Use separate wait instructions
2844+
unsigned Opcode = instrsForExtendedCounterTypes[CT];
2845+
BuildMI(MBB, InsertPos, DL, TII->get(Opcode)).addImm(i);
2846+
} else {
2847+
// Pre-GFX12: Use combined S_WAITCNT with only this counter set
2848+
AMDGPU::Waitcnt Wait;
2849+
switch (CT) {
2850+
case LOAD_CNT:
2851+
Wait.LoadCnt = i;
2852+
break;
2853+
case DS_CNT:
2854+
Wait.DsCnt = i;
2855+
break;
2856+
case EXP_CNT:
2857+
Wait.ExpCnt = i;
2858+
break;
2859+
case STORE_CNT:
2860+
Wait.StoreCnt = i;
2861+
break;
2862+
case SAMPLE_CNT:
2863+
Wait.SampleCnt = i;
2864+
break;
2865+
case BVH_CNT:
2866+
Wait.BvhCnt = i;
2867+
break;
2868+
case KM_CNT:
2869+
Wait.KmCnt = i;
2870+
break;
2871+
case X_CNT:
2872+
Wait.XCnt = i;
2873+
break;
2874+
default:
2875+
break;
2876+
}
2877+
2878+
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
2879+
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
2880+
BuildMI(MBB, InsertPos, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
2881+
}
2882+
Modified = true;
2883+
}
2884+
2885+
return Modified;
2886+
}
2887+
27282888
bool SIInsertWaitcnts::run(MachineFunction &MF) {
27292889
ST = &MF.getSubtarget<GCNSubtarget>();
27302890
TII = ST->getInstrInfo();
@@ -2963,5 +3123,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
29633123
PreheadersToFlush.clear();
29643124
SLoadAddresses.clear();
29653125

3126+
// Expand waitcnts for profiling if requested
3127+
if (ST->isExpandWaitcntProfilingEnabled()) {
3128+
Modified |= expandWaitcntsForProfiling(MF);
3129+
}
3130+
29663131
return Modified;
29673132
}

0 commit comments

Comments
 (0)