@@ -494,6 +494,16 @@ class SIInsertWaitcnts {
494494 bool isVMEMOrFlatVMEM (const MachineInstr &MI) const ;
495495 bool run (MachineFunction &MF);
496496
497+ // Methods for expanding waitcnt instructions for profiling
498+ bool expandWaitcntsForProfiling (MachineFunction &MF);
499+ bool expandSingleWaitcnt (MachineInstr &MI, MachineBasicBlock &MBB);
500+ bool expandSingleCounterWait (MachineInstr &MI, MachineBasicBlock &MBB,
501+ InstCounterType CT);
502+ bool expandCounterSequence (MachineBasicBlock &MBB,
503+ MachineBasicBlock::iterator InsertPos,
504+ InstCounterType CT, unsigned CountValue,
505+ DebugLoc DL);
506+
497507 void setForceEmitWaitcnt () {
498508// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
499509// For debug builds, get the debug counter info and adjust if need be
@@ -2725,6 +2735,156 @@ SIInsertWaitcntsPass::run(MachineFunction &MF,
27252735 .preserve <AAManager>();
27262736}
27272737
2738+ // / Expand waitcnt instructions for profiling by inserting a sequence of
2739+ // / decreasing counter values. This helps identify which specific memory
2740+ // / operation is a bottleneck during PC sampling.
2741+ bool SIInsertWaitcnts::expandWaitcntsForProfiling (MachineFunction &MF) {
2742+ if (!ST->isExpandWaitcntProfilingEnabled ())
2743+ return false ;
2744+
2745+ bool Modified = false ;
2746+
2747+ // Iterate through all basic blocks
2748+ for (MachineBasicBlock &MBB : MF) {
2749+ for (auto I = MBB.begin (), E = MBB.end (); I != E;) {
2750+ MachineInstr &MI = *I;
2751+ ++I; // Advance iterator before potential expansion
2752+
2753+ if (ST->hasExtendedWaitCounts ()) {
2754+ // GFX12+: Handle separate wait instructions
2755+ if (auto CT = counterTypeForInstr (MI.getOpcode ())) {
2756+ Modified |= expandSingleCounterWait (MI, MBB, *CT);
2757+ }
2758+ } else {
2759+ // Pre-GFX12: Handle combined S_WAITCNT
2760+ if (MI.getOpcode () == AMDGPU::S_WAITCNT) {
2761+ Modified |= expandSingleWaitcnt (MI, MBB);
2762+ }
2763+ }
2764+ }
2765+ }
2766+
2767+ return Modified;
2768+ }
2769+
2770+ // / Expand a single S_WAITCNT instruction (pre-GFX12)
2771+ bool SIInsertWaitcnts::expandSingleWaitcnt (MachineInstr &MI,
2772+ MachineBasicBlock &MBB) {
2773+ assert (MI.getOpcode () == AMDGPU::S_WAITCNT);
2774+
2775+ // Decode the waitcnt immediate
2776+ unsigned Imm = MI.getOperand (0 ).getImm ();
2777+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion (ST->getCPU ());
2778+ AMDGPU::Waitcnt Wait = AMDGPU::decodeWaitcnt (IV, Imm);
2779+
2780+ // Insert expanded waitcnts BEFORE the original instruction
2781+ auto InsertPos = MI.getIterator ();
2782+ DebugLoc DL = MI.getDebugLoc ();
2783+
2784+ bool Modified = false ;
2785+
2786+ // Expand each counter independently
2787+ // For independent counters (Case 2 from requirements):
2788+ // vmcnt and lgkmcnt can be separated
2789+ Modified |= expandCounterSequence (MBB, InsertPos, LOAD_CNT, Wait.LoadCnt , DL);
2790+ Modified |= expandCounterSequence (MBB, InsertPos, DS_CNT, Wait.DsCnt , DL);
2791+ Modified |= expandCounterSequence (MBB, InsertPos, EXP_CNT, Wait.ExpCnt , DL);
2792+ Modified |=
2793+ expandCounterSequence (MBB, InsertPos, STORE_CNT, Wait.StoreCnt , DL);
2794+
2795+ // If we expanded anything, remove the original waitcnt
2796+ if (Modified) {
2797+ MI.eraseFromParent ();
2798+ }
2799+
2800+ return Modified;
2801+ }
2802+
2803+ // / Expand a single counter wait instruction (GFX12+)
2804+ bool SIInsertWaitcnts::expandSingleCounterWait (MachineInstr &MI,
2805+ MachineBasicBlock &MBB,
2806+ InstCounterType CT) {
2807+ // Get the counter value from the instruction
2808+ unsigned CountValue = MI.getOperand (0 ).getImm ();
2809+
2810+ // Insert expanded waitcnts BEFORE the original instruction
2811+ auto InsertPos = MI.getIterator ();
2812+ DebugLoc DL = MI.getDebugLoc ();
2813+
2814+ bool Modified = expandCounterSequence (MBB, InsertPos, CT, CountValue, DL);
2815+
2816+ // If we expanded, remove the original instruction
2817+ if (Modified) {
2818+ MI.eraseFromParent ();
2819+ }
2820+
2821+ return Modified;
2822+ }
2823+
2824+ // / Insert a sequence of wait instructions with decreasing counter values
2825+ bool SIInsertWaitcnts::expandCounterSequence (
2826+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPos,
2827+ InstCounterType CT, unsigned CountValue, DebugLoc DL) {
2828+ // Skip if counter is already at zero, not active, or at max (wait not needed)
2829+ if (CountValue == 0 || CountValue == ~0u )
2830+ return false ;
2831+
2832+ unsigned MaxCount = getWaitCountMax (CT);
2833+ if (CountValue >= MaxCount)
2834+ return false ;
2835+
2836+ bool Modified = false ;
2837+
2838+ // Generate decreasing sequence: CountValue-1, CountValue-2, ..., 1, 0
2839+ // We start from CountValue-1 because the original waitcnt already handles
2840+ // CountValue
2841+ for (int i = CountValue - 1 ; i >= 0 ; --i) {
2842+ if (ST->hasExtendedWaitCounts ()) {
2843+ // GFX12+: Use separate wait instructions
2844+ unsigned Opcode = instrsForExtendedCounterTypes[CT];
2845+ BuildMI (MBB, InsertPos, DL, TII->get (Opcode)).addImm (i);
2846+ } else {
2847+ // Pre-GFX12: Use combined S_WAITCNT with only this counter set
2848+ AMDGPU::Waitcnt Wait;
2849+ switch (CT) {
2850+ case LOAD_CNT:
2851+ Wait.LoadCnt = i;
2852+ break ;
2853+ case DS_CNT:
2854+ Wait.DsCnt = i;
2855+ break ;
2856+ case EXP_CNT:
2857+ Wait.ExpCnt = i;
2858+ break ;
2859+ case STORE_CNT:
2860+ Wait.StoreCnt = i;
2861+ break ;
2862+ case SAMPLE_CNT:
2863+ Wait.SampleCnt = i;
2864+ break ;
2865+ case BVH_CNT:
2866+ Wait.BvhCnt = i;
2867+ break ;
2868+ case KM_CNT:
2869+ Wait.KmCnt = i;
2870+ break ;
2871+ case X_CNT:
2872+ Wait.XCnt = i;
2873+ break ;
2874+ default :
2875+ break ;
2876+ }
2877+
2878+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion (ST->getCPU ());
2879+ unsigned Enc = AMDGPU::encodeWaitcnt (IV, Wait);
2880+ BuildMI (MBB, InsertPos, DL, TII->get (AMDGPU::S_WAITCNT)).addImm (Enc);
2881+ }
2882+ Modified = true ;
2883+ }
2884+
2885+ return Modified;
2886+ }
2887+
27282888bool SIInsertWaitcnts::run (MachineFunction &MF) {
27292889 ST = &MF.getSubtarget <GCNSubtarget>();
27302890 TII = ST->getInstrInfo ();
@@ -2963,5 +3123,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
29633123 PreheadersToFlush.clear ();
29643124 SLoadAddresses.clear ();
29653125
3126+ // Expand waitcnts for profiling if requested
3127+ if (ST->isExpandWaitcntProfilingEnabled ()) {
3128+ Modified |= expandWaitcntsForProfiling (MF);
3129+ }
3130+
29663131 return Modified;
29673132}
0 commit comments