Skip to content

Commit 58ac95d

Browse files
authored
[AMDGPU] Avoid changing minOccupancy if unclustered schedule was not run for any region. (llvm#162025)
During init of the unclustered schedule stage, minOccupancy may be temporarily increased. However, if none of the regions are subsequently scheduled because they do not meet the conditions in initGCNRegion, minOccupancy remains incorrectly raised. This patch avoids that incorrectness by delaying the change to minOccupancy until a region is actually about to be scheduled.
1 parent 830f690 commit 58ac95d

File tree

3 files changed

+90
-7
lines changed

3 files changed

+90
-7
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,18 +1228,20 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
12281228
createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
12291229

12301230
InitialOccupancy = DAG.MinOccupancy;
1231-
// Aggressivly try to reduce register pressure in the unclustered high RP
1231+
// Aggressively try to reduce register pressure in the unclustered high RP
12321232
// stage. Temporarily increase occupancy target in the region.
1233+
TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
1234+
? InitialOccupancy + 1
1235+
: InitialOccupancy;
1236+
IsAnyRegionScheduled = false;
12331237
S.SGPRLimitBias = S.HighRPSGPRBias;
12341238
S.VGPRLimitBias = S.HighRPVGPRBias;
1235-
if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1236-
MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
12371239

12381240
LLVM_DEBUG(
12391241
dbgs()
12401242
<< "Retrying function scheduling without clustering. "
1241-
"Aggressivly try to reduce register pressure to achieve occupancy "
1242-
<< DAG.MinOccupancy << ".\n");
1243+
"Aggressively try to reduce register pressure to achieve occupancy "
1244+
<< TempTargetOccupancy << ".\n");
12431245

12441246
return true;
12451247
}
@@ -1320,9 +1322,16 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
13201322
SavedMutations.swap(DAG.Mutations);
13211323
S.SGPRLimitBias = S.VGPRLimitBias = 0;
13221324
if (DAG.MinOccupancy > InitialOccupancy) {
1325+
assert(IsAnyRegionScheduled);
13231326
LLVM_DEBUG(dbgs() << StageID
13241327
<< " stage successfully increased occupancy to "
13251328
<< DAG.MinOccupancy << '\n');
1329+
} else if (!IsAnyRegionScheduled) {
1330+
assert(DAG.MinOccupancy == InitialOccupancy);
1331+
LLVM_DEBUG(dbgs() << StageID
1332+
<< ": No regions scheduled, min occupancy stays at "
1333+
<< DAG.MinOccupancy << ", MFI occupancy stays at "
1334+
<< MFI.getOccupancy() << ".\n");
13261335
}
13271336

13281337
GCNSchedStage::finalizeGCNSchedStage();
@@ -1396,13 +1405,27 @@ bool UnclusteredHighRPStage::initGCNRegion() {
13961405
// rescheduling of previous regions did not make occupancy drop back down to
13971406
// the initial minimum).
13981407
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1408+
// If no region has been scheduled yet, the DAG has not yet been updated with
1409+
// the occupancy target. So retrieve it from the temporary.
1410+
unsigned CurrentTargetOccupancy =
1411+
IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
13991412
if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1400-
(DAG.MinOccupancy <= InitialOccupancy ||
1413+
(CurrentTargetOccupancy <= InitialOccupancy ||
14011414
DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
14021415
InitialOccupancy))
14031416
return false;
14041417

1405-
return GCNSchedStage::initGCNRegion();
1418+
bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
1419+
// If this is the first region scheduled during this stage, make the target
1420+
// occupancy changes in the DAG and MFI.
1421+
if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
1422+
IsAnyRegionScheduled = true;
1423+
if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) {
1424+
DAG.MinOccupancy = TempTargetOccupancy;
1425+
MFI.increaseOccupancy(MF, TempTargetOccupancy);
1426+
}
1427+
}
1428+
return IsSchedulingThisRegion;
14061429
}
14071430

14081431
bool ClusteredLowOccStage::initGCNRegion() {

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,10 @@ class UnclusteredHighRPStage : public GCNSchedStage {
417417
private:
418418
// Save the initial occupancy before starting this stage.
419419
unsigned InitialOccupancy;
420+
// Save the temporary target occupancy before starting this stage.
421+
unsigned TempTargetOccupancy;
422+
// Track whether any region was scheduled by this stage.
423+
bool IsAnyRegionScheduled;
420424

421425
public:
422426
bool initGCNSchedStage() override;
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# REQUIRES: asserts
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -stress-regalloc=4 -debug-only=machine-scheduler %s -o - 2>&1 | FileCheck %s
3+
4+
--- |
5+
define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 {
6+
ret void
7+
}
8+
9+
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
10+
...
11+
12+
# This test checks for the following scenario: Unclustered high-RP-reschedule
13+
# stage raises the occupancy target temporarily but no region gets scheduled
14+
# because of constraints. Then, DAG and MFI min-occupancy should not be changed
15+
# at the end of the unclustered schedule stage.
16+
# CHECK: Retrying function scheduling without clustering. Aggressively try to reduce register pressure to achieve occupancy 5.
17+
# CHECK: Unclustered High Register Pressure Reschedule: No regions scheduled, min occupancy stays at 4, MFI occupancy stays at 4.
18+
19+
---
20+
name: no_sched_metric_due_to_spills
21+
tracksRegLiveness: true
22+
machineFunctionInfo:
23+
stackPtrOffsetReg: '$sgpr32'
24+
occupancy: 4
25+
body: |
26+
bb.0:
27+
liveins: $vgpr0, $sgpr0_sgpr1, $sgpr15
28+
29+
%0:sgpr_32 = COPY $sgpr15
30+
%1:sgpr_64 = COPY $sgpr0_sgpr1
31+
%2:vgpr_32 = COPY $vgpr0
32+
%3:sgpr_128 = S_LOAD_DWORDX4_IMM %1, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
33+
undef %4.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %1, 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
34+
%5:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 32, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
35+
%6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 64, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
36+
%7:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 84, 0 :: (dereferenceable invariant load (s32), addrspace 4)
37+
%8:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 112, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
38+
%9:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 128, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
39+
%10:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 176, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
40+
%11:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 192, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
41+
%12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 216, 0 :: (dereferenceable invariant load (s64), addrspace 4)
42+
%13:sreg_32 = S_ADD_I32 %12.sub0, 127, implicit-def dead $scc
43+
%14:sreg_32 = S_ASHR_I32 %13, 31, implicit-def dead $scc
44+
%15:sreg_32 = S_LSHR_B32 %14, 25, implicit-def dead $scc
45+
%16:sreg_32 = S_ADD_I32 %13, %15, implicit-def dead $scc
46+
%17:sreg_32 = S_ASHR_I32 %16, 7, implicit-def dead $scc
47+
%18:sreg_32 = S_ADD_I32 %12.sub1, 255, implicit-def dead $scc
48+
%19:sreg_32 = S_ASHR_I32 %18, 31, implicit-def dead $scc
49+
%20:sreg_32 = S_LSHR_B32 %19, 24, implicit-def dead $scc
50+
%21:sreg_32 = S_ADD_I32 %18, %20, implicit-def dead $scc
51+
%22:sreg_32 = S_ASHR_I32 %21, 8, implicit-def dead $scc
52+
%23:sreg_32 = nsw S_MUL_I32 %22, %17
53+
%24:sreg_32 = S_ASHR_I32 %0, 31, implicit-def dead $scc
54+
S_ENDPGM 0
55+
56+
...

0 commit comments

Comments
 (0)