Skip to content

Commit ef5cfa8

Browse files
shiltian authored and zGoldthorpe committed
[AMDGPU][Attributor] Remove final update of waves-per-eu after the attributor run (llvm#155246) (llvm#3772)
We do not need this in the attributor, because `ST.getWavesPerEU` accounts for both the waves-per-eu and flat-workgroup-size attributes. If the waves-per-eu values are not valid, it drops them. In the attributor, we only need to propagate the values without using intermediate flat workgroup size values. Fixes SWDEV-550257. (cherry picked from commit ca03045)
1 parent 4ce598d commit ef5cfa8

32 files changed

+676
-1357
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 1 addition & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -1295,74 +1295,6 @@ struct AAAMDGPUNoAGPR
12951295

12961296
const char AAAMDGPUNoAGPR::ID = 0;
12971297

1298-
/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
1299-
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
1300-
/// Both attributes start with narrow ranges that expand during iteration.
1301-
/// However, a narrower flat-workgroup-size leads to a wider waves-per-eu range,
1302-
/// preventing optimal updates later. Therefore, waves-per-eu can't be updated
1303-
/// with intermediate values during the attributor run. We defer the
1304-
/// finalization of waves-per-eu until after the flat-workgroup-size is
1305-
/// finalized.
1306-
/// TODO: Remove this and move similar logic back into the attributor run once
1307-
/// we have a better representation for waves-per-eu.
1308-
static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
1309-
bool Changed = false;
1310-
1311-
LLVMContext &Ctx = M.getContext();
1312-
1313-
for (Function &F : M) {
1314-
if (F.isDeclaration())
1315-
continue;
1316-
1317-
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
1318-
1319-
std::optional<std::pair<unsigned, std::optional<unsigned>>>
1320-
FlatWgrpSizeAttr =
1321-
AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
1322-
1323-
unsigned MinWavesPerEU = ST.getMinWavesPerEU();
1324-
unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();
1325-
1326-
unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize();
1327-
unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize();
1328-
if (FlatWgrpSizeAttr.has_value()) {
1329-
MinFlatWgrpSize = FlatWgrpSizeAttr->first;
1330-
MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
1331-
}
1332-
1333-
// Start with the "best" range.
1334-
unsigned Min = MinWavesPerEU;
1335-
unsigned Max = MinWavesPerEU;
1336-
1337-
// Compute the range from flat workgroup size. `getWavesPerEU` will also
1338-
// account for the 'amdgpu-waves-er-eu' attribute.
1339-
auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1340-
ST.getWavesPerEU(F, {MinFlatWgrpSize, MaxFlatWgrpSize});
1341-
1342-
// For the lower bound, we have to "tighten" it.
1343-
Min = std::max(Min, MinFromFlatWgrpSize);
1344-
// For the upper bound, we have to "extend" it.
1345-
Max = std::max(Max, MaxFromFlatWgrpSize);
1346-
1347-
// Clamp the range to the max range.
1348-
Min = std::max(Min, MinWavesPerEU);
1349-
Max = std::min(Max, MaxWavesPerEU);
1350-
1351-
// Update the attribute if it is not the max.
1352-
if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1353-
SmallString<10> Buffer;
1354-
raw_svector_ostream OS(Buffer);
1355-
OS << Min << ',' << Max;
1356-
Attribute OldAttr = F.getFnAttribute("amdgpu-waves-per-eu");
1357-
Attribute NewAttr = Attribute::get(Ctx, "amdgpu-waves-per-eu", OS.str());
1358-
F.addFnAttr(NewAttr);
1359-
Changed |= OldAttr == NewAttr;
1360-
}
1361-
}
1362-
1363-
return Changed;
1364-
}
1365-
13661298
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13671299
AMDGPUAttributorOptions Options) {
13681300
SetVector<Function *> Functions;
@@ -1437,11 +1369,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14371369
}
14381370
}
14391371

1440-
bool Changed = A.run() == ChangeStatus::CHANGED;
1441-
1442-
Changed |= updateWavesPerEU(M, TM);
1443-
1444-
return Changed;
1372+
return A.run() == ChangeStatus::CHANGED;
14451373
}
14461374

14471375
class AMDGPUAttributorLegacy : public ModulePass {

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -216,15 +216,6 @@ AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
216216
return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
217217
}
218218

219-
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
220-
const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
221-
// Minimum number of bytes allocated in the LDS.
222-
unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
223-
{0, UINT32_MAX}, true)
224-
.first;
225-
return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
226-
}
227-
228219
std::pair<unsigned, unsigned>
229220
AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
230221
unsigned LDSBytes, const Function &F) const {

llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll

Lines changed: 59 additions & 127 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll

Lines changed: 14 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -105,7 +105,7 @@ declare void @unknown()
105105

106106
define amdgpu_kernel void @kernel_calls_extern() {
107107
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
108-
; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
108+
; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
109109
; CHECK-NEXT: call void @unknown()
110110
; CHECK-NEXT: ret void
111111
;
@@ -115,8 +115,8 @@ define amdgpu_kernel void @kernel_calls_extern() {
115115

116116
define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
117117
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
118-
; CHECK-SAME: ) #[[ATTR3]] {
119-
; CHECK-NEXT: call void @unknown() #[[ATTR7:[0-9]+]]
118+
; CHECK-SAME: ) #[[ATTR2]] {
119+
; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]]
120120
; CHECK-NEXT: ret void
121121
;
122122
call void @unknown() #0
@@ -125,7 +125,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
125125

126126
define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
127127
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
128-
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
128+
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
129129
; CHECK-NEXT: call void [[INDIRECT]]()
130130
; CHECK-NEXT: ret void
131131
;
@@ -135,8 +135,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
135135

136136
define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
137137
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
138-
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
139-
; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR7]]
138+
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
139+
; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR6]]
140140
; CHECK-NEXT: ret void
141141
;
142142
call void %indirect() #0
@@ -240,14 +240,13 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
240240
}
241241

242242

243-
attributes #0 = { "amdgpu-no-agpr" }
243+
attributes #0 = { "amdgpu-agpr-alloc"="0" }
244244
;.
245-
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
246-
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
247-
; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
248-
; CHECK: attributes #[[ATTR3]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
249-
; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
250-
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
251-
; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
252-
; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-agpr" }
245+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
246+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
247+
; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
248+
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
249+
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
250+
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
251+
; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
253252
;.

llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -117,14 +117,14 @@ define void @call_no_dispatch_id() {
117117
ret void
118118
}
119119
;.
120-
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
121-
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
122-
; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
123-
; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
124-
; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
125-
; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
126-
; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
127-
; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
128-
; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
129-
; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
120+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
121+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
122+
; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
123+
; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "uniform-work-group-size"="false" }
124+
; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "uniform-work-group-size"="false" }
125+
; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "uniform-work-group-size"="false" }
126+
; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "uniform-work-group-size"="false" }
127+
; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "uniform-work-group-size"="false" }
128+
; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
129+
; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
130130
;.

0 commit comments

Comments
 (0)