Skip to content

Commit 84ab301

Browse files
authored
[AMDGPU][Attributor] Infer inreg attribute in AMDGPUAttributor (llvm#146720)
This patch introduces `AAAMDGPUUniformArgument`, which can infer the `inreg` function argument attribute. The idea is that if the call site arguments corresponding to a function argument are always uniform, we can mark that argument as `inreg` and thus pass it via an SGPR. In addition, this AA is also able to propagate the `inreg` attribute where feasible.
1 parent cc68e45 commit 84ab301

File tree

3 files changed

+204
-3
lines changed

3 files changed

+204
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 117 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "AMDGPU.h"
1414
#include "GCNSubtarget.h"
1515
#include "Utils/AMDGPUBaseInfo.h"
16+
#include "llvm/Analysis/TargetTransformInfo.h"
1617
#include "llvm/IR/IntrinsicsAMDGPU.h"
1718
#include "llvm/IR/IntrinsicsR600.h"
1819
#include "llvm/Target/TargetMachine.h"
@@ -1296,6 +1297,116 @@ struct AAAMDGPUNoAGPR
12961297

12971298
const char AAAMDGPUNoAGPR::ID = 0;
12981299

1300+
struct AAAMDGPUUniform : public StateWrapper<BooleanState, AbstractAttribute> {
1301+
using Base = StateWrapper<BooleanState, AbstractAttribute>;
1302+
AAAMDGPUUniform(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1303+
1304+
/// Create an abstract attribute view for the position \p IRP.
1305+
static AAAMDGPUUniform &createForPosition(const IRPosition &IRP,
1306+
Attributor &A);
1307+
1308+
/// See AbstractAttribute::getName()
1309+
StringRef getName() const override { return "AAAMDGPUUniform"; }
1310+
1311+
const std::string getAsStr(Attributor *A) const override {
1312+
return getAssumed() ? "uniform" : "divergent";
1313+
}
1314+
1315+
void trackStatistics() const override {}
1316+
1317+
/// See AbstractAttribute::getIdAddr()
1318+
const char *getIdAddr() const override { return &ID; }
1319+
1320+
/// This function should return true if the type of the \p AA is
1321+
/// AAAMDGPUUniform
1322+
static bool classof(const AbstractAttribute *AA) {
1323+
return (AA->getIdAddr() == &ID);
1324+
}
1325+
1326+
/// Unique ID (due to the unique address)
1327+
static const char ID;
1328+
};
1329+
1330+
const char AAAMDGPUUniform::ID = 0;
1331+
1332+
/// This AA is to infer the inreg attribute for a function argument.
1333+
struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
1334+
AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A)
1335+
: AAAMDGPUUniform(IRP, A) {}
1336+
1337+
void initialize(Attributor &A) override {
1338+
Argument *Arg = getAssociatedArgument();
1339+
CallingConv::ID CC = Arg->getParent()->getCallingConv();
1340+
if (Arg->hasAttribute(Attribute::InReg)) {
1341+
indicateOptimisticFixpoint();
1342+
return;
1343+
}
1344+
1345+
if (AMDGPU::isEntryFunctionCC(CC)) {
1346+
// We only use isArgPassedInSGPR on kernel entry function argument, so
1347+
// even if we will use SPGR for non-uniform i1 argument passing, it will
1348+
// not affect this.
1349+
if (AMDGPU::isArgPassedInSGPR(Arg))
1350+
indicateOptimisticFixpoint();
1351+
else
1352+
indicatePessimisticFixpoint();
1353+
}
1354+
}
1355+
1356+
ChangeStatus updateImpl(Attributor &A) override {
1357+
unsigned ArgNo = getAssociatedArgument()->getArgNo();
1358+
1359+
auto isUniform = [&](AbstractCallSite ACS) -> bool {
1360+
CallBase *CB = ACS.getInstruction();
1361+
Value *V = CB->getArgOperand(ArgNo);
1362+
if (isa<Constant>(V))
1363+
return true;
1364+
if (auto *Arg = dyn_cast<Argument>(V)) {
1365+
auto *AA = A.getOrCreateAAFor<AAAMDGPUUniform>(
1366+
IRPosition::argument(*Arg), this, DepClassTy::REQUIRED);
1367+
return AA && AA->isValidState();
1368+
}
1369+
const TargetTransformInfo *TTI =
1370+
A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(
1371+
*CB->getFunction());
1372+
return TTI->isAlwaysUniform(V);
1373+
};
1374+
1375+
bool UsedAssumedInformation = true;
1376+
if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
1377+
UsedAssumedInformation))
1378+
return indicatePessimisticFixpoint();
1379+
1380+
if (!UsedAssumedInformation)
1381+
return indicateOptimisticFixpoint();
1382+
1383+
return ChangeStatus::UNCHANGED;
1384+
}
1385+
1386+
ChangeStatus manifest(Attributor &A) override {
1387+
Argument *Arg = getAssociatedArgument();
1388+
// If the argument already has inreg attribute, we will not do anything
1389+
// about it.
1390+
if (Arg->hasAttribute(Attribute::InReg))
1391+
return ChangeStatus::UNCHANGED;
1392+
if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv()))
1393+
return ChangeStatus::UNCHANGED;
1394+
LLVMContext &Ctx = Arg->getContext();
1395+
return A.manifestAttrs(getIRPosition(),
1396+
{Attribute::get(Ctx, Attribute::InReg)});
1397+
}
1398+
};
1399+
1400+
/// Allocate the AAAMDGPUUniform implementation for \p IRP from the
/// Attributor's allocator. Only argument positions are supported; any other
/// position kind is unreachable.
AAAMDGPUUniform &AAAMDGPUUniform::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  const bool IsArgumentPosition =
      IRP.getPositionKind() == IRPosition::IRP_ARGUMENT;
  if (IsArgumentPosition)
    return *new (A.Allocator) AAAMDGPUUniformArgument(IRP, A);
  llvm_unreachable("not a valid position for AAAMDGPUUniform");
}
1409+
12991410
/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
13001411
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
13011412
/// Both attributes start with narrow ranges that expand during iteration.
@@ -1382,7 +1493,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13821493
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
13831494
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
13841495
&AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
1385-
&AAIndirectCallInfo::ID});
1496+
&AAIndirectCallInfo::ID, &AAAMDGPUUniform::ID});
13861497

13871498
AttributorConfig AC(CGUpdater);
13881499
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1435,6 +1546,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14351546
A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
14361547
A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
14371548
}
1549+
1550+
if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
1551+
for (auto &Arg : F->args())
1552+
A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(Arg));
1553+
}
14381554
}
14391555
}
14401556

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o - | FileCheck %s
3+
4+
; @g1 is passed as a constant pointer argument by @kernel_with_constant;
; @g2 and @g3 are the store destinations used inside the callees.
@g1 = protected addrspace(1) externally_initialized global i32 0, align 4
5+
@g2 = protected addrspace(1) externally_initialized global i32 0, align 4
6+
@g3 = protected addrspace(1) externally_initialized global i32 0, align 4
7+
8+
; Positive test: both call sites in this file pass either a readfirstlane
; result or the constant @g1 for %x, and a kernel argument for %y, so both
; parameters are expected to be marked inreg.
define internal void @callee_with_always_uniform_argument(ptr addrspace(1) %x, i32 %y) {
9+
; CHECK-LABEL: define internal void @callee_with_always_uniform_argument(
10+
; CHECK-SAME: ptr addrspace(1) inreg [[X:%.*]], i32 inreg [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
11+
; CHECK-NEXT: [[ENTRY:.*:]]
12+
; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
13+
; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g2, align 4
14+
; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g3, align 4
15+
; CHECK-NEXT: ret void
16+
;
17+
entry:
18+
%x.val = load i32, ptr addrspace(1) %x, align 4
19+
store i32 %x.val, ptr addrspace(1) @g2, align 4
20+
store i32 %y, ptr addrspace(1) @g3, align 4
21+
ret void
22+
}
23+
24+
; Kernel call site that passes %p through llvm.amdgcn.readfirstlane before
; handing it to the callee, and forwards the kernel argument %x unchanged.
define amdgpu_kernel void @kernel_with_readfirstlane(ptr addrspace(1) %p, i32 %x) {
25+
; CHECK-LABEL: define amdgpu_kernel void @kernel_with_readfirstlane(
26+
; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
27+
; CHECK-NEXT: [[ENTRY:.*:]]
28+
; CHECK-NEXT: [[P0:%.*]] = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) [[P]])
29+
; CHECK-NEXT: call void @callee_with_always_uniform_argument(ptr addrspace(1) [[P0]], i32 [[X]])
30+
; CHECK-NEXT: ret void
31+
;
32+
entry:
33+
%p0 = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) %p)
34+
call void @callee_with_always_uniform_argument(ptr addrspace(1) %p0, i32 %x)
35+
ret void
36+
}
37+
38+
; Kernel call site that passes the constant address @g1 together with the
; kernel argument %x to the callee.
define amdgpu_kernel void @kernel_with_constant(i32 %x) {
39+
; CHECK-LABEL: define amdgpu_kernel void @kernel_with_constant(
40+
; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0]] {
41+
; CHECK-NEXT: [[ENTRY:.*:]]
42+
; CHECK-NEXT: call void @callee_with_always_uniform_argument(ptr addrspace(1) @g1, i32 [[X]])
43+
; CHECK-NEXT: ret void
44+
;
45+
entry:
46+
call void @callee_with_always_uniform_argument(ptr addrspace(1) @g1, i32 %x)
47+
ret void
48+
}
49+
50+
; Negative test: the single call site in this file passes values derived from
; llvm.amdgcn.workitem.id.x, so neither parameter is expected to gain inreg.
define internal void @callee_without_always_uniform_argument(ptr addrspace(1) %x, i32 %y) {
51+
; CHECK-LABEL: define internal void @callee_without_always_uniform_argument(
52+
; CHECK-SAME: ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
53+
; CHECK-NEXT: [[ENTRY:.*:]]
54+
; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
55+
; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g2, align 4
56+
; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g3, align 4
57+
; CHECK-NEXT: ret void
58+
;
59+
entry:
60+
%x.val = load i32, ptr addrspace(1) %x, align 4
61+
store i32 %x.val, ptr addrspace(1) @g2, align 4
62+
store i32 %y, ptr addrspace(1) @g3, align 4
63+
ret void
64+
}
65+
66+
; Kernel call site whose operands depend on the work-item id: a GEP indexed by
; llvm.amdgcn.workitem.id.x and the i32 loaded through that GEP.
define amdgpu_kernel void @kernel_with_divergent_callsite_argument(ptr addrspace(1) %p, i32 %x) {
67+
; CHECK-LABEL: define amdgpu_kernel void @kernel_with_divergent_callsite_argument(
68+
; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
69+
; CHECK-NEXT: [[ENTRY:.*:]]
70+
; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
71+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[P]], i32 [[ID_X]]
72+
; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4
73+
; CHECK-NEXT: call void @callee_without_always_uniform_argument(ptr addrspace(1) [[GEP]], i32 [[D]])
74+
; CHECK-NEXT: ret void
75+
;
76+
entry:
77+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
78+
%gep = getelementptr i32, ptr addrspace(1) %p, i32 %id.x
79+
%d = load i32, ptr addrspace(1) %gep
80+
call void @callee_without_always_uniform_argument(ptr addrspace(1) %gep, i32 %d)
81+
ret void
82+
}
83+
84+
; Declarations of the AMDGPU intrinsics called by the kernels in this file.
declare ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1))
85+
declare noundef i32 @llvm.amdgcn.workitem.id.x()

llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,7 @@ bb.2.end:
480480

481481
define internal void @callee_no_alias_addr_space_select(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val) #0 {
482482
; CHECK-LABEL: define internal void @callee_no_alias_addr_space_select(
483-
; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
483+
; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 inreg [[COND1:%.*]], i1 inreg [[COND2:%.*]], i32 inreg [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
484484
; CHECK-NEXT: [[PTR4:%.*]] = select i1 [[COND1]], ptr addrspacecast (ptr addrspace(1) @gptr to ptr), ptr addrspacecast (ptr addrspace(4) @gptr2 to ptr)
485485
; CHECK-NEXT: [[PTR5:%.*]] = select i1 [[COND2]], ptr [[PTR4]], ptr addrspacecast (ptr addrspace(3) @gptr3 to ptr)
486486
; CHECK-NEXT: store i32 [[VAL]], ptr [[PTR5]], align 4, !noalias.addrspace [[META1:![0-9]+]]
@@ -516,7 +516,7 @@ define internal void @callee_no_alias_addr_space_select(ptr %ptr1, ptr %ptr2, pt
516516

517517
define internal void @callee_alias_addr_space_branch(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val) #0 {
518518
; CHECK-LABEL: define internal void @callee_alias_addr_space_branch(
519-
; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]]) #[[ATTR1]] {
519+
; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 inreg [[COND1:%.*]], i1 inreg [[COND2:%.*]], i32 inreg [[VAL:%.*]]) #[[ATTR1]] {
520520
; CHECK-NEXT: br i1 [[COND1]], label %[[BB_1_TRUE:.*]], label %[[BB_1_FALSE:.*]]
521521
; CHECK: [[BB_1_TRUE]]:
522522
; CHECK-NEXT: br label %[[BB_1_END:.*]]

0 commit comments

Comments
 (0)