Skip to content

Commit 351b38f

Browse files
authored
[AMDGPU] Mark address space cast from private to flat as divergent if target supports globally addressable scratch (#152376)
Globally addressable scratch is a new feature introduced in gfx1250. This feature changes how scratch space is mapped into the flat aperture, so address space casts from private to flat are no longer uniform.
1 parent 381623e commit 351b38f

File tree

4 files changed

+97
-3
lines changed

4 files changed

+97
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -991,10 +991,21 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
991991
return true;
992992

993993
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
994-
if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
994+
Intrinsic::ID IID = Intrinsic->getIntrinsicID();
995+
switch (IID) {
996+
case Intrinsic::read_register:
995997
return isReadRegisterSourceOfDivergence(Intrinsic);
996-
997-
return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
998+
case Intrinsic::amdgcn_addrspacecast_nonnull: {
999+
unsigned SrcAS =
1000+
Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1001+
unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1002+
return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1003+
DstAS == AMDGPUAS::FLAT_ADDRESS &&
1004+
ST->hasGloballyAddressableScratch();
1005+
}
1006+
default:
1007+
return AMDGPU::isIntrinsicSourceOfDivergence(IID);
1008+
}
9981009
}
9991010

10001011
// Assume all function calls are a source of divergence.
@@ -1008,6 +1019,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
10081019
if (isa<InvokeInst>(V))
10091020
return true;
10101021

1022+
// If the target supports globally addressable scratch, the mapping from
1023+
// scratch memory to the flat aperture changes, so an address space cast
1024+
// is no longer uniform.
1025+
if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1026+
return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1027+
CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1028+
ST->hasGloballyAddressableScratch();
1029+
}
1030+
10111031
return false;
10121032
}
10131033

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10074,7 +10074,30 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
1007410074

1007510075
InstructionUniformity
1007610076
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10077+
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1007710078
unsigned opcode = MI.getOpcode();
10079+
10080+
auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10081+
Register Dst = MI.getOperand(0).getReg();
10082+
Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10083+
: MI.getOperand(1).getReg();
10084+
LLT DstTy = MRI.getType(Dst);
10085+
LLT SrcTy = MRI.getType(Src);
10086+
unsigned DstAS = DstTy.getAddressSpace();
10087+
unsigned SrcAS = SrcTy.getAddressSpace();
10088+
return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10089+
DstAS == AMDGPUAS::FLAT_ADDRESS &&
10090+
ST.hasGloballyAddressableScratch()
10091+
? InstructionUniformity::NeverUniform
10092+
: InstructionUniformity::Default;
10093+
};
10094+
10095+
// If the target supports globally addressable scratch, the mapping from
10096+
// scratch memory to the flat aperture changes, so an address space cast
10097+
// is no longer uniform.
10098+
if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
10099+
return HandleAddrSpaceCast(MI);
10100+
1007810101
if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
1007910102
auto IID = GI->getIntrinsicID();
1008010103
if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
@@ -10083,6 +10106,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
1008310106
return InstructionUniformity::AlwaysUniform;
1008410107

1008510108
switch (IID) {
10109+
case Intrinsic::amdgcn_addrspacecast_nonnull:
10110+
return HandleAddrSpaceCast(MI);
1008610111
case Intrinsic::amdgcn_if:
1008710112
case Intrinsic::amdgcn_else:
1008810113
// FIXME: Uniform if second result
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# NOTE: This file is the Generic MIR translation of the llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll test file.
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=print-machine-uniformity -filetype=null %s 2>&1 | FileCheck %s --check-prefix=UNI
3+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=print-machine-uniformity -filetype=null %s 2>&1 | FileCheck %s --check-prefix=DIV
4+
5+
# UNI: ALL VALUES UNIFORM
6+
# DIV: DIVERGENT: %3: %3:_(p0) = G_ADDRSPACE_CAST %2:_(p5)
7+
# DIV: DIVERGENT: %4: %4:_(p0) = G_INTRINSIC intrinsic(@llvm.amdgcn.addrspacecast.nonnull), %2:_(p5)
8+
9+
--- |
10+
define void @foo() {
11+
%alloca = alloca i32, align 4, addrspace(5)
12+
%cast = addrspacecast ptr addrspace(5) %alloca to ptr
13+
store i32 1, ptr %cast, align 4
14+
%cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
15+
store i32 2, ptr %cast.1, align 4
16+
ret void
17+
}
18+
...
19+
---
20+
name: foo
21+
stack:
22+
- { id: 0, name: alloca, type: default, offset: 0, size: 4, alignment: 4,
23+
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
24+
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
25+
body: |
26+
bb.1 (%ir-block.0):
27+
%10:_(s32) = G_CONSTANT i32 1
28+
%12:_(s32) = G_CONSTANT i32 2
29+
%8:_(p5) = G_FRAME_INDEX %stack.0.alloca
30+
%9:_(p0) = G_ADDRSPACE_CAST %8(p5)
31+
G_STORE %10(s32), %9(p0) :: (store (s32) into %ir.cast)
32+
%11:_(p0) = G_INTRINSIC intrinsic(@llvm.amdgcn.addrspacecast.nonnull), %8(p5)
33+
G_STORE %12(s32), %11(p0) :: (store (s32) into %ir.cast.1)
34+
SI_RETURN
35+
...
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=UNI
2+
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=DIV
3+
4+
; UNI: ALL VALUES UNIFORM
5+
; DIV: DIVERGENT: %cast = addrspacecast ptr addrspace(5) %alloca to ptr
6+
; DIV: DIVERGENT: %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
7+
define void @foo() {
8+
%alloca = alloca i32, align 4, addrspace(5)
9+
%cast = addrspacecast ptr addrspace(5) %alloca to ptr
10+
store i32 1, ptr %cast
11+
%cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
12+
store i32 2, ptr %cast.1
13+
ret void
14+
}

0 commit comments

Comments
 (0)