From 18d9ec996b56a41533b8e428820e7cfdbfaf1bae Mon Sep 17 00:00:00 2001
From: Shilei Tian <i@tianshilei.me>
Date: Wed, 6 Aug 2025 16:23:26 -0400
Subject: [PATCH] [AMDGPU] Mark address space cast from private to flat as
 divergent if target supports globally addressable scratch

Globally addressable scratch is a new feature introduced in gfx1250.
However, this feature changes how scratch space is mapped into the flat
aperture, making address space casts from private to flat no longer
uniform.

---
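Note: the following is a minimal sketch, not part of the applied diff, of
how client code could observe the new behavior through UniformityInfo. It
assumes `UI` has already been computed for the enclosing function (e.g. via
UniformityInfoAnalysis); the helper name is hypothetical.

  #include "llvm/Analysis/UniformityAnalysis.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/Support/AMDGPUAddrSpace.h"

  // Returns true when `ASC` casts a private (scratch) pointer to flat and
  // the uniformity analysis reports it divergent. With this patch, that
  // holds on targets with globally addressable scratch (e.g. gfx1250) even
  // when the source pointer itself is uniform.
  static bool isDivergentPrivateToFlatCast(const llvm::UniformityInfo &UI,
                                           const llvm::AddrSpaceCastInst &ASC) {
    return ASC.getSrcAddressSpace() == llvm::AMDGPUAS::PRIVATE_ADDRESS &&
           ASC.getDestAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS &&
           UI.isDivergent(&ASC);
  }

The tests added below check the same property directly through the analysis
printers (print<uniformity> for IR, print-machine-uniformity for MIR).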
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 26 ++++++++++++--
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 25 +++++++++++++
 .../AMDGPU/MIR/addrspacecast.mir              | 35 +++++++++++++++++++
 .../AMDGPU/addrspacecast.ll                   | 14 ++++++++
 4 files changed, 97 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir
 create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a0c99b0ef0491..846a0b6280f19 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -991,10 +991,21 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
     return true;
 
   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
-    if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
+    Intrinsic::ID IID = Intrinsic->getIntrinsicID();
+    switch (IID) {
+    case Intrinsic::read_register:
       return isReadRegisterSourceOfDivergence(Intrinsic);
-
-    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
+    case Intrinsic::amdgcn_addrspacecast_nonnull: {
+      unsigned SrcAS =
+          Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
+      unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
+      return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+             DstAS == AMDGPUAS::FLAT_ADDRESS &&
+             ST->hasGloballyAddressableScratch();
+    }
+    default:
+      return AMDGPU::isIntrinsicSourceOfDivergence(IID);
+    }
   }
 
   // Assume all function calls are a source of divergence.
@@ -1008,6 +1019,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
   if (isa<InvokeInst>(V))
     return true;
 
+  // If the target supports globally addressable scratch, the mapping from
+  // scratch memory to the flat aperture changes, and therefore an address
+  // space cast from private to flat is no longer uniform.
+  if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
+    return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+           CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
+           ST->hasGloballyAddressableScratch();
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5f498a3f5a421..f20b22d14c984 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10074,7 +10074,30 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
 
 InstructionUniformity
 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
+  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
   unsigned opcode = MI.getOpcode();
+
+  auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
+    Register Dst = MI.getOperand(0).getReg();
+    Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
+                                       : MI.getOperand(1).getReg();
+    LLT DstTy = MRI.getType(Dst);
+    LLT SrcTy = MRI.getType(Src);
+    unsigned DstAS = DstTy.getAddressSpace();
+    unsigned SrcAS = SrcTy.getAddressSpace();
+    return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+                   DstAS == AMDGPUAS::FLAT_ADDRESS &&
+                   ST.hasGloballyAddressableScratch()
+               ? InstructionUniformity::NeverUniform
+               : InstructionUniformity::Default;
+  };
+
+  // If the target supports globally addressable scratch, the mapping from
+  // scratch memory to the flat aperture changes, and therefore an address
+  // space cast from private to flat is no longer uniform.
+  if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
+    return HandleAddrSpaceCast(MI);
+
   if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
     auto IID = GI->getIntrinsicID();
     if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
@@ -10083,6 +10106,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
       return InstructionUniformity::AlwaysUniform;
 
     switch (IID) {
+    case Intrinsic::amdgcn_addrspacecast_nonnull:
+      return HandleAddrSpaceCast(MI);
     case Intrinsic::amdgcn_if:
     case Intrinsic::amdgcn_else:
       // FIXME: Uniform if second result
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir
new file mode 100644
index 0000000000000..612f7b7ef4ec4
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir
@@ -0,0 +1,35 @@
+# NOTE: This file is a generic MIR translation of the llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll test file.
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=print-machine-uniformity -filetype=null %s 2>&1 | FileCheck %s --check-prefix=UNI
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=print-machine-uniformity -filetype=null %s 2>&1 | FileCheck %s --check-prefix=DIV
+
+# UNI: ALL VALUES UNIFORM
+# DIV: DIVERGENT: %3: %3:_(p0) = G_ADDRSPACE_CAST %2:_(p5)
+# DIV: DIVERGENT: %4: %4:_(p0) = G_INTRINSIC intrinsic(@llvm.amdgcn.addrspacecast.nonnull), %2:_(p5)
+
+--- |
+  define void @foo() {
+    %alloca = alloca i32, align 4, addrspace(5)
+    %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+    store i32 1, ptr %cast, align 4
+    %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+    store i32 2, ptr %cast.1, align 4
+    ret void
+  }
+...
+---
+name: foo
+stack:
+  - { id: 0, name: alloca, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body: |
+  bb.1 (%ir-block.0):
+    %10:_(s32) = G_CONSTANT i32 1
+    %12:_(s32) = G_CONSTANT i32 2
+    %8:_(p5) = G_FRAME_INDEX %stack.0.alloca
+    %9:_(p0) = G_ADDRSPACE_CAST %8(p5)
+    G_STORE %10(s32), %9(p0) :: (store (s32) into %ir.cast)
+    %11:_(p0) = G_INTRINSIC intrinsic(@llvm.amdgcn.addrspacecast.nonnull), %8(p5)
+    G_STORE %12(s32), %11(p0) :: (store (s32) into %ir.cast.1)
+    SI_RETURN
+...
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll
new file mode 100644
index 0000000000000..e6808448651c8
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll
@@ -0,0 +1,14 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=UNI
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=DIV
+
+; UNI: ALL VALUES UNIFORM
+; DIV: DIVERGENT: %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+; DIV: DIVERGENT: %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+define void @foo() {
+  %alloca = alloca i32, align 4, addrspace(5)
+  %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+  store i32 1, ptr %cast
+  %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+  store i32 2, ptr %cast.1
+  ret void
+}