Skip to content

Conversation

@shiltian
Copy link
Contributor

@shiltian shiltian commented Aug 6, 2025

Globally addressable scratch is a new feature introduced in gfx1250. However, this feature changes how scratch space is mapped into the flat aperture, making address space casts from private to flat no longer uniform.

… target supports globally addressable scratch

Globally addressable scratch is a new feature introduced in gfx1250. However, this feature changes how scratch space is mapped into the flat aperture, making address space casts from private to flat no longer uniform.
@shiltian shiltian requested review from changpeng and rampitec August 6, 2025 20:27
@llvmbot llvmbot added backend:AMDGPU llvm:analysis Includes value tracking, cost tables and constant folding labels Aug 6, 2025
Copy link
Contributor Author

shiltian commented Aug 6, 2025

@llvmbot
Copy link
Member

llvmbot commented Aug 6, 2025

@llvm/pr-subscribers-llvm-analysis

@llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)

Changes

Globally addressable scratch is a new feature introduced in gfx1250. However, this feature changes how scratch space is mapped into the flat aperture, making address space casts from private to flat no longer uniform.


Full diff: https://github.com/llvm/llvm-project/pull/152376.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+23-3)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+25)
  • (added) llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir (+35)
  • (added) llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll (+14)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a0c99b0ef0491..846a0b6280f19 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -991,10 +991,21 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
     return true;
 
   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
-    if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
+    Intrinsic::ID IID = Intrinsic->getIntrinsicID();
+    switch (IID) {
+    case Intrinsic::read_register:
       return isReadRegisterSourceOfDivergence(Intrinsic);
-
-    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
+    case Intrinsic::amdgcn_addrspacecast_nonnull: {
+      unsigned SrcAS =
+          Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
+      unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
+      return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+             DstAS == AMDGPUAS::FLAT_ADDRESS &&
+             ST->hasGloballyAddressableScratch();
+    }
+    default:
+      return AMDGPU::isIntrinsicSourceOfDivergence(IID);
+    }
   }
 
   // Assume all function calls are a source of divergence.
@@ -1008,6 +1019,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
   if (isa<InvokeInst>(V))
     return true;
 
+  // If the target supports globally addressable scratch, the mapping from
+  // scratch memory to the flat aperture changes therefore an address space cast
+  // is no longer uniform.
+  if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
+    return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+           CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
+           ST->hasGloballyAddressableScratch();
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5f498a3f5a421..f20b22d14c984 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10074,7 +10074,30 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
 
 InstructionUniformity
 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
+  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
   unsigned opcode = MI.getOpcode();
+
+  auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
+    Register Dst = MI.getOperand(0).getReg();
+    Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
+                                       : MI.getOperand(1).getReg();
+    LLT DstTy = MRI.getType(Dst);
+    LLT SrcTy = MRI.getType(Src);
+    unsigned DstAS = DstTy.getAddressSpace();
+    unsigned SrcAS = SrcTy.getAddressSpace();
+    return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+                   DstAS == AMDGPUAS::FLAT_ADDRESS &&
+                   ST.hasGloballyAddressableScratch()
+               ? InstructionUniformity::NeverUniform
+               : InstructionUniformity::Default;
+  };
+
+  // If the target supports globally addressable scratch, the mapping from
+  // scratch memory to the flat aperture changes therefore an address space cast
+  // is no longer uniform.
+  if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
+    return HandleAddrSpaceCast(MI);
+
   if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
     auto IID = GI->getIntrinsicID();
     if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
@@ -10083,6 +10106,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
       return InstructionUniformity::AlwaysUniform;
 
     switch (IID) {
+    case Intrinsic::amdgcn_addrspacecast_nonnull:
+      return HandleAddrSpaceCast(MI);
     case Intrinsic::amdgcn_if:
     case Intrinsic::amdgcn_else:
       // FIXME: Uniform if second result
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir
new file mode 100644
index 0000000000000..612f7b7ef4ec4
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir
@@ -0,0 +1,35 @@
+# NOTE: This file is Generic MIR translation of llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll test file
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=print-machine-uniformity -filetype=null %s 2>&1 | FileCheck %s --check-prefix=UNI
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=print-machine-uniformity -filetype=null %s 2>&1 | FileCheck %s --check-prefix=DIV
+
+# UNI: ALL VALUES UNIFORM
+# DIV: DIVERGENT: %3: %3:_(p0) = G_ADDRSPACE_CAST %2:_(p5)
+# DIV: DIVERGENT: %4: %4:_(p0) = G_INTRINSIC intrinsic(@llvm.amdgcn.addrspacecast.nonnull), %2:_(p5)
+
+--- |
+  define void @foo() {
+    %alloca = alloca i32, align 4, addrspace(5)
+    %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+    store i32 1, ptr %cast, align 4
+    %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+    store i32 2, ptr %cast.1, align 4
+    ret void
+  }
+...
+---
+name:            foo
+stack:
+  - { id: 0, name: alloca, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body:             |
+  bb.1 (%ir-block.0):
+    %10:_(s32) = G_CONSTANT i32 1
+    %12:_(s32) = G_CONSTANT i32 2
+    %8:_(p5) = G_FRAME_INDEX %stack.0.alloca
+    %9:_(p0) = G_ADDRSPACE_CAST %8(p5)
+    G_STORE %10(s32), %9(p0) :: (store (s32) into %ir.cast)
+    %11:_(p0) = G_INTRINSIC intrinsic(@llvm.amdgcn.addrspacecast.nonnull), %8(p5)
+    G_STORE %12(s32), %11(p0) :: (store (s32) into %ir.cast.1)
+    SI_RETURN
+...
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll
new file mode 100644
index 0000000000000..e6808448651c8
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll
@@ -0,0 +1,14 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=UNI
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=DIV
+
+; UNI: ALL VALUES UNIFORM
+; DIV: DIVERGENT:   %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+; DIV: DIVERGENT:   %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+define void @foo() {
+  %alloca = alloca i32, align 4, addrspace(5)
+  %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+  store i32 1, ptr %cast
+  %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+  store i32 2, ptr %cast.1
+  ret void
+}

@shiltian shiltian merged commit 351b38f into main Aug 6, 2025
12 checks passed
@shiltian shiltian deleted the users/shiltian/as-cast-from-0-to-5-not-uniform branch August 6, 2025 21:08
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:AMDGPU llvm:analysis Includes value tracking, cost tables and constant folding

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants