[AMDGPU][gfx1250] Support "cluster" syncscope #157641
@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes

Defaults to "agent" for targets that do not support it.

Patch is 4.07 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/157641.diff

10 Files Affected:
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 2cddc3365d5d7..c7d515aeb012f 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -537,6 +537,8 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
                     - Packed
                       work-item                Add product
                       IDs                      names.
+                    - Workgroup
+                      Clusters
=========== =============== ============ ===== ================= =============== =============== ======================
@@ -1095,6 +1097,22 @@ is conservatively correct for OpenCL.
- ``wavefront`` and executed by a thread in the
same wavefront.
+ ``cluster`` Synchronizes with, and participates in modification
+ and seq_cst total orderings with, other operations
+ (except image operations) for all address spaces
+ (except private, or generic that accesses private)
+ provided the other operation's sync scope is:
+
+ - ``system``, ``agent`` or ``cluster`` and
+ executed by a thread on the same cluster.
+ - ``workgroup`` and executed by a thread in the
+ same work-group.
+ - ``wavefront`` and executed by a thread in the
+ same wavefront.
+
+ On targets that do not support workgroup cluster
+ launch mode, this behaves like ``agent`` scope instead.
+
``workgroup`` Synchronizes with, and participates in modification
and seq_cst total orderings with, other operations
(except image operations) for all address spaces
@@ -1128,6 +1146,9 @@ is conservatively correct for OpenCL.
``agent-one-as`` Same as ``agent`` but only synchronizes with other
operations within the same address space.
+ ``cluster-one-as`` Same as ``cluster`` but only synchronizes with other
+ operations within the same address space.
+
``workgroup-one-as`` Same as ``workgroup`` but only synchronizes with
other operations within the same address space.
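
For reference, the new scope plugs into LLVM IR like any other target-defined syncscope string. A minimal, hypothetical kernel (not part of this patch) exercising both variants:

```llvm
define amdgpu_kernel void @cluster_sync_sketch(ptr addrspace(1) %p) {
entry:
  ; Cluster-scoped RMW: visible to all work-groups launched in the same cluster.
  %old = atomicrmw add ptr addrspace(1) %p, i32 1 syncscope("cluster") monotonic
  ; Cross-address-space acquire fence at cluster scope.
  fence syncscope("cluster") acquire
  ; Single-address-space variant, mirroring the existing *-one-as scopes.
  fence syncscope("cluster-one-as") release
  ret void
}
```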
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index eda479064d7b2..d09b7cffe9f29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -23,6 +23,7 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
AgentSSID = CTX.getOrInsertSyncScopeID("agent");
WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
+ ClusterSSID = CTX.getOrInsertSyncScopeID("cluster");
SystemOneAddressSpaceSSID =
CTX.getOrInsertSyncScopeID("one-as");
AgentOneAddressSpaceSSID =
@@ -33,4 +34,5 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
CTX.getOrInsertSyncScopeID("wavefront-one-as");
SingleThreadOneAddressSpaceSSID =
CTX.getOrInsertSyncScopeID("singlethread-one-as");
+ ClusterOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("cluster-one-as");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index fcb0c8cfb7ca6..bf852bb38376e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -32,6 +32,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
SyncScope::ID WorkgroupSSID;
/// Wavefront synchronization scope ID (cross address space).
SyncScope::ID WavefrontSSID;
+ /// Cluster synchronization scope ID (cross address space).
+ SyncScope::ID ClusterSSID;
/// System synchronization scope ID (single address space).
SyncScope::ID SystemOneAddressSpaceSSID;
/// Agent synchronization scope ID (single address space).
@@ -42,6 +44,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
SyncScope::ID WavefrontOneAddressSpaceSSID;
/// Single thread synchronization scope ID (single address space).
SyncScope::ID SingleThreadOneAddressSpaceSSID;
+ /// Cluster synchronization scope ID (single address space).
+ SyncScope::ID ClusterOneAddressSpaceSSID;
/// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
@@ -60,12 +64,15 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
else if (SSID == getWorkgroupSSID() ||
SSID == getWorkgroupOneAddressSpaceSSID())
return 2;
+ else if (SSID == getClusterSSID() ||
+ SSID == getClusterOneAddressSpaceSSID())
+ return 3;
else if (SSID == getAgentSSID() ||
SSID == getAgentOneAddressSpaceSSID())
- return 3;
+ return 4;
else if (SSID == SyncScope::System ||
SSID == getSystemOneAddressSpaceSSID())
- return 4;
+ return 5;
return std::nullopt;
}
@@ -73,7 +80,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
/// \returns True if \p SSID is restricted to single address space, false
/// otherwise
bool isOneAddressSpace(SyncScope::ID SSID) const {
- return SSID == getSingleThreadOneAddressSpaceSSID() ||
+ return SSID == getClusterOneAddressSpaceSSID() ||
+ SSID == getSingleThreadOneAddressSpaceSSID() ||
SSID == getWavefrontOneAddressSpaceSSID() ||
SSID == getWorkgroupOneAddressSpaceSSID() ||
SSID == getAgentOneAddressSpaceSSID() ||
@@ -95,6 +103,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
SyncScope::ID getWavefrontSSID() const {
return WavefrontSSID;
}
+ /// \returns Cluster synchronization scope ID (cross address space).
+ SyncScope::ID getClusterSSID() const { return ClusterSSID; }
/// \returns System synchronization scope ID (single address space).
SyncScope::ID getSystemOneAddressSpaceSSID() const {
return SystemOneAddressSpaceSSID;
@@ -115,6 +125,10 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
return SingleThreadOneAddressSpaceSSID;
}
+ /// \returns Single thread synchronization scope ID (single address space).
+ SyncScope::ID getClusterOneAddressSpaceSSID() const {
+ return ClusterOneAddressSpaceSSID;
+ }
/// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
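
With the ranks above, the inclusion order becomes singlethread < wavefront < workgroup < cluster < agent < system; the singlethread and wavefront cases (presumably ranks 0 and 1) fall outside the visible hunk. Expressed as an IR fence ladder, narrowest to widest:

```llvm
define void @scope_ladder() {
  fence syncscope("singlethread") seq_cst ; narrowest
  fence syncscope("wavefront") seq_cst
  fence syncscope("workgroup") seq_cst
  fence syncscope("cluster") seq_cst      ; new: between workgroup and agent
  fence syncscope("agent") seq_cst
  fence seq_cst                           ; default: system, widest
  ret void
}
```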
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index e172a9c699fb1..cbd6f64976d21 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1833,6 +1833,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return GFX1250Insts && getGeneration() == GFX12;
}
+ /// \returns true if the subtarget supports clusters of workgroups.
+ bool hasClusters() const { return GFX1250Insts; }
+
/// \returns true if the subtarget requires a wait for xcnt before atomic
/// flat/global stores & rmw.
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0be6a9d09379f..1637c06936f9b 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -63,6 +63,7 @@ enum class SIAtomicScope {
SINGLETHREAD,
WAVEFRONT,
WORKGROUP,
+ CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
AGENT,
SYSTEM
};
@@ -106,6 +107,7 @@ class SIMemOpInfo final {
bool IsCooperative = false;
SIMemOpInfo(
+ const GCNSubtarget &ST,
AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
SIAtomicScope Scope = SIAtomicScope::SYSTEM,
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
@@ -156,6 +158,11 @@ class SIMemOpInfo final {
SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
this->Scope = std::min(Scope, SIAtomicScope::AGENT);
}
+
+ // On targets that have no concept of a workgroup cluster, use
+ // AGENT scope as a conservatively correct alternative.
+ if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
+ this->Scope = SIAtomicScope::AGENT;
}
public:
@@ -225,6 +232,7 @@ class SIMemOpInfo final {
class SIMemOpAccess final {
private:
const AMDGPUMachineModuleInfo *MMI = nullptr;
+ const GCNSubtarget &ST;
/// Reports unsupported message \p Msg for \p MI to LLVM context.
void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -248,7 +256,7 @@ class SIMemOpAccess final {
public:
/// Construct class to support accessing the machine memory operands
/// of instructions in the machine function \p MF.
- SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
+ SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
/// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
std::optional<SIMemOpInfo>
@@ -773,6 +781,8 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
if (SSID == MMI->getAgentSSID())
return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
+ if (SSID == MMI->getClusterSSID())
+ return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
if (SSID == MMI->getWorkgroupSSID())
return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
true);
@@ -788,6 +798,9 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
if (SSID == MMI->getAgentOneAddressSpaceSSID())
return std::tuple(SIAtomicScope::AGENT,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
+ if (SSID == MMI->getClusterOneAddressSpaceSSID())
+ return std::tuple(SIAtomicScope::CLUSTER,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
return std::tuple(SIAtomicScope::WORKGROUP,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
@@ -815,8 +828,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
return SIAtomicAddrSpace::OTHER;
}
-SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
- : MMI(&MMI_) {}
+SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
+ const GCNSubtarget &ST)
+ : MMI(&MMI_), ST(ST) {}
std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
const MachineBasicBlock::iterator &MI) const {
@@ -877,7 +891,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
return std::nullopt;
}
}
- return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
+ return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
IsNonTemporal, IsLastUse, IsCooperative);
}
@@ -891,7 +905,7 @@ SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo();
+ return SIMemOpInfo(ST);
return constructFromMIWithMMO(MI);
}
@@ -905,7 +919,7 @@ SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo();
+ return SIMemOpInfo(ST);
return constructFromMIWithMMO(MI);
}
@@ -946,8 +960,9 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
if (SynchronizeAS)
OrderingAddrSpace = *SynchronizeAS;
- return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
- IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
+ return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
+ SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
+ AtomicOrdering::NotAtomic);
}
std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -959,7 +974,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo();
+ return SIMemOpInfo(ST);
return constructFromMIWithMMO(MI);
}
@@ -2377,6 +2392,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
+ case SIAtomicScope::CLUSTER:
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2413,6 +2429,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
+ case SIAtomicScope::CLUSTER:
case SIAtomicScope::WORKGROUP:
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
// not needed as LDS operations for all waves are executed in a total
@@ -2495,6 +2512,9 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
case SIAtomicScope::AGENT:
ScopeImm = AMDGPU::CPol::SCOPE_DEV;
break;
+ case SIAtomicScope::CLUSTER:
+ ScopeImm = AMDGPU::CPol::SCOPE_SE;
+ break;
case SIAtomicScope::WORKGROUP:
// GFX12.0:
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -2565,6 +2585,7 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
.addImm(AMDGPU::CPol::SCOPE_DEV);
}
break;
+ case SIAtomicScope::CLUSTER:
case SIAtomicScope::WORKGROUP:
// No WB necessary, but we still have to wait.
break;
@@ -2649,11 +2670,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
// GFX12.0 only: Extra waits needed before system scope stores.
- if (!ST.hasGFX1250Insts()) {
- if (!Atomic && Scope == CPol::SCOPE_SYS)
- return insertWaitsBeforeSystemScopeStore(MI);
- return Changed;
- }
+ if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
+ Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
return Changed;
}
@@ -2684,6 +2702,9 @@ bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
case SIAtomicScope::AGENT:
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
break;
+ case SIAtomicScope::CLUSTER:
+ Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
+ break;
case SIAtomicScope::WORKGROUP:
// In workgroup mode, SCOPE_SE is needed as waves can execute on
// different CUs that access different L0s.
@@ -2930,8 +2951,8 @@ SIMemoryLegalizerPass::run(MachineFunction &MF,
bool SIMemoryLegalizer::run(MachineFunction &MF) {
bool Changed = false;
- SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
CC = SICacheControl::create(ST);
for (auto &MBB : MF) {
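
Net effect, as the tests below confirm: on gfx1250 a cluster-scoped acquire fence lowers to an SE-scoped invalidate, while targets without workgroup clusters conservatively promote it to agent (device) scope:

```llvm
define amdgpu_kernel void @cluster_acquire_fence() {
entry:
  fence syncscope("cluster") acquire
  ret void
}
; gfx1250 emits:  global_inv scope:SCOPE_SE    (true cluster scope)
; gfx12 emits:    global_inv scope:SCOPE_DEV   (promoted to agent)
```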
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index 736a8b58466dd..d288bfc6a09db 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1632,6 +1632,801 @@ entry:
ret void
}
+
+define amdgpu_kernel void @cluster_acquire_fence() {
+; GFX6-LABEL: cluster_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: cluster_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: cluster_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: cluster_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: cluster_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: cluster_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: cluster_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: cluster_acquire_fence:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: cluster_acquire_fence:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: buffer_inv sc1
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: cluster_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: cluster_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: cluster_acquire_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: cluster_acquire_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: cluster_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+entry:
+ fence syncscope("cluster") acquire
+ ret void
+}
+
+define amdgpu_kernel void @cluster_release_fence() {
+; GFX6-LABEL: cluster_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: cluster_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: cluster_release_fence:
+; GFX10-WGP: ; ...
[truncated]
Defaults to "agent" for targets that do not support it. - Add documentation - Register it in MachineModuleInfo - Add MemoryLegalizer support
  SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
    return SingleThreadOneAddressSpaceSSID;
  }
  /// \returns Single thread synchronization scope ID (single address space).
Minor typo, s/Single thread/Cluster

Defaults to "agent" for targets that do not support it.