[NVPTX] Add syncscope support for cmpxchg #140812
@llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-backend-nvptx @llvm/pr-subscribers-backend-powerpc

Author: Akshay Deodhar (akshayrdeodhar)

Changes: This PR adds support for cmpxchg instructions with syncscope, and adds PatFrags for matching syncscope on 3-input atomic operations in the NVPTX backend.

Patch is 2.76 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/140812.diff

19 Files Affected:
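As context for the diffs below, here is a minimal sketch (not part of the patch) of the kind of IR this change targets: a cmpxchg carrying a non-default syncscope, built through the LLVM C++ API. The scope string "device" and the helper name are illustrative assumptions; syncscope names are target-defined.

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Build an acquire cmpxchg at a target-specific ("device") scope. Before
// this patch, NVPTX instruction selection had no patterns keyed on the
// node's syncscope, so the scope was effectively ignored for cmpxchg.
static Value *emitScopedCas(IRBuilder<> &B, Value *Ptr, Value *Cmp,
                            Value *New) {
  SyncScope::ID DeviceSSID = B.getContext().getOrInsertSyncScopeID("device");
  return B.CreateAtomicCmpXchg(Ptr, Cmp, New, MaybeAlign(),
                               AtomicOrdering::Acquire,
                               AtomicOrdering::Acquire, DeviceSSID);
}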
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 03099e9ad44dc..b2a75965e6c2e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2319,13 +2319,15 @@ class TargetLoweringBase {
/// standard ABI uses a fence before a seq_cst load instead of after a
/// seq_cst store).
/// @{
- virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
-
- virtual Instruction *emitTrailingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
+ virtual Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
+
+ virtual Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
/// @}
// Emits code that executes when the comparison result in the ll/sc
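Note the design choice in the hunk above: the new SSID parameter defaults to SyncScope::System, so existing targets and callers that ignore scopes keep compiling unchanged. A hedged sketch of an override after this change ("MyTargetLowering" is a hypothetical class, not part of the patch; it mirrors the default implementation shown in the TargetLoweringBase.cpp hunk further down):

Instruction *MyTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord,
                                                 SyncScope::ID SSID) const {
  // Thread the instruction's scope through to the trailing fence instead
  // of always fencing at system scope.
  if (isAcquireOrStronger(Ord))
    return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
  return nullptr;
}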
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index c376de877ac7d..b8dcafa32052b 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,7 +65,8 @@ class AtomicExpandImpl {
const DataLayout *DL = nullptr;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+ SyncScope::ID SSID = SyncScope::System);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
+ SyncScope::ID SSID = SyncScope::System;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
@@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+ SSID = CASI->getSyncScopeID();
CASI->setSuccessOrdering(CASOrdering);
CASI->setFailureOrdering(CASOrdering);
+ // If the CAS ordering is monotonic, the operation takes the default
+ // scope; otherwise, it retains its original scope.
+ if (CASOrdering != AtomicOrdering::Monotonic)
+ CASI->setSyncScopeID(SSID);
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID);
}
} else if (I->hasAtomicStore() &&
TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
@@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F,
}
bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
- AtomicOrdering Order) {
+ AtomicOrdering Order,
+ SyncScope::ID SSID) {
ReplacementIRBuilder Builder(I, *DL);
- auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID);
- auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID);
// We have a guard here because not every atomic operation generates a
// trailing fence.
if (TrailingFence)
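A condensed sketch of the transformation bracketInstWithFences now performs for a scoped cmpxchg (illustrative only, with the pass's plumbing stripped out; the helper name is invented): the fences inherit the operation's syncscope rather than defaulting to system scope.

static void bracketWithScopedFences(AtomicCmpXchgInst *CASI,
                                    IRBuilderBase &B) {
  SyncScope::ID SSID = CASI->getSyncScopeID();
  AtomicOrdering Ord = CASI->getMergedOrdering();
  // Leading fence at the cmpxchg's own scope.
  B.SetInsertPoint(CASI);
  B.CreateFence(Ord, SSID);
  // Relax the cmpxchg itself; the real pass asks the target for the
  // post-split ordering via atomicOperationOrderAfterFenceSplit.
  CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
  CASI->setFailureOrdering(AtomicOrdering::Monotonic);
  // Trailing acquire fence, again at the same scope.
  B.SetInsertPoint(CASI->getNextNode());
  B.CreateFence(AtomicOrdering::Acquire, SSID);
}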
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c85f0c71ef25f..d0268545042ed 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2327,18 +2327,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isAcquireOrStronger(Ord))
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index afbf1b4c55e70..5196ce846d6a2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21229,7 +21229,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -21254,7 +21255,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..da09eca2b946f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -666,10 +666,12 @@ class VectorType;
void
emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *emitLeadingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *emitTrailingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
unsigned getMaxSupportedInterleaveFactor() const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1f417dbada8e6..0bf3e5dcdbf4e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6311,7 +6311,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
@@ -6319,15 +6320,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
if (isReleaseOrStronger(Ord))
return Ord == AtomicOrdering::SequentiallyConsistent
- ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
- : Builder.CreateFence(AtomicOrdering::Release);
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+ SSID)
+ : Builder.CreateFence(AtomicOrdering::Release, SSID);
return nullptr;
}
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
// Specialize for cmpxchg
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
@@ -6340,7 +6343,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
- return Builder.CreateFence(AtomicOrdering::Acquire);
+ return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
return nullptr;
}
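Two details worth noting about the NVPTX hooks above: for a seq_cst cmpxchg of a natively supported width, the leading fence.sc is now emitted at the instruction's scope and the trailing fence is skipped entirely (the CASWidth check); and the non-cmpxchg paths delegate to the base implementation without forwarding SSID, so those fences fall back to the default system scope.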
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index b4b7dad984b62..3f494c9066140 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -280,10 +280,14 @@ class NVPTXTargetLowering : public TargetLowering {
AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 193418ca391e5..4dbcf6183efe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -41,6 +41,27 @@ def AS_match {
}];
}
+multiclass nvvm_ternary_atomic_op_scoped<SDPatternOperator frag> {
+ defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val);
+ def NAME#_cta : PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Block;
+ }]>;
+ def NAME#_cluster : PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Cluster;
+ }]>;
+ def NAME#_gpu : PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Device;
+ }]>;
+ def NAME#_sys : PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System;
+ }]>;
+}
+
+
// A node that will be replaced with the current PTX version.
class PTX {
SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
@@ -2111,9 +2132,9 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
}
// has 3 operands
-multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
- SDPatternOperator op, list<Predicate> preds> {
- defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
+multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string scope_str, string as_str,
+ string op_str, SDPatternOperator op, list<Predicate> preds> {
+ defvar asm_str = "atom" # sem_str # scope_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def rr : NVPTXInst<(outs t.RC:$dst),
(ins ADDR:$addr, t.RC:$b, t.RC:$c),
@@ -2149,12 +2170,12 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
-multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
+multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string scope_str, string sem_str, string op_str, list<Predicate> preds = []> {
defvar frag_pat = (frag node:$a, node:$b, node:$c);
- defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
- defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
- defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
- defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
+ defm _G : F_ATOMIC_3<t, sem_str, scope_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
+ defm _S : F_ATOMIC_3<t, sem_str, scope_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _S_C : F_ATOMIC_3<t, sem_str, scope_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
+ defm _GEN : F_ATOMIC_3<t, sem_str, scope_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
// atom_add
@@ -2205,18 +2226,30 @@ foreach t = [I32RT, I64RT] in {
foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order);
+
+ // Instantiate scoped versions of the atomic compare and swap pattern
+ defm atomic_cmp_swap_i#t.Size#_#order : nvvm_ternary_atomic_op_scoped<atomic_cmp_swap_pat>;
+
+ foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+ defvar atomic_cmp_swap_pat_scoped = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order#_#scope);
+
+ // Syncscope is only supported for SM70+
+ defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat_scoped, "."#scope, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ }
+
// Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
// Memory orders are only supported on SM70+ and PTX 6.3+, so we have two sets of instruction
// definitions: one for SM70+, and "old" ones, which lower to "atom.cas", for earlier archs.
defm INT_PTX_ATOM_CAS_#t.Size#_#order
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "", "cas.b"#t.Size, []>;
}
}
// Note that 16-bit CAS support in PTX is emulated.
-defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -2246,7 +2279,8 @@ multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
RegTyInfo t, list<Predicate> Preds> {
defm "" : F_ATOMIC_3<t,
as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
- sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
+ sem_str = "",
+ scope_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
op_str = OpStr # "." # TypeStr,
op = !cast<Intrinsic>(
"int_nvvm_atomic_" # OpStr
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59bfec30dc211..6dd67c76b7077 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12588,7 +12588,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -12598,7 +12599,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e7e7c21b50395..964f5e11f78cd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,10 +927,14 @@ namespace llvm {
return true;
}
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
bool shouldInlineQuadwordAtomics() const;
diff --git a/llvm/lib/Target/RIS...
[truncated]
|
|
@llvm/pr-subscribers-backend-risc-v Author: Akshay Deodhar (akshayrdeodhar) ChangesThis MR adds support for cmpxchg instructions with syncscope. Adds PatFrags for matching syncscope for 3-input atomic operations in the NVPTX backend. Patch is 2.76 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140812.diff 19 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 03099e9ad44dc..b2a75965e6c2e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2319,13 +2319,15 @@ class TargetLoweringBase {
/// standard ABI uses a fence before a seq_cst load instead of after a
/// seq_cst store).
/// @{
- virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
-
- virtual Instruction *emitTrailingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
+ virtual Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
+
+ virtual Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
/// @}
// Emits code that executes when the comparison result in the ll/sc
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index c376de877ac7d..b8dcafa32052b 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,7 +65,8 @@ class AtomicExpandImpl {
const DataLayout *DL = nullptr;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+ SyncScope::ID SSID = SyncScope::System);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
+ SyncScope::ID SSID = SyncScope::System;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
@@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+ SSID = CASI->getSyncScopeID();
CASI->setSuccessOrdering(CASOrdering);
CASI->setFailureOrdering(CASOrdering);
+ // If CAS ordering is monotonic, then the operation will
+ // take default scope. Otherwise, it will retain its scope
+ if (CASOrdering != AtomicOrdering::Monotonic)
+ CASI->setSyncScopeID(SSID);
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID);
}
} else if (I->hasAtomicStore() &&
TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
@@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F,
}
bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
- AtomicOrdering Order) {
+ AtomicOrdering Order,
+ SyncScope::ID SSID) {
ReplacementIRBuilder Builder(I, *DL);
- auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID);
- auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID);
// We have a guard here because not every atomic operation generates a
// trailing fence.
if (TrailingFence)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c85f0c71ef25f..d0268545042ed 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2327,18 +2327,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isAcquireOrStronger(Ord))
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index afbf1b4c55e70..5196ce846d6a2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21229,7 +21229,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -21254,7 +21255,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..da09eca2b946f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -666,10 +666,12 @@ class VectorType;
void
emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *emitLeadingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
+ Instruction *emitTrailingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
unsigned getMaxSupportedInterleaveFactor() const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1f417dbada8e6..0bf3e5dcdbf4e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6311,7 +6311,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
@@ -6319,15 +6320,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
if (isReleaseOrStronger(Ord))
return Ord == AtomicOrdering::SequentiallyConsistent
- ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
- : Builder.CreateFence(AtomicOrdering::Release);
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+ SSID)
+ : Builder.CreateFence(AtomicOrdering::Release, SSID);
return nullptr;
}
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
// Specialize for cmpxchg
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
@@ -6340,7 +6343,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
- return Builder.CreateFence(AtomicOrdering::Acquire);
+ return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index b4b7dad984b62..3f494c9066140 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -280,10 +280,14 @@ class NVPTXTargetLowering : public TargetLowering {
AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 193418ca391e5..4dbcf6183efe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -41,6 +41,27 @@ def AS_match {
}];
}
+multiclass nvvm_ternary_atomic_op_scoped<SDPatternOperator frag> {
+ defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val);
+ def NAME#_cta: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Block;
+ }]>;
+ def NAME#_cluster : PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Cluster;
+ }]>;
+ def NAME#_gpu: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Device;
+ }]>;
+ def NAME#_sys: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System;
+ }]>;
+}
+
+
// A node that will be replaced with the current PTX version.
class PTX {
SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
@@ -2111,9 +2132,9 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
}
// has 3 operands
-multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
- SDPatternOperator op, list<Predicate> preds> {
- defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
+multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string scope_str, string as_str,
+ string op_str, SDPatternOperator op, list<Predicate> preds> {
+ defvar asm_str = "atom" # sem_str # scope_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def rr : NVPTXInst<(outs t.RC:$dst),
(ins ADDR:$addr, t.RC:$b, t.RC:$c),
@@ -2149,12 +2170,12 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
-multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
+multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string scope_str, string sem_str, string op_str, list<Predicate> preds = []> {
defvar frag_pat = (frag node:$a, node:$b, node:$c);
- defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
- defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
- defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
- defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
+ defm _G : F_ATOMIC_3<t, sem_str, scope_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
+ defm _S : F_ATOMIC_3<t, sem_str, scope_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _S_C : F_ATOMIC_3<t, sem_str, scope_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
+ defm _GEN : F_ATOMIC_3<t, sem_str, scope_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
// atom_add
@@ -2205,18 +2226,30 @@ foreach t = [I32RT, I64RT] in {
foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order);
+
+ // Instantiate scoped versions of the atomic compare and swap pattern
+ defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped<atomic_cmp_swap_pat>;
+
+ foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+ defvar atomic_cmp_swap_pat_scoped = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order#_#scope);
+
+ // Syncscope is only supported for SM70+
+ defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat_scoped, "."#scope, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ }
+
// Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
// Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions-
// for SM70+, and "old" ones which lower to "atom.cas", for earlier archs.
defm INT_PTX_ATOM_CAS_#t.Size#_#order
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "", "cas.b"#t.Size, []>;
}
}
// Note that 16-bit CAS support in PTX is emulated.
-defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -2246,7 +2279,8 @@ multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
RegTyInfo t, list<Predicate> Preds> {
defm "" : F_ATOMIC_3<t,
as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
- sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
+ sem_str = "",
+ scope_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
op_str = OpStr # "." # TypeStr,
op = !cast<Intrinsic>(
"int_nvvm_atomic_" # OpStr
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59bfec30dc211..6dd67c76b7077 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12588,7 +12588,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -12598,7 +12599,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e7e7c21b50395..964f5e11f78cd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,10 +927,14 @@ namespace llvm {
return true;
}
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
bool shouldInlineQuadwordAtomics() const;
diff --git a/llvm/lib/Target/RIS...
[truncated]
|
|
@llvm/pr-subscribers-backend-nvptx Author: Akshay Deodhar (akshayrdeodhar) ChangesThis MR adds support for cmpxchg instructions with syncscope. Adds PatFrags for matching syncscope for 3-input atomic operations in the NVPTX backend. Patch is 2.76 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140812.diff 19 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 03099e9ad44dc..b2a75965e6c2e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2319,13 +2319,15 @@ class TargetLoweringBase {
/// standard ABI uses a fence before a seq_cst load instead of after a
/// seq_cst store).
/// @{
- virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
-
- virtual Instruction *emitTrailingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
+ virtual Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
+
+ virtual Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
/// @}
// Emits code that executes when the comparison result in the ll/sc
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index c376de877ac7d..b8dcafa32052b 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,7 +65,8 @@ class AtomicExpandImpl {
const DataLayout *DL = nullptr;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+ SyncScope::ID SSID = SyncScope::System);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
+ SyncScope::ID SSID = SyncScope::System;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
@@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+ SSID = CASI->getSyncScopeID();
CASI->setSuccessOrdering(CASOrdering);
CASI->setFailureOrdering(CASOrdering);
+ // If CAS ordering is monotonic, then the operation will
+ // take default scope. Otherwise, it will retain its scope
+ if (CASOrdering != AtomicOrdering::Monotonic)
+ CASI->setSyncScopeID(SSID);
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID);
}
} else if (I->hasAtomicStore() &&
TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
@@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F,
}
bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
- AtomicOrdering Order) {
+ AtomicOrdering Order,
+ SyncScope::ID SSID) {
ReplacementIRBuilder Builder(I, *DL);
- auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID);
- auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID);
// We have a guard here because not every atomic operation generates a
// trailing fence.
if (TrailingFence)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c85f0c71ef25f..d0268545042ed 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2327,18 +2327,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isAcquireOrStronger(Ord))
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index afbf1b4c55e70..5196ce846d6a2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21229,7 +21229,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -21254,7 +21255,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..da09eca2b946f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -666,10 +666,12 @@ class VectorType;
void
emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *emitLeadingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
+ Instruction *emitTrailingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
unsigned getMaxSupportedInterleaveFactor() const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1f417dbada8e6..0bf3e5dcdbf4e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6311,7 +6311,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
@@ -6319,15 +6320,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
if (isReleaseOrStronger(Ord))
return Ord == AtomicOrdering::SequentiallyConsistent
- ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
- : Builder.CreateFence(AtomicOrdering::Release);
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+ SSID)
+ : Builder.CreateFence(AtomicOrdering::Release, SSID);
return nullptr;
}
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
// Specialize for cmpxchg
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
@@ -6340,7 +6343,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
- return Builder.CreateFence(AtomicOrdering::Acquire);
+ return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index b4b7dad984b62..3f494c9066140 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -280,10 +280,14 @@ class NVPTXTargetLowering : public TargetLowering {
AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 193418ca391e5..4dbcf6183efe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -41,6 +41,27 @@ def AS_match {
}];
}
+multiclass nvvm_ternary_atomic_op_scoped<SDPatternOperator frag> {
+ defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val);
+ def NAME#_cta: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Block;
+ }]>;
+ def NAME#_cluster : PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Cluster;
+ }]>;
+ def NAME#_gpu: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Device;
+ }]>;
+ def NAME#_sys: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System;
+ }]>;
+}
+
+
// A node that will be replaced with the current PTX version.
class PTX {
SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
@@ -2111,9 +2132,9 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
}
// has 3 operands
-multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
- SDPatternOperator op, list<Predicate> preds> {
- defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
+multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string scope_str, string as_str,
+ string op_str, SDPatternOperator op, list<Predicate> preds> {
+ defvar asm_str = "atom" # sem_str # scope_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def rr : NVPTXInst<(outs t.RC:$dst),
(ins ADDR:$addr, t.RC:$b, t.RC:$c),
@@ -2149,12 +2170,12 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
-multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
+multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string scope_str, string sem_str, string op_str, list<Predicate> preds = []> {
defvar frag_pat = (frag node:$a, node:$b, node:$c);
- defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
- defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
- defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
- defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
+ defm _G : F_ATOMIC_3<t, sem_str, scope_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
+ defm _S : F_ATOMIC_3<t, sem_str, scope_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _S_C : F_ATOMIC_3<t, sem_str, scope_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
+ defm _GEN : F_ATOMIC_3<t, sem_str, scope_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
// atom_add
@@ -2205,18 +2226,30 @@ foreach t = [I32RT, I64RT] in {
foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order);
+
+ // Instantiate scoped versions of the atomic compare and swap pattern
+ defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped<atomic_cmp_swap_pat>;
+
+ foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+ defvar atomic_cmp_swap_pat_scoped = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order#_#scope);
+
+ // Syncscope is only supported for SM70+
+ defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat_scoped, "."#scope, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ }
+
// Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
// Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions-
// for SM70+, and "old" ones which lower to "atom.cas", for earlier archs.
defm INT_PTX_ATOM_CAS_#t.Size#_#order
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "", "cas.b"#t.Size, []>;
}
}
// Note that 16-bit CAS support in PTX is emulated.
-defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -2246,7 +2279,8 @@ multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
RegTyInfo t, list<Predicate> Preds> {
defm "" : F_ATOMIC_3<t,
as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
- sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
+ sem_str = "",
+ scope_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
op_str = OpStr # "." # TypeStr,
op = !cast<Intrinsic>(
"int_nvvm_atomic_" # OpStr
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59bfec30dc211..6dd67c76b7077 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12588,7 +12588,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -12598,7 +12599,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e7e7c21b50395..964f5e11f78cd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,10 +927,14 @@ namespace llvm {
return true;
}
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
bool shouldInlineQuadwordAtomics() const;
diff --git a/llvm/lib/Target/RIS...
[truncated]
|
|
@llvm/pr-subscribers-backend-powerpc Author: Akshay Deodhar (akshayrdeodhar) ChangesThis MR adds support for cmpxchg instructions with syncscope. Adds PatFrags for matching syncscope for 3-input atomic operations in the NVPTX backend. Patch is 2.76 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140812.diff 19 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 03099e9ad44dc..b2a75965e6c2e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2319,13 +2319,15 @@ class TargetLoweringBase {
/// standard ABI uses a fence before a seq_cst load instead of after a
/// seq_cst store).
/// @{
- virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
-
- virtual Instruction *emitTrailingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
+ virtual Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
+
+ virtual Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
/// @}
// Emits code that executes when the comparison result in the ll/sc
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index c376de877ac7d..b8dcafa32052b 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,7 +65,8 @@ class AtomicExpandImpl {
const DataLayout *DL = nullptr;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+ SyncScope::ID SSID = SyncScope::System);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
+ SyncScope::ID SSID = SyncScope::System;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
@@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+ SSID = CASI->getSyncScopeID();
CASI->setSuccessOrdering(CASOrdering);
CASI->setFailureOrdering(CASOrdering);
+ // If CAS ordering is monotonic, then the operation will
+ // take default scope. Otherwise, it will retain its scope
+ if (CASOrdering != AtomicOrdering::Monotonic)
+ CASI->setSyncScopeID(SSID);
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID);
}
} else if (I->hasAtomicStore() &&
TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
@@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F,
}
bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
- AtomicOrdering Order) {
+ AtomicOrdering Order,
+ SyncScope::ID SSID) {
ReplacementIRBuilder Builder(I, *DL);
- auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID);
- auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID);
// We have a guard here because not every atomic operation generates a
// trailing fence.
if (TrailingFence)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c85f0c71ef25f..d0268545042ed 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2327,18 +2327,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isAcquireOrStronger(Ord))
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index afbf1b4c55e70..5196ce846d6a2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21229,7 +21229,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -21254,7 +21255,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..da09eca2b946f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -666,10 +666,12 @@ class VectorType;
void
emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *emitLeadingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *emitTrailingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
unsigned getMaxSupportedInterleaveFactor() const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1f417dbada8e6..0bf3e5dcdbf4e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6311,7 +6311,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
@@ -6319,15 +6320,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
if (isReleaseOrStronger(Ord))
return Ord == AtomicOrdering::SequentiallyConsistent
- ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
- : Builder.CreateFence(AtomicOrdering::Release);
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+ SSID)
+ : Builder.CreateFence(AtomicOrdering::Release, SSID);
return nullptr;
}
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
// Specialize for cmpxchg
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
@@ -6340,7 +6343,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
- return Builder.CreateFence(AtomicOrdering::Acquire);
+ return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index b4b7dad984b62..3f494c9066140 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -280,10 +280,14 @@ class NVPTXTargetLowering : public TargetLowering {
AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 193418ca391e5..4dbcf6183efe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -41,6 +41,27 @@ def AS_match {
}];
}
+multiclass nvvm_ternary_atomic_op_scoped<SDPatternOperator frag> {
+ defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val);
+ def NAME#_cta: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Block;
+ }]>;
+ def NAME#_cluster : PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Cluster;
+ }]>;
+ def NAME#_gpu: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Device;
+ }]>;
+ def NAME#_sys: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System;
+ }]>;
+}
+
+
// A node that will be replaced with the current PTX version.
class PTX {
SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
@@ -2111,9 +2132,9 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
}
// has 3 operands
-multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
- SDPatternOperator op, list<Predicate> preds> {
- defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
+multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string scope_str, string as_str,
+ string op_str, SDPatternOperator op, list<Predicate> preds> {
+ defvar asm_str = "atom" # sem_str # scope_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def rr : NVPTXInst<(outs t.RC:$dst),
(ins ADDR:$addr, t.RC:$b, t.RC:$c),
@@ -2149,12 +2170,12 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
-multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
+multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string scope_str, string sem_str, string op_str, list<Predicate> preds = []> {
defvar frag_pat = (frag node:$a, node:$b, node:$c);
- defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
- defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
- defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
- defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
+ defm _G : F_ATOMIC_3<t, sem_str, scope_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
+ defm _S : F_ATOMIC_3<t, sem_str, scope_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _S_C : F_ATOMIC_3<t, sem_str, scope_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
+ defm _GEN : F_ATOMIC_3<t, sem_str, scope_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
// atom_add
@@ -2205,18 +2226,30 @@ foreach t = [I32RT, I64RT] in {
foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order);
+
+ // Instantiate scoped versions of the atomic compare and swap pattern
+ defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped<atomic_cmp_swap_pat>;
+
+ foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+ defvar atomic_cmp_swap_pat_scoped = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order#_#scope);
+
+ // Syncscope is only supported for SM70+
+ defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat_scoped, "."#scope, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ }
+
// Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
// Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions-
// for SM70+, and "old" ones which lower to "atom.cas", for earlier archs.
defm INT_PTX_ATOM_CAS_#t.Size#_#order
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "", "cas.b"#t.Size, []>;
}
}
// Note that 16-bit CAS support in PTX is emulated.
-defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -2246,7 +2279,8 @@ multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
RegTyInfo t, list<Predicate> Preds> {
defm "" : F_ATOMIC_3<t,
as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
- sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
+ sem_str = "",
+ scope_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
op_str = OpStr # "." # TypeStr,
op = !cast<Intrinsic>(
"int_nvvm_atomic_" # OpStr
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59bfec30dc211..6dd67c76b7077 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12588,7 +12588,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -12598,7 +12599,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e7e7c21b50395..964f5e11f78cd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,10 +927,14 @@ namespace llvm {
return true;
}
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
bool shouldInlineQuadwordAtomics() const;
diff --git a/llvm/lib/Target/RIS...
[truncated]
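For a concrete picture of what the new scoped patterns are intended to select: the `asm_str` in `F_ATOMIC_3` is built as `"atom" # sem_str # scope_str # as_str # "." # op_str`, so a relaxed, block-scoped CAS on a global pointer should come out as `atom.relaxed.cta.global.cas`. The example below is a hand-written sketch (illustrative register names, not verified compiler output; it assumes the SM70+/PTX 6.3+ path):

```llvm
; Block(CTA)-scoped relaxed cmpxchg on a global-space pointer.
define i32 @cas_block_scope(ptr addrspace(1) %p, i32 %cmp, i32 %new) {
  %pair = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
  %old = extractvalue { i32, i1 } %pair, 0
  ret i32 %old
}

; Expected selection, per the string concatenation above (sketch):
;   atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r2, %r3;
```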
CC: @gonzalobg
✅ With the latest revision this PR passed the C/C++ code formatter.
LGTM in principle.
Can we do this without modifying the base API? It looks like the SSID information is enclosed in the instruction, which we can query in the NVPTX-specific NVPTXTargetLowering::emitTrailingFence/NVPTXTargetLowering::emitLeadingFence.
✅ With the latest revision this PR passed the Python code formatter.
@akshayrdeodhar when looking at the tests, e.g., here: https://github.com/llvm/llvm-project/blob/026e94ab2832d1e207439c8f52f2482206b848f5/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll#L28, I observe that a non-atomic load is being generated to read the initial CAS value. If so, that load needs to be an atomic load of the appropriate scope.
This does make sense: the first load generated by the emulation loop should be atomic! AtomicExpand does not currently do this. Are we sure that generating an atomic load is valid for all targets? If so, we'll have to modify AtomicExpand to issue atomic loads.
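To make the concern concrete, here is a hand-written IR sketch of the general shape of such an emulation loop (simplified: it omits the masking/shifting that the real partword expansion performs, and all names are illustrative). The line under discussion is the initial read, which today is a plain load rather than an atomic load at the cmpxchg's scope:

```llvm
define i32 @cas_loop_sketch(ptr %word, i32 %desired) {
entry:
  ; Non-atomic initial read, as currently generated. The suggestion is to use
  ; an atomic load at the operation's scope instead, e.g.:
  ;   %init = load atomic i32, ptr %word syncscope("block") monotonic, align 4
  %init = load i32, ptr %word, align 4
  br label %loop

loop:
  ; Retry with the last observed value until the CAS succeeds.
  %seen = phi i32 [ %init, %entry ], [ %old, %retry ]
  %pair = cmpxchg ptr %word, i32 %seen, i32 %desired syncscope("block") monotonic monotonic
  %old = extractvalue { i32, i1 } %pair, 0
  %ok = extractvalue { i32, i1 } %pair, 1
  br i1 %ok, label %done, label %retry

retry:
  br label %loop

done:
  ret i32 %old
}
```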
Also, I'm planning to make the fix to AtomicExpandPass for the issue that @gonzalobg pointed out in a separate PR:
For what it's worth, I think it's helpful to modify the base API because some targets in an internal LLVM fork would also benefit from scoping fences. This is cleaner than having to check the type of the instruction in both functions. Perhaps it would be better done in a separate patch though!
Thank you for this. The reasoning behind not modifying the base API was this: in any expansion, the scope of the fence will be the same as the scope of the cmpxchg. The cmpxchg will always retain its scope. At least in NVPTX we don't see a counterexample. Do you have a counterexample for this?
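As an illustration of that invariant (a hand-written sketch of the intended expansion, not actual pass output; the `device` scope is chosen arbitrarily), a seq_cst cmpxchg at scope S splits into a leading fence at S, a monotonic cmpxchg that keeps S, and, where a trailing fence is required, an acquire fence at S:

```llvm
; Before expansion:
;   %pair = cmpxchg ptr %p, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
define i32 @split_sketch(ptr %p, i32 %cmp, i32 %new) {
  ; Leading fence inherits the cmpxchg's scope.
  fence syncscope("device") seq_cst
  %pair = cmpxchg ptr %p, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
  ; Trailing fence (when one is emitted) inherits the scope as well.
  fence syncscope("device") acquire
  %old = extractvalue { i32, i1 } %pair, 0
  ret i32 %old
}
```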
llvm/test/CodeGen/NVPTX/cmpxchg.py
Perhaps we should just run this directly from lit instead of manually generating and committing such large files?
@AlexMaclean An alternative would be to create a "slice" of the test surface. Something like:
Thanks for pointing this out; I had missed it.
LGTM, thanks for bearing with all the review comments!
LGTM with some nits.
LGTM modulo one nit (add a comment, see suggestion).
Co-authored-by: gonzalobg <[email protected]>
Thanks for the detailed reviews!
I'm seeing a test failure. The failing test is from Triton: https://github.com/triton-lang/triton/blob/main/python/test/unit/language/test_core.py
Reduced reproducer: https://godbolt.org/z/Txbh96MYK
The question is: where exactly is the problem? Or do we expect
The problem was in Triton code, which copy/pasted use of