-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[mlir] Added Convergent
trait that matches LLVM's semantics
#152358
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,12 +14,13 @@ | |
#define NVVMIR_OPS | ||
|
||
include "mlir/IR/EnumAttr.td" | ||
include "mlir/Interfaces/ControlFlowInterfaces.td" | ||
include "mlir/Interfaces/InferIntRangeInterface.td" | ||
include "mlir/Interfaces/SideEffectInterfaces.td" | ||
include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td" | ||
include "mlir/Dialect/LLVMIR/LLVMOpBase.td" | ||
include "mlir/Dialect/LLVMIR/NVVMRequiresSMTraits.td" | ||
include "mlir/Interfaces/SideEffectInterfaces.td" | ||
include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td" | ||
include "mlir/Interfaces/InferIntRangeInterface.td" | ||
include "mlir/Dialect/LLVMIR/LLVMTypes.td" | ||
|
||
def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>; | ||
|
@@ -105,9 +106,10 @@ class NVVM_Op<string mnemonic, list<Trait> traits = []> : | |
} | ||
|
||
/// Base class that defines BasicPtxBuilderOpInterface. | ||
class NVVM_PTXBuilder_Op<string mnemonic, | ||
list<Trait> traits = [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]> : | ||
LLVM_OpBase<NVVM_Dialect, mnemonic, traits> { | ||
class NVVM_PTXBuilder_Op<string mnemonic, list<Trait> traits = []> : | ||
LLVM_OpBase<NVVM_Dialect, mnemonic, | ||
!listconcat(traits, | ||
[DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>])> { | ||
} | ||
|
||
//===----------------------------------------------------------------------===// | ||
|
@@ -561,7 +563,7 @@ def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">, | |
// NVVM synchronization op definitions | ||
//===----------------------------------------------------------------------===// | ||
|
||
def NVVM_Barrier0Op : NVVM_Op<"barrier0"> { | ||
def NVVM_Barrier0Op : NVVM_Op<"barrier0", [Convergent]> { | ||
let assemblyFormat = "attr-dict"; | ||
string llvmBuilder = [{ | ||
createIntrinsicCall( | ||
|
@@ -570,8 +572,9 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> { | |
}]; | ||
} | ||
|
||
def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> { | ||
let arguments = (ins | ||
def NVVM_BarrierOp : NVVM_Op<"barrier", | ||
[Convergent, AttrSizedOperandSegments]> { | ||
let arguments = (ins | ||
Optional<I32>:$barrierId, | ||
Optional<I32>:$numberOfThreads); | ||
string llvmBuilder = [{ | ||
|
@@ -598,7 +601,7 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> { | |
]; | ||
} | ||
|
||
def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive"> | ||
def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive", [Convergent]> | ||
{ | ||
let arguments = (ins Optional<I32>:$barrierId, I32:$numberOfThreads); | ||
|
||
|
@@ -624,7 +627,7 @@ def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive"> | |
}]; | ||
} | ||
|
||
def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive"> { | ||
def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive", [Convergent]> { | ||
let arguments = (ins OptionalAttr<UnitAttr>:$aligned); | ||
|
||
let summary = "Cluster Barrier Arrive Op"; | ||
|
@@ -647,7 +650,8 @@ def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive"> { | |
let assemblyFormat = "attr-dict"; | ||
} | ||
|
||
def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed", [NVVMRequiresSM<90>]> { | ||
def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed", | ||
[Convergent, NVVMRequiresSM<90>]> { | ||
let arguments = (ins OptionalAttr<UnitAttr>:$aligned); | ||
|
||
let summary = "Cluster Barrier Relaxed Arrive Op"; | ||
|
@@ -673,7 +677,8 @@ def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed", [NVVMRequire | |
let assemblyFormat = "attr-dict"; | ||
} | ||
|
||
def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait", [NVVMRequiresSM<90>]> { | ||
def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait", | ||
[Convergent, NVVMRequiresSM<90>]> { | ||
let arguments = (ins OptionalAttr<UnitAttr>:$aligned); | ||
|
||
let summary = "Cluster Barrier Wait Op"; | ||
|
@@ -1054,7 +1059,8 @@ def NVVM_CpAsyncWaitGroupOp : NVVM_Op<"cp.async.wait.group">, | |
let assemblyFormat = "$n attr-dict"; | ||
} | ||
|
||
def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive"> { | ||
def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive", | ||
[Convergent]> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure that these are actually convergent? See this comment: https://discourse.llvm.org/t/llvm-convergence-semantics/77642/12 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure I follow your comment. Are you referring to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The comment is a general one, so it potentially applies to all the intrinsics :)
I do have low confidence for LLVM current annotation as a reliable source of documentation right now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That's valid point. At the same time having different traits set on LLVM intrinsic and MLIR operation, that is lowered to that intrinsic, looks not only confusing, but stinky. Specifically about There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @AlexMaclean @Prince781 that will be great to revisit nvvm's intrinsic properties. I know PTX doc definitely describes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes the ".aligned" specifier in PTX is what brings the convergence requirement. But I'm not sure why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Absolutely: we should fix LLVM ;) |
||
let summary = "NVVM Dialect Op for cp.async.mbarrier.arrive"; | ||
let description = [{ | ||
The `cp.async.mbarrier.arrive` Op makes the mbarrier object track | ||
|
@@ -1079,7 +1085,8 @@ def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive"> { | |
}]; | ||
} | ||
|
||
def NVVM_CpAsyncMBarrierArriveSharedOp : NVVM_Op<"cp.async.mbarrier.arrive.shared"> { | ||
def NVVM_CpAsyncMBarrierArriveSharedOp : NVVM_Op<"cp.async.mbarrier.arrive.shared", | ||
[Convergent]> { | ||
let summary = "NVVM Dialect Op for cp.async.mbarrier.arrive.shared"; | ||
let description = [{ | ||
The `cp.async.mbarrier.arrive.shared` Op makes the mbarrier object | ||
|
@@ -2806,7 +2813,8 @@ def NVVM_CpAsyncBulkSharedCTAToGlobalOp : | |
// NVVM Wgmma Ops | ||
//===----------------------------------------------------------------------===// | ||
|
||
def NVVM_WgmmaFenceAlignedOp : NVVM_Op<"wgmma.fence.aligned", [NVVMRequiresSMa<[90]>]> { | ||
def NVVM_WgmmaFenceAlignedOp : NVVM_Op<"wgmma.fence.aligned", | ||
[Convergent, NVVMRequiresSMa<[90]>]> { | ||
let arguments = (ins); | ||
let description = [{ | ||
Enforce an ordering of register accesses between warpgroup level matrix | ||
|
@@ -2820,7 +2828,8 @@ def NVVM_WgmmaFenceAlignedOp : NVVM_Op<"wgmma.fence.aligned", [NVVMRequiresSMa<[ | |
}]; | ||
} | ||
|
||
def NVVM_WgmmaGroupSyncAlignedOp : NVVM_Op<"wgmma.commit.group.sync.aligned", [NVVMRequiresSMa<[90]>]> { | ||
def NVVM_WgmmaGroupSyncAlignedOp : NVVM_Op<"wgmma.commit.group.sync.aligned", | ||
[Convergent, NVVMRequiresSMa<[90]>]> { | ||
let assemblyFormat = "attr-dict"; | ||
let description = [{ | ||
Commits all prior uncommitted warpgroup level matrix multiplication operations. | ||
|
@@ -2832,7 +2841,8 @@ def NVVM_WgmmaGroupSyncAlignedOp : NVVM_Op<"wgmma.commit.group.sync.aligned", [N | |
}]; | ||
} | ||
|
||
def NVVM_WgmmaWaitGroupSyncOp : NVVM_Op<"wgmma.wait.group.sync.aligned", [NVVMRequiresSMa<[90]>]> { | ||
def NVVM_WgmmaWaitGroupSyncOp : NVVM_Op<"wgmma.wait.group.sync.aligned", | ||
[Convergent, NVVMRequiresSMa<[90]>]> { | ||
let arguments = (ins I64Attr:$group); | ||
let assemblyFormat = "attr-dict $group"; | ||
let description = [{ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -337,8 +337,12 @@ struct ReturnLike : public TraitBase<ConcreteType, ReturnLike> { | |
return success(); | ||
} | ||
}; | ||
} // namespace OpTrait | ||
|
||
// The Operation may not be made control-dependent on any additional values. | ||
// See https://llvm.org/docs/ConvergentOperations.html for more details. | ||
template <typename ConcreteType> | ||
struct Convergent : public TraitBase<ConcreteType, Convergent> {}; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there any impact that MLIR is using structured control-flow in the support for convergence? In particular aren't token inserted during lowering to CFG to preserve loop structures for example? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's no direct impact on absence of this in MLIR, but this trait is a prerequisite to properly annotate functions that invoke such operations. OtherwiseLLVM will apply optimizations incorrectly.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
For each Op that MLIR generates, LLVM sets There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Are you sure we would do it inter-procedurally and annotate the actual call here? @npanchen : when mapping a structured control-flow to a CFG, just annotating things as "convergent" isn't enough I believe. See this example: https://youtu.be/_Z5DuiVCFAw?t=434 ; basically structured control-flow needs to say "something" about the reconvergence property and the management of anchors (the video is amazing for explaining all this I think!) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the Also, the MLIR trait seems generally useful to prevent, say, loop multiversioning by trip count if loop contains such operation. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Thanks, that's quite interesting video. I actually was unable to find when completely side note: there's also There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They are emitted by clang, search for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. I was not searching right. |
||
} // namespace OpTrait | ||
} // namespace mlir | ||
|
||
//===----------------------------------------------------------------------===// | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2113,6 +2113,11 @@ def TestTypeChangerOp : TEST_Op<"type_changer">, | |
def TestValidOp : TEST_Op<"valid", [Terminator]>, | ||
Arguments<(ins Variadic<AnyType>)>; | ||
|
||
def TestConvergentOp : TEST_Op<"convergent", [Convergent]> { | ||
let arguments = (ins AnyType); | ||
let results = (outs AnyType); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is this doing right now? Without an actual test exercising this op, adding an op isn't useful. |
||
|
||
def TestMergeBlocksOp : TEST_Op<"merge_blocks"> { | ||
let summary = "merge_blocks operation"; | ||
let description = [{ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This instruction can synchronize different thread counts. Could you clarify what
convergent
refers to in this context? I may be missing the nuance—does it mean a convergent warp or a convergent CTA?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's is based on LLVM's assumption that intrinsics are convergent.
See my other reply about consistency of convergent properties.