@@ -2617,6 +2617,30 @@ def Tcgen05GroupKindAttr :
26172617 let assemblyFormat = "`<` $value `>`";
26182618}
26192619
2620+ def Tcgen05FenceBefore : I32EnumAttrCase<"BEFORE_THREAD_SYNC", 0, "before">;
2621+ def Tcgen05FenceAfter : I32EnumAttrCase<"AFTER_THREAD_SYNC", 1, "after">;
2622+ def Tcgen05FenceKind : I32EnumAttr<"Tcgen05FenceKind", "NVVM Tcgen05 fence kind",
2623+ [Tcgen05FenceBefore, Tcgen05FenceAfter]> {
2624+ let genSpecializedAttr = 0;
2625+ let cppNamespace = "::mlir::NVVM";
2626+ }
2627+ def Tcgen05FenceKindAttr :
2628+ EnumAttr<NVVM_Dialect, Tcgen05FenceKind, "tcgen05_fence"> {
2629+ let assemblyFormat = "`<` $value `>`";
2630+ }
2631+
2632+ def Tcgen05WaitLoad : I32EnumAttrCase<"LOAD", 0, "load">;
2633+ def Tcgen05WaitStore : I32EnumAttrCase<"STORE", 1, "store">;
2634+ def Tcgen05WaitKind : I32EnumAttr<"Tcgen05WaitKind", "NVVM Tcgen05 wait kind",
2635+ [Tcgen05WaitLoad, Tcgen05WaitStore]> {
2636+ let genSpecializedAttr = 0;
2637+ let cppNamespace = "::mlir::NVVM";
2638+ }
2639+ def Tcgen05WaitKindAttr :
2640+ EnumAttr<NVVM_Dialect, Tcgen05WaitKind, "tcgen05_wait"> {
2641+ let assemblyFormat = "`<` $value `>`";
2642+ }
2643+
26202644def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> {
26212645 let summary = "Tcgen05 alloc operation";
26222646 let description = [{
@@ -2701,6 +2725,91 @@ def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_perm
27012725 }];
27022726}
27032727
2728+ def NVVM_Tcgen05FenceOp : NVVM_Op<"tcgen05.fence"> {
2729+ let summary = "Tcgen05 fence operations";
2730+ let description = [{
2731+ The `tcgen05.fence<before>` orders all prior async tcgen05 operations
2732+ with respect to the subsequent tcgen05 and execution ordering operations.
2733+ The `tcgen05.fence<after>` orders all subsequent async tcgen05 operations
2734+ with respect to the prior tcgen05 and execution ordering operations.
2735+
2736+ [For more information refer to the PTX ISA]
2737+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence)
2738+ }];
2739+
2740+ let arguments = (ins Tcgen05FenceKindAttr:$kind);
2741+ let assemblyFormat = "$kind attr-dict";
2742+
2743+ string llvmBuilder = [{
2744+ auto id = ($kind == NVVM::Tcgen05FenceKind::BEFORE_THREAD_SYNC)
2745+ ? llvm::Intrinsic::nvvm_tcgen05_fence_before_thread_sync
2746+ : llvm::Intrinsic::nvvm_tcgen05_fence_after_thread_sync;
2747+ createIntrinsicCall(builder, id);
2748+ }];
2749+ }
2750+
2751+ def NVVM_Tcgen05WaitOp : NVVM_Op<"tcgen05.wait"> {
2752+ let summary = "Tcgen05 wait operations";
2753+ let description = [{
2754+ The `tcgen05.wait<load>` causes the executing thread to block until
2755+ all prior `tcgen05.ld` operations issued by the executing thread
2756+ have completed. Similarly, the `tcgen05.wait<store>` causes the executing
2757+ thread to block until all prior `tcgen05.st` operations issued by the
2758+ executing thread have completed.
2759+ [For more information refer PTX ISA]
2760+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-wait)
2761+ }];
2762+
2763+ let arguments = (ins Tcgen05WaitKindAttr:$kind);
2764+ let assemblyFormat = "$kind attr-dict";
2765+
2766+ string llvmBuilder = [{
2767+ auto id = ($kind == NVVM::Tcgen05WaitKind::LOAD)
2768+ ? llvm::Intrinsic::nvvm_tcgen05_wait_ld
2769+ : llvm::Intrinsic::nvvm_tcgen05_wait_st;
2770+ createIntrinsicCall(builder, id);
2771+ }];
2772+ }
2773+
2774+ def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit"> {
2775+ let summary = "Tcgen05 commit operations";
2776+ let description = [{
2777+ The `tcgen05.commit` makes the mbarrier object, specified by
2778+ the operand `addr`, track the completion of all the prior
2779+ async-tcgen05 operations initiated by the executing thread.
2780+ The multicast variants allow signaling on the mbarrier objects
2781+ of multiple CTAs within the cluster. Operand `multicastMask`,
2782+ when present, specifies the destination CTAs in the cluster such
2783+ that each bit position in the 16-bit `multicastMask` operand
2784+ corresponds to the `nvvm.read.ptx.sreg.ctaid` of the destination CTA.
2785+ [For more information refer PTX ISA]
2786+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen-async-sync-operations-commit)
2787+ }];
2788+
2789+ let arguments = (ins
2790+ AnyTypeOf<[LLVM_AnyPointer, LLVM_PointerShared]>:$addr,
2791+ Optional<I16>:$multicastMask,
2792+ DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
2793+
2794+ let assemblyFormat = [{
2795+ $addr (`,` `multicast_mask` `=` $multicastMask^)?
2796+ attr-dict `:` type(operands)
2797+ }];
2798+
2799+ let extraClassDeclaration = [{
2800+ static llvm::Intrinsic::ID
2801+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
2802+ llvm::SmallVector<llvm::Value *> &args);
2803+ }];
2804+
2805+ string llvmBuilder = [{
2806+ llvm::SmallVector<llvm::Value *> args;
2807+ auto id = NVVM::Tcgen05CommitOp::getIntrinsicIDAndArgs(
2808+ *op, moduleTranslation, args);
2809+ createIntrinsicCall(builder, id, args);
2810+ }];
2811+ }
2812+
27042813//===----------------------------------------------------------------------===//
27052814// NVVM target attribute.
27062815//===----------------------------------------------------------------------===//
0 commit comments