From 133109de1248ddf934fd46409dece99d78ef59d5 Mon Sep 17 00:00:00 2001 From: Keren Zhou Date: Thu, 21 Nov 2024 13:45:16 -0800 Subject: [PATCH 01/12] [DRAFT] Completely remove `MemDesc` from the Triton dialect (#5208) After this PR, `MemDesc` will be a type only in the TritonGPU dialect, as will the `TensorOrMemDesc` interface. --- .../TritonGPUToLLVM/TypeConverter.h | 5 +- .../Conversion/TritonGPUToLLVM/Utility.h | 10 +- .../triton/Dialect/Triton/IR/CMakeLists.txt | 4 - include/triton/Dialect/Triton/IR/Traits.h | 6 +- include/triton/Dialect/Triton/IR/TritonOps.td | 1 - .../triton/Dialect/Triton/IR/TritonTypes.td | 48 --- include/triton/Dialect/Triton/IR/Types.h | 2 - .../triton/Dialect/TritonGPU/IR/Attributes.h | 2 +- .../Dialect/TritonGPU/IR/CMakeLists.txt | 13 +- include/triton/Dialect/TritonGPU/IR/Dialect.h | 2 +- .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 2 +- .../TritonGPU/IR/TritonGPUInterfaces.h | 5 +- .../Dialect/TritonGPU/IR/TritonGPUOps.td | 29 +- .../IR/TritonGPUTypeInterfaces.td} | 11 +- .../Dialect/TritonGPU/IR/TritonGPUTypes.td | 55 ++- include/triton/Dialect/TritonGPU/IR/Types.h | 3 + .../TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td | 25 +- lib/Analysis/Alias.cpp | 4 +- lib/Analysis/CMakeLists.txt | 2 + lib/Analysis/Utility.cpp | 3 +- .../SharedToDotOperandFMA.cpp | 1 + .../TritonGPUToLLVM/TypeConverter.cpp | 1 + lib/Conversion/TritonGPUToLLVM/Utility.cpp | 16 +- .../TritonGPUToLLVM/ViewOpToLLVM.cpp | 3 +- lib/Dialect/Triton/IR/Dialect.cpp | 42 -- lib/Dialect/Triton/IR/Types.cpp | 55 --- lib/Dialect/TritonGPU/IR/CMakeLists.txt | 1 + lib/Dialect/TritonGPU/IR/Dialect.cpp | 51 ++- lib/Dialect/TritonGPU/IR/Types.cpp | 55 +++ .../Pipeliner/MatmulLoopPipeline.cpp | 64 +-- .../Pipeliner/PipeliningUtility.cpp | 6 +- .../Pipeliner/TMAStoresPipeline.cpp | 4 +- lib/Dialect/TritonGPU/Transforms/Prefetch.cpp | 6 +- .../Transforms/ReduceDataDuplication.cpp | 2 +- lib/Dialect/TritonGPU/Transforms/Utility.cpp | 7 +- lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp | 20 +- python/test/unit/language/test_core.py | 8 +- test/Analysis/test-alias.mlir | 86 ++-- test/Analysis/test-allocation.mlir | 302 +++++++-------- test/Analysis/test-membar.mlir | 366 +++++++++--------- test/Conversion/amd/compute-base-ptr.mlir | 4 +- .../decompose-unsupported-conversions.mlir | 4 +- test/Conversion/amd/tritongpu_to_llvm.mlir | 10 +- .../amd/tritongpu_wmma_dot_to_llvm.mlir | 18 +- test/Conversion/tritongpu_to_llvm.mlir | 100 ++--- test/Conversion/tritongpu_to_llvm_hopper.mlir | 36 +- test/Conversion/tritonnvidiagpu_to_llvm.mlir | 16 +- test/Triton/invalid.mlir | 6 +- test/TritonGPU/accumulator-init.mlir | 70 ++-- .../amd/amd-reorder-instructions.mlir | 136 +++---- test/TritonGPU/amd/amd-sched-2nd-load.mlir | 58 +-- test/TritonGPU/amd/optimize-lds-usage.mlir | 28 +- test/TritonGPU/canonicalize.mlir | 34 +- test/TritonGPU/combine.mlir | 12 +- test/TritonGPU/dot-operands.mlir | 32 +- test/TritonGPU/fence-inserstion.mlir | 12 +- test/TritonGPU/invalid.mlir | 20 +- test/TritonGPU/loop-pipeline-cuda.mlir | 24 +- test/TritonGPU/loop-pipeline-hip.mlir | 18 +- test/TritonGPU/loop-pipeline-hopper.mlir | 80 ++-- test/TritonGPU/loop-pipeline.mlir | 14 +- .../pipeline-hopper-remove-wait.mlir | 10 +- test/TritonGPU/prefetch.mlir | 48 +-- test/TritonGPU/reduce-data-duplication.mlir | 2 +- test/TritonGPU/reorder-instructions.mlir | 46 +-- test/TritonGPU/tritongpu_ops.mlir | 4 +- test/TritonNvidiaGPU/membar.mlir | 26 +- .../ConvertLayoutOpToLLVM.cpp | 1 + .../SharedToDotOperandMFMA.cpp | 2 +- .../SharedToDotOperandWMMA.cpp | 2 +- 
.../StreamPipelineV2.cpp | 17 +- .../SharedToDotOperandMMAv2OrV3.cpp | 1 + .../DecomposeUnsupportedConversions.cpp | 2 +- .../DotOpToLLVM/WGMMA.cpp | 9 +- 74 files changed, 1130 insertions(+), 1100 deletions(-) rename include/triton/Dialect/{Triton/IR/TritonTypeInterfaces.td => TritonGPU/IR/TritonGPUTypeInterfaces.td} (75%) diff --git a/include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h b/include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h index 5ae547c392..60c0ed7b61 100644 --- a/include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h +++ b/include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h @@ -18,11 +18,12 @@ class TritonGPUToLLVMTypeConverter : public LLVMTypeConverter { const TargetInfoBase &targetInfo, const DataLayoutAnalysis *analysis = nullptr); - Type getElementTypeForStruct(TensorOrMemDesc type); + Type getElementTypeForStruct(triton::gpu::TensorOrMemDesc type); Type convertTritonPointerType(triton::PointerType type); Type convertTritonTensorType(RankedTensorType type, const TargetInfoBase &targetInfo); - Type convertMemDescType(MemDescType type, const TargetInfoBase &targetInfo); + Type convertMemDescType(triton::gpu::MemDescType type, + const TargetInfoBase &targetInfo); Type convertAsyncToken(triton::gpu::AsyncTokenType type); }; diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h index 9061e24fbe..ba24461a1f 100644 --- a/include/triton/Conversion/TritonGPUToLLVM/Utility.h +++ b/include/triton/Conversion/TritonGPUToLLVM/Utility.h @@ -14,6 +14,7 @@ #include "triton/Dialect/Triton/IR/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" +#include "triton/Dialect/TritonGPU/IR/Types.h" #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" #include "triton/Tools/LinearLayout.h" #include "triton/Tools/StrUtil.h" @@ -1141,8 +1142,8 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target, // // Returns true on success. 
[[nodiscard]] bool emitTransferBetweenRegistersAndShared( - RankedTensorType registerTy, MemDescType sharedTy, Type elemLlvmTy, - std::optional maxVecElems, Value shmemBase, + RankedTensorType registerTy, triton::gpu::MemDescType sharedTy, + Type elemLlvmTy, std::optional maxVecElems, Value shmemBase, ArrayRef shmemStrides, Location loc, RewriterBase &rewriter, const TargetInfoBase &target, std::function perVectorCallback); @@ -1310,13 +1311,14 @@ inline DenseMap getSwizzledSharedPtrs( } SmallVector loadSharedToDistributed(RankedTensorType dstTy, - MemDescType srcTy, Type elemLlvmTy, + triton::gpu::MemDescType srcTy, + Type elemLlvmTy, SharedMemoryObject smemObj, Location loc, RewriterBase &rewriter, const TargetInfoBase &target); void storeDistributedToShared( - MemDescType dstTy, RankedTensorType srcTy, Type elemLlvmTy, + triton::gpu::MemDescType dstTy, RankedTensorType srcTy, Type elemLlvmTy, ArrayRef srcVals, Value smemBase, ArrayRef dstStrides, Location loc, RewriterBase &rewriter, const TargetInfoBase &target, std::pair *const llvmOpCount = nullptr); diff --git a/include/triton/Dialect/Triton/IR/CMakeLists.txt b/include/triton/Dialect/Triton/IR/CMakeLists.txt index 8139ebf1ae..fecd5adf62 100644 --- a/include/triton/Dialect/Triton/IR/CMakeLists.txt +++ b/include/triton/Dialect/Triton/IR/CMakeLists.txt @@ -20,10 +20,6 @@ set(LLVM_TARGET_DEFINITIONS TritonInterfaces.td) mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls) mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs) -set(LLVM_TARGET_DEFINITIONS TritonTypeInterfaces.td) -mlir_tablegen(TypeInterfaces.h.inc -gen-type-interface-decls) -mlir_tablegen(TypeInterfaces.cpp.inc -gen-type-interface-defs) - set(LLVM_TARGET_DEFINITIONS TritonOpInterfaces.td) mlir_tablegen(OpInterfaces.h.inc -gen-op-interface-decls) mlir_tablegen(OpInterfaces.cpp.inc -gen-op-interface-defs) diff --git a/include/triton/Dialect/Triton/IR/Traits.h b/include/triton/Dialect/Triton/IR/Traits.h index 7f0e5109e6..804b1648e9 100644 --- a/include/triton/Dialect/Triton/IR/Traits.h +++ b/include/triton/Dialect/Triton/IR/Traits.h @@ -69,9 +69,9 @@ class DotLike : public TraitBase { static LogicalResult verifyTrait(Operation *op) { if (op->getNumOperands() < 3) return op->emitOpError("expected at least 3 operands"); - auto aTy = cast(op->getOperand(0).getType()); - auto bTy = cast(op->getOperand(1).getType()); - auto cTy = cast(op->getOperand(2).getType()); + auto aTy = cast(op->getOperand(0).getType()); + auto bTy = cast(op->getOperand(1).getType()); + auto cTy = cast(op->getOperand(2).getType()); auto aShape = aTy.getShape(); auto bShape = bTy.getShape(); auto cShape = cTy.getShape(); diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td index cdf9b0ea77..197b9df7cf 100644 --- a/include/triton/Dialect/Triton/IR/TritonOps.td +++ b/include/triton/Dialect/Triton/IR/TritonOps.td @@ -13,7 +13,6 @@ include "mlir/Interfaces/SideEffectInterfaces.td" // Pure include "mlir/Interfaces/ControlFlowInterfaces.td" // BranchOpInterface include "mlir/Interfaces/InferTypeOpInterface.td" // SameOperandsAndResultType include "mlir/Interfaces/CallInterfaces.td" // CallOpInterface -include "triton/Dialect/Triton/IR/TritonTypeInterfaces.td" include "triton/Dialect/Triton/IR/TritonOpInterfaces.td" diff --git a/include/triton/Dialect/Triton/IR/TritonTypes.td b/include/triton/Dialect/Triton/IR/TritonTypes.td index 98f8e570a9..a70b97dbc8 100644 --- a/include/triton/Dialect/Triton/IR/TritonTypes.td +++ 
b/include/triton/Dialect/Triton/IR/TritonTypes.td @@ -92,54 +92,6 @@ def TT_TensorPtr : TT_PtrOf<[TT_Tensor]>; // Any Type in Triton IR def TT_Type : AnyTypeOf<[TT_FloatLike, TT_IntLike, TT_PtrLike, TT_TensorPtr]>; -// Memory descriptor type. -def TT_MemDescType : TritonTypeDef<"MemDesc", "memdesc", [ShapedTypeInterface]> { - let summary = "memory descriptor type (`::mlir::triton::MemDescType`) in Triton IR type system"; - - let description = [{ - Memory descriptor contains a base pointer (scalar) and a descriptor of the memory. - If mutable memory is false that means the memory is constant and can only be allocated and stored once. - A constant memory allocation is different than a tensor as it can have multiple views and the descriptor - can be changed without changing the underlying memory. - }]; - - let parameters = (ins - ArrayRefParameter<"int64_t">:$shape, - "Type":$elementType, - "Attribute":$encoding, - "Attribute":$memorySpace, - "bool":$mutable_memory - ); - let extraClassDeclaration = [{ - MemDescType cloneWith(std::optional> shape, - Type elementType) const { - return MemDescType::get(shape.value_or(getShape()), elementType, getEncoding(), getMemorySpace(), getMutableMemory()); - } - - bool hasRank() const { return true; } - }]; - let builders = [ - TypeBuilderWithInferredContext<(ins - "llvm::ArrayRef":$shape, - "Type":$elementType, - "Attribute":$encoding, - "Attribute":$memorySpace - ), [{ - return $_get(elementType.getContext(), shape, elementType, encoding, memorySpace, /*mutableMemory=*/false); - }]>, - TypeBuilderWithInferredContext<(ins - "llvm::ArrayRef":$shape, - "Type":$elementType, - "Attribute":$encoding, - "Attribute":$memorySpace, - "bool":$mutableMemory - ), [{ - return $_get(elementType.getContext(), shape, elementType, encoding, memorySpace, mutableMemory); - }]> - ]; - let hasCustomAssemblyFormat = 1; -} - // Result type of ExperimentalMakeTensorDescriptor def TT_TensorDescType : TritonTypeDef<"TensorDesc", "tensordesc", []> { let summary = "Tensor descriptor type (`::mlir::triton::TensorDescType`) in Triton IR type system"; diff --git a/include/triton/Dialect/Triton/IR/Types.h b/include/triton/Dialect/Triton/IR/Types.h index 17d2dbc8cc..6bcac9522e 100644 --- a/include/triton/Dialect/Triton/IR/Types.h +++ b/include/triton/Dialect/Triton/IR/Types.h @@ -8,8 +8,6 @@ #define GET_TYPEDEF_CLASSES #include "triton/Dialect/Triton/IR/Types.h.inc" -#include "triton/Dialect/Triton/IR/TypeInterfaces.h.inc" - namespace mlir { namespace triton { diff --git a/include/triton/Dialect/TritonGPU/IR/Attributes.h b/include/triton/Dialect/TritonGPU/IR/Attributes.h index a99ddfc17d..1f93b3d935 100644 --- a/include/triton/Dialect/TritonGPU/IR/Attributes.h +++ b/include/triton/Dialect/TritonGPU/IR/Attributes.h @@ -5,6 +5,6 @@ #include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h" #define GET_ATTRDEF_CLASSES -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.h.inc" +#include "triton/Dialect/TritonGPU/IR/AttrDefs.h.inc" #endif // TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_ diff --git a/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt b/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt index 73c9401c18..189f6d4307 100644 --- a/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt +++ b/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt @@ -12,10 +12,15 @@ add_mlir_doc(TritonGPUOps TritonGPUOps dialects/ -gen-op-doc) add_public_tablegen_target(TritonGPUTableGen) set(LLVM_TARGET_DEFINITIONS TritonGPUAttrDefs.td) -mlir_tablegen(TritonGPUAttrInterfaces.h.inc 
-gen-attr-interface-decls) -mlir_tablegen(TritonGPUAttrInterfaces.cpp.inc -gen-attr-interface-defs) -mlir_tablegen(TritonGPUAttrDefs.h.inc -gen-attrdef-decls) -mlir_tablegen(TritonGPUAttrDefs.cpp.inc -gen-attrdef-defs) +mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls) +mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs) +mlir_tablegen(AttrDefs.h.inc -gen-attrdef-decls) +mlir_tablegen(AttrDefs.cpp.inc -gen-attrdef-defs) mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) add_public_tablegen_target(TritonGPUAttrDefsIncGen) + +set(LLVM_TARGET_DEFINITIONS TritonGPUTypeInterfaces.td) +mlir_tablegen(TypeInterfaces.h.inc -gen-type-interface-decls) +mlir_tablegen(TypeInterfaces.cpp.inc -gen-type-interface-defs) +add_public_tablegen_target(TritonGPUTypeInterfacesIncGen) diff --git a/include/triton/Dialect/TritonGPU/IR/Dialect.h b/include/triton/Dialect/TritonGPU/IR/Dialect.h index e0865e12af..b09fc29fb3 100644 --- a/include/triton/Dialect/TritonGPU/IR/Dialect.h +++ b/include/triton/Dialect/TritonGPU/IR/Dialect.h @@ -9,10 +9,10 @@ // TritonGPU depends on Triton #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Attributes.h" -#include "triton/Dialect/TritonGPU/IR/Dialect.h.inc" #include "triton/Dialect/TritonGPU/IR/Types.h" #define GET_OP_CLASSES +#include "triton/Dialect/TritonGPU/IR/Dialect.h.inc" #include "triton/Dialect/TritonGPU/IR/Ops.h.inc" namespace mlir { diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index 26ff9f7e3a..9af550aae9 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -2,8 +2,8 @@ #define TRITONGPU_ATTRDEFS include "mlir/IR/AttrTypeBase.td" -include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" include "triton/Dialect/Triton/IR/TritonInterfaces.td" +include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" //===----------------------------------------------------------------------===// // TritonGPU Attribute Definitions diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h b/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h index 9cf2876d2c..1e76237dac 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h @@ -1,6 +1,9 @@ #ifndef TRITON_GPU_DIALECT_INTERFACES_H #define TRITON_GPU_DIALECT_INTERFACES_H + +// clang-format off #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrInterfaces.h.inc" +#include "triton/Dialect/TritonGPU/IR/AttrInterfaces.h.inc" +// clang-format on #endif // TRITON_GPU_DIALECT_INTERFACES_H diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td index 77cb2c8bf0..b747fddde6 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td @@ -3,6 +3,7 @@ include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" include "triton/Dialect/TritonGPU/IR/TritonGPUTypes.td" +include "triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td" include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td" include "mlir/Dialect/Arith/IR/ArithBase.td" include "triton/Dialect/Triton/IR/TritonTypes.td" @@ -95,7 +96,7 @@ def TTG_AsyncCopyGlobalToLocalOp : TTG_Op<"async_copy_global_to_local", [ let arguments = ( ins 
TT_PtrTensor:$src, - TT_MemDescType:$result, + TTG_MemDescType:$result, Optional:$mask, Optional:$other, DefaultValuedAttr:$cache, @@ -168,7 +169,7 @@ def TTG_LocalAllocOp : TTG_Op<"local_alloc", [DeclareOpInterfaceMethods" is printed as "". + // Use qualified() otherwise "!triton_gpu.memdesc" is printed as "". let assemblyFormat = [{$src attr-dict `:` qualified(type($src))}]; } @@ -212,12 +213,12 @@ def TTG_MemDescSubviewOp : TTG_Op<"memdesc_subview", [Pure]> { Then in Python syntax, the subview covers input[1][0:4][4:8]. }]; let arguments = ( - ins TT_MemDescType:$src, Variadic:$offsets); + ins TTG_MemDescType:$src, Variadic:$offsets); - // Use qualified() otherwise "!tt.memdesc" is printed as "". + // Use qualified() otherwise "!triton_gpu.memdesc" is printed as "". let assemblyFormat = [{$src `[` $offsets `]` attr-dict `:` qualified(type($src)) `->` qualified(type($result))}]; - let results = (outs TT_MemDescType:$result); + let results = (outs TTG_MemDescType:$result); let hasVerifier = 1; } @@ -233,14 +234,14 @@ def TTG_MemDescTransOp : TTG_Op<"memdesc_trans", [Pure, representing a transposed view of the buffer. }]; - let arguments = (ins TT_MemDescType:$src, Variadic:$order); + let arguments = (ins TTG_MemDescType:$src, Variadic:$order); let arguments = ( - ins TT_MemDescType:$src, + ins TTG_MemDescType:$src, DenseI32ArrayAttr:$order ); - let results = (outs TT_MemDescType:$result); + let results = (outs TTG_MemDescType:$result); let assemblyFormat = "$src attr-dict `:` qualified(type($src)) `->` qualified(type($result))"; @@ -253,7 +254,7 @@ def TTG_LocalLoadOp : TTG_Op<"local_load", [DeclareOpInterfaceMethods :$token); + let arguments = (ins TTG_MemDescType:$src, Optional :$token); let builders = [ OpBuilder<(ins "Type":$retType, "Value":$src), @@ -261,7 +262,7 @@ def TTG_LocalLoadOp : TTG_Op<"local_load", [DeclareOpInterfaceMethods(nullptr)); }]>]; - // Use qualified() otherwise "!tt.memdesc" is printed as "". + // Use qualified() otherwise "!triton_gpu.memdesc" is printed as "". let assemblyFormat = [{$src (`token` $token^)? attr-dict `:` qualified(type($src)) `->` type($result)}]; let results = (outs TT_Tensor:$result); @@ -273,10 +274,10 @@ def TTG_LocalStoreOp : TTG_Op<"local_store", [DeclareOpInterfaceMethods" is printed as "". + // Use qualified() otherwise "!triton_gpu.memdesc" is printed as "". let assemblyFormat = [{ $src `,` $dst attr-dict `:` type($src) `->` qualified(type($dst)) }]; diff --git a/include/triton/Dialect/Triton/IR/TritonTypeInterfaces.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td similarity index 75% rename from include/triton/Dialect/Triton/IR/TritonTypeInterfaces.td rename to include/triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td index e3aed22627..a0415b62c6 100644 --- a/include/triton/Dialect/Triton/IR/TritonTypeInterfaces.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td @@ -1,11 +1,11 @@ -#ifndef TRITON_TYPE_INTERFACES -#define TRITON_TYPE_INTERFACES +#ifndef TRITON_GPU_TYPE_INTERFACES +#define TRITON_GPU_TYPE_INTERFACES include "mlir/IR/OpBase.td" // Interface dynamically attached to RankedTensorType and MemDescType. 
-def TT_TensorOrMemDesc : TypeInterface<"TensorOrMemDesc"> { - let cppNamespace = "::mlir"; +def TTG_TensorOrMemDesc : TypeInterface<"TensorOrMemDesc"> { + let cppNamespace = "::mlir::triton::gpu"; let methods = [ InterfaceMethod<"Returns the encoding of the tensor or memory descriptor", "mlir::Attribute", "getEncoding", (ins)>, @@ -17,8 +17,7 @@ def TT_TensorOrMemDesc : TypeInterface<"TensorOrMemDesc"> { "int64_t", "getRank", (ins)>, InterfaceMethod<"Returns the element type bit width", "int64_t", "getElementTypeBitWidth", (ins)>, - ]; } -#endif // TRITON_TYPE_INTERFACES +#endif // TRITON_GPU_TYPE_INTERFACES diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td index 6765ac40cb..766d5a9bd7 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td @@ -1,8 +1,9 @@ #ifndef TRITONGPU_TYPES #define TRITONGPU_TYPES -include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" include "mlir/IR/AttrTypeBase.td" +include "mlir/IR/BuiltinTypeInterfaces.td" +include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" class TTG_TypeDef traits = []> : TypeDef { @@ -23,8 +24,7 @@ def TTG_TokenType : TTG_TypeDef<"Token", "token"> { let skipDefaultBuilders = 1; } -def TTG_AsyncToken : TTG_TypeDef<"AsyncToken", - "async.token", []> { +def TTG_AsyncToken : TTG_TypeDef<"AsyncToken", "async.token", []> { let summary = "async token type"; let description = [{ `ttg.async.token` is a type returned by an asynchronous operation. @@ -33,4 +33,53 @@ def TTG_AsyncToken : TTG_TypeDef<"AsyncToken", }]; } +// Memory descriptor type. +def TTG_MemDescType : TTG_TypeDef<"MemDesc", "memdesc", [ShapedTypeInterface]> { + let summary = "memory descriptor type (`::mlir::triton::gpu::MemDescType`) in Triton IR type system"; + + let description = [{ + Memory descriptor contains a base pointer (scalar) and a descriptor of the memory. + If mutable memory is false that means the memory is constant and can only be allocated and stored once. + A constant memory allocation is different than a tensor as it can have multiple views and the descriptor + can be changed without changing the underlying memory. 
+ }]; + + let parameters = (ins + ArrayRefParameter<"int64_t">:$shape, + "Type":$elementType, + "Attribute":$encoding, + "Attribute":$memorySpace, + "bool":$mutable_memory + ); + let extraClassDeclaration = [{ + MemDescType cloneWith(std::optional> shape, + Type elementType) const { + return MemDescType::get(shape.value_or(getShape()), elementType, getEncoding(), getMemorySpace(), getMutableMemory()); + } + + bool hasRank() const { return true; } + }]; + let builders = [ + TypeBuilderWithInferredContext<(ins + "llvm::ArrayRef":$shape, + "Type":$elementType, + "Attribute":$encoding, + "Attribute":$memorySpace + ), [{ + return $_get(elementType.getContext(), shape, elementType, encoding, memorySpace, /*mutableMemory=*/false); + }]>, + TypeBuilderWithInferredContext<(ins + "llvm::ArrayRef":$shape, + "Type":$elementType, + "Attribute":$encoding, + "Attribute":$memorySpace, + "bool":$mutableMemory + ), [{ + return $_get(elementType.getContext(), shape, elementType, encoding, memorySpace, mutableMemory); + }]> + ]; + let hasCustomAssemblyFormat = 1; +} + + #endif diff --git a/include/triton/Dialect/TritonGPU/IR/Types.h b/include/triton/Dialect/TritonGPU/IR/Types.h index edf37fef60..82ab3ae457 100644 --- a/include/triton/Dialect/TritonGPU/IR/Types.h +++ b/include/triton/Dialect/TritonGPU/IR/Types.h @@ -1,10 +1,13 @@ #ifndef TRITONGPU_IR_TYPES_H_ #define TRITONGPU_IR_TYPES_H_ +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/TypeSupport.h" #include "mlir/IR/Types.h" #define GET_TYPEDEF_CLASSES #include "triton/Dialect/TritonGPU/IR/Types.h.inc" +#include "triton/Dialect/TritonGPU/IR/TypeInterfaces.h.inc" + #endif // TRITON_IR_TYPES_H_ diff --git a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td index e257e8fead..f363032a37 100644 --- a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td +++ b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td @@ -28,7 +28,8 @@ include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.td" include "mlir/Dialect/Arith/IR/ArithBase.td" include "triton/Dialect/Triton/IR/TritonTypes.td" include "triton/Dialect/Triton/IR/TritonAttrDefs.td" -include "triton/Dialect/Triton/IR/TritonTypeInterfaces.td" +include "triton/Dialect/TritonGPU/IR/TritonGPUTypes.td" +include "triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td" include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" // Pure include "mlir/Interfaces/InferTypeOpInterface.td" // SameOperandsAndResultType @@ -80,8 +81,8 @@ def TTNG_WarpGroupDotOp : TTNG_Op<"warp_group_dot", [DeclareOpInterfaceMethods:$useC, DefaultValuedAttr:$inputPrecision, @@ -100,8 +101,8 @@ def TTNG_WarpGroupDotOp : TTNG_Op<"warp_group_dot", [DeclareOpInterfaceMethods, AllTypesMatch<["inputs", "outputs"]>]> { let summary = "warp group dot wait"; - let arguments = (ins Variadic:$inputs, I32Attr:$pendings); - let results = (outs Variadic:$outputs); + let arguments = (ins Variadic:$inputs, I32Attr:$pendings); + let results = (outs Variadic:$outputs); let description = [{ Waits until there are $pendings or fewer outstanding async dot operations. 
@@ -125,7 +126,7 @@ def TTNG_InitBarrierOp : TTNG_Op<"init_barrier", [DeclareOpInterfaceMethods:$coord, - TT_MemDescType:$barrier, - TT_MemDescType:$result, + TTG_MemDescType:$barrier, + TTG_MemDescType:$result, I1:$pred, DefaultValuedAttr:$cache, DefaultValuedAttr:$evict, @@ -250,7 +251,7 @@ def TTNG_AsyncTMACopyLocalToGlobalOp : TTNG_Op<"async_tma_copy_local_to_global", let arguments = ( ins TT_PtrType:$desc_ptr, Variadic:$coord, - TT_MemDescType:$src); + TTG_MemDescType:$src); let assemblyFormat = [{ $desc_ptr `[` $coord `]` $src diff --git a/lib/Analysis/Alias.cpp b/lib/Analysis/Alias.cpp index 3a141a73fa..020f513bac 100644 --- a/lib/Analysis/Alias.cpp +++ b/lib/Analysis/Alias.cpp @@ -28,7 +28,7 @@ LogicalResult SharedMemoryAliasAnalysis::visitOperation( bool pessimistic = true; auto result = op->getResult(0); // skip ops that return memdesc in a different memory space. - if (auto memdescTy = dyn_cast(result.getType())) { + if (auto memdescTy = dyn_cast(result.getType())) { if (!isa_and_nonnull( memdescTy.getMemorySpace())) return success(); @@ -43,7 +43,7 @@ LogicalResult SharedMemoryAliasAnalysis::visitOperation( aliasInfo = AliasInfo(operands[0]->getValue()); pessimistic = false; } else { - assert(!isa(result.getType()) && + assert(!isa(result.getType()) && "unknown operation creating memory descriptor"); } diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index a84f0649b6..693d222f2f 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -7,7 +7,9 @@ add_triton_library(TritonAnalysis DEPENDS TritonTableGen + TritonGPUTableGen TritonGPUAttrDefsIncGen + TritonGPUTypeInterfacesIncGen LINK_LIBS PUBLIC MLIRAnalysis diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index ac72b4f26c..6166e10199 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -533,7 +533,8 @@ bool supportMMA(Value value, int version) { // types of both the operands are identical here. assert((version == 1 || version == 2 || version == 3) && "Unexpected MMA layout version found"); - auto elemTy = cast(value.getType()).getElementType(); + auto elemTy = + cast(value.getType()).getElementType(); // FP8 is not natively supported on all mma versions but it can always be // promoted to fp16 therefore we can always support it. 
bool isFP8 = elemTy.isFloat8E5M2() || elemTy.isFloat8E4M3FN() || diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp index be2e6f584f..4914fd712b 100644 --- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp @@ -12,6 +12,7 @@ using ::mlir::triton::gpu::getOrder; using ::mlir::triton::gpu::getShapePerCTA; using ::mlir::triton::gpu::getSizePerThread; using ::mlir::triton::gpu::getTotalElemsPerThread; +using ::mlir::triton::gpu::MemDescType; using ::mlir::triton::gpu::SharedEncodingAttr; SmallVector diff --git a/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp b/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp index fee10296c8..1ed1f40635 100644 --- a/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp +++ b/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp @@ -11,6 +11,7 @@ using namespace mlir::triton; using ::mlir::triton::gpu::BlockedEncodingAttr; using ::mlir::triton::gpu::DotOperandEncodingAttr; using ::mlir::triton::gpu::getTotalElemsPerThread; +using ::mlir::triton::gpu::MemDescType; using ::mlir::triton::gpu::NvidiaMmaEncodingAttr; using ::mlir::triton::gpu::SharedEncodingAttr; using ::mlir::triton::gpu::SliceEncodingAttr; diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 6597ec8a30..c681cd344c 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -159,8 +159,8 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target, } bool emitTransferBetweenRegistersAndShared( - RankedTensorType registerTy, MemDescType sharedTy, Type elemLlvmTy, - std::optional maxVecElems, Value shmemBase, + RankedTensorType registerTy, triton::gpu::MemDescType sharedTy, + Type elemLlvmTy, std::optional maxVecElems, Value shmemBase, ArrayRef shmemStrides, Location loc, RewriterBase &rewriter, const TargetInfoBase &target, std::function perVectorCallback) { @@ -272,7 +272,8 @@ bool emitTransferBetweenRegistersAndShared( } SmallVector loadSharedToDistributed(RankedTensorType dstTy, - MemDescType srcTy, Type elemLlvmTy, + triton::gpu::MemDescType srcTy, + Type elemLlvmTy, SharedMemoryObject smemObj, Location loc, RewriterBase &rewriter, const TargetInfoBase &target) { @@ -295,10 +296,11 @@ SmallVector loadSharedToDistributed(RankedTensorType dstTy, return ret; } -void storeDistributedToShared(MemDescType dstTy, RankedTensorType srcTy, - Type elemLlvmTy, ArrayRef srcVals, - Value smemBase, ArrayRef dstStrides, - Location loc, RewriterBase &rewriter, +void storeDistributedToShared(triton::gpu::MemDescType dstTy, + RankedTensorType srcTy, Type elemLlvmTy, + ArrayRef srcVals, Value smemBase, + ArrayRef dstStrides, Location loc, + RewriterBase &rewriter, const TargetInfoBase &target, std::pair *const llvmOpCount) { bool success = emitTransferBetweenRegistersAndShared( diff --git a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp index aa8840433e..ea05490c7a 100644 --- a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp @@ -1,7 +1,8 @@ #include "mlir/Support/LLVM.h" #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" #include "triton/Conversion/TritonGPUToLLVM/Utility.h" -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.cpp.inc" 
+#include "triton/Dialect/TritonGPU/IR/Attributes.h" +#include "triton/Dialect/TritonGPU/IR/Types.h" using namespace mlir; using namespace mlir::triton; diff --git a/lib/Dialect/Triton/IR/Dialect.cpp b/lib/Dialect/Triton/IR/Dialect.cpp index f978958512..2874a3f564 100644 --- a/lib/Dialect/Triton/IR/Dialect.cpp +++ b/lib/Dialect/Triton/IR/Dialect.cpp @@ -14,7 +14,6 @@ #include "triton/Dialect/Triton/IR/AttrInterfaces.cpp.inc" #include "triton/Dialect/Triton/IR/Dialect.cpp.inc" #include "triton/Dialect/Triton/IR/OpInterfaces.cpp.inc" -#include "triton/Dialect/Triton/IR/TypeInterfaces.cpp.inc" using namespace mlir; using namespace mlir::triton; @@ -78,44 +77,6 @@ struct TritonInlinerInterface : public DialectInlinerInterface { } }; -struct TensorModel - : public TensorOrMemDesc::ExternalModel { - Type getElementType(Type pointer) const { - return cast(pointer).getElementType(); - } - Attribute getEncoding(Type pointer) const { - return cast(pointer).getEncoding(); - } - ArrayRef getShape(Type pointer) const { - return cast(pointer).getShape(); - } - int64_t getRank(Type pointer) const { - return cast(pointer).getRank(); - } - int64_t getElementTypeBitWidth(Type pointer) const { - return cast(pointer).getElementTypeBitWidth(); - } -}; - -struct MemDescModel - : public TensorOrMemDesc::ExternalModel { - Type getElementType(Type pointer) const { - return cast(pointer).getElementType(); - } - Attribute getEncoding(Type pointer) const { - return cast(pointer).getEncoding(); - } - ArrayRef getShape(Type pointer) const { - return cast(pointer).getShape(); - } - int64_t getRank(Type pointer) const { - return cast(pointer).getShape().size(); - } - int64_t getElementTypeBitWidth(Type pointer) const { - return cast(pointer).getElementType().getIntOrFloatBitWidth(); - } -}; - } // namespace void TritonDialect::initialize() { @@ -128,9 +89,6 @@ void TritonDialect::initialize() { // We can also add interface here. addInterfaces(); - - RankedTensorType::attachInterface(*getContext()); - MemDescType::attachInterface(*getContext()); } Operation *TritonDialect::materializeConstant(OpBuilder &builder, diff --git a/lib/Dialect/Triton/IR/Types.cpp b/lib/Dialect/Triton/IR/Types.cpp index 6e41e70a8e..de8925cbff 100644 --- a/lib/Dialect/Triton/IR/Types.cpp +++ b/lib/Dialect/Triton/IR/Types.cpp @@ -50,61 +50,6 @@ void PointerType::print(AsmPrinter &printer) const { } } -static constexpr llvm::StringRef kMutableMemory = "mutable"; - -Type MemDescType::parse(AsmParser &parser) { - if (parser.parseLess()) - return Type(); - - SmallVector dimensions; - if (parser.parseDimensionList(dimensions, /*allowDynamic=*/false)) - return Type(); - - // Parse the element type. 
- Type elementType; - if (parser.parseType(elementType)) - return Type(); - - Attribute encoding; - if (succeeded(parser.parseOptionalComma())) { - if (parser.parseAttribute(encoding)) - return Type(); - } - bool mutableMemory = false; - Attribute memorySpace; - if (succeeded(parser.parseOptionalComma())) { - if (failed(parser.parseOptionalKeyword(kMutableMemory))) { - if (parser.parseAttribute(memorySpace)) - return Type(); - } else { - mutableMemory = true; - } - } - if (mutableMemory == false && succeeded(parser.parseOptionalComma())) { - if (parser.parseOptionalKeyword(kMutableMemory)) - return Type(); - mutableMemory = true; - } - if (parser.parseGreater()) - return Type(); - return MemDescType::get(parser.getContext(), dimensions, elementType, - encoding, memorySpace, mutableMemory); -} - -void MemDescType::print(AsmPrinter &printer) const { - printer << "<"; - for (auto dim : getShape()) - printer << dim << "x"; - printer << getElementType(); - if (getEncoding()) - printer << ", " << getEncoding(); - if (getMemorySpace()) - printer << ", " << getMemorySpace(); - if (getMutableMemory()) - printer << ", " << kMutableMemory; - printer << ">"; -} - namespace mlir { namespace triton { diff --git a/lib/Dialect/TritonGPU/IR/CMakeLists.txt b/lib/Dialect/TritonGPU/IR/CMakeLists.txt index 98831f0db8..7486d72f36 100644 --- a/lib/Dialect/TritonGPU/IR/CMakeLists.txt +++ b/lib/Dialect/TritonGPU/IR/CMakeLists.txt @@ -7,6 +7,7 @@ add_triton_library(TritonGPUIR DEPENDS TritonGPUTableGen TritonGPUAttrDefsIncGen + TritonGPUTypeInterfacesIncGen LINK_LIBS PUBLIC MLIRGPUDialect diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index 5ae07c3378..3273154fc7 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -11,6 +11,7 @@ #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" +#include "triton/Dialect/TritonGPU/IR/Types.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" #include "triton/Tools/LinearLayout.h" #include "triton/Tools/StrUtil.h" @@ -19,6 +20,7 @@ // Include TableGen'erated code #include "triton/Dialect/TritonGPU/IR/Dialect.cpp.inc" +#include "triton/Dialect/TritonGPU/IR/TypeInterfaces.cpp.inc" using namespace mlir; using namespace mlir::triton; @@ -730,10 +732,10 @@ static void maybePrintCTALayout(mlir::MLIRContext *context, //===----------------------------------------------------------------------===// // Attribute methods //===----------------------------------------------------------------------===// -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrInterfaces.cpp.inc" +#include "triton/Dialect/TritonGPU/IR/AttrInterfaces.cpp.inc" #define GET_ATTRDEF_CLASSES -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.cpp.inc" +#include "triton/Dialect/TritonGPU/IR/AttrDefs.cpp.inc" SliceEncodingAttr BlockedEncodingAttr::squeeze(int axis) { return SliceEncodingAttr::get(getContext(), axis, *this); @@ -3660,12 +3662,52 @@ void mlir::triton::gpu::dumpHWLayout(RankedTensorType tensorType) { llvm::errs() << getLayoutStr(tensorType, /*useHWPointOfView=*/true); } +struct TensorModel + : public triton::gpu::TensorOrMemDesc::ExternalModel { + Type getElementType(Type pointer) const { + return cast(pointer).getElementType(); + } + Attribute getEncoding(Type pointer) const { + return cast(pointer).getEncoding(); + } + ArrayRef getShape(Type pointer) const { + return cast(pointer).getShape(); + } + 
int64_t getRank(Type pointer) const { + return cast(pointer).getRank(); + } + int64_t getElementTypeBitWidth(Type pointer) const { + return cast(pointer).getElementTypeBitWidth(); + } +}; + +struct MemDescModel + : public triton::gpu::TensorOrMemDesc::ExternalModel { + Type getElementType(Type pointer) const { + return cast(pointer).getElementType(); + } + Attribute getEncoding(Type pointer) const { + return cast(pointer).getEncoding(); + } + ArrayRef getShape(Type pointer) const { + return cast(pointer).getShape(); + } + int64_t getRank(Type pointer) const { + return cast(pointer).getShape().size(); + } + int64_t getElementTypeBitWidth(Type pointer) const { + return cast(pointer).getElementType().getIntOrFloatBitWidth(); + } +}; + void TritonGPUDialect::initialize() { registerTypes(); addAttributes< #define GET_ATTRDEF_LIST -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.cpp.inc" +#include "triton/Dialect/TritonGPU/IR/AttrDefs.cpp.inc" >(); addOperations< #define GET_OP_LIST @@ -3674,6 +3716,9 @@ void TritonGPUDialect::initialize() { >(); addInterfaces(); addInterfaces(); + + RankedTensorType::attachInterface(*getContext()); + MemDescType::attachInterface(*getContext()); } // verify TritonGPU ops diff --git a/lib/Dialect/TritonGPU/IR/Types.cpp b/lib/Dialect/TritonGPU/IR/Types.cpp index 77f673cc27..fe87626203 100644 --- a/lib/Dialect/TritonGPU/IR/Types.cpp +++ b/lib/Dialect/TritonGPU/IR/Types.cpp @@ -27,6 +27,61 @@ void TokenType::print(AsmPrinter &printer) const { printer << "<" << getType() << ">"; } +static constexpr llvm::StringRef kMutableMemory = "mutable"; + +Type MemDescType::parse(AsmParser &parser) { + if (parser.parseLess()) + return Type(); + + SmallVector dimensions; + if (parser.parseDimensionList(dimensions, /*allowDynamic=*/false)) + return Type(); + + // Parse the element type. 
+ Type elementType; + if (parser.parseType(elementType)) + return Type(); + + Attribute encoding; + if (succeeded(parser.parseOptionalComma())) { + if (parser.parseAttribute(encoding)) + return Type(); + } + bool mutableMemory = false; + Attribute memorySpace; + if (succeeded(parser.parseOptionalComma())) { + if (failed(parser.parseOptionalKeyword(kMutableMemory))) { + if (parser.parseAttribute(memorySpace)) + return Type(); + } else { + mutableMemory = true; + } + } + if (mutableMemory == false && succeeded(parser.parseOptionalComma())) { + if (parser.parseOptionalKeyword(kMutableMemory)) + return Type(); + mutableMemory = true; + } + if (parser.parseGreater()) + return Type(); + return MemDescType::get(parser.getContext(), dimensions, elementType, + encoding, memorySpace, mutableMemory); +} + +void MemDescType::print(AsmPrinter &printer) const { + printer << "<"; + for (auto dim : getShape()) + printer << dim << "x"; + printer << getElementType(); + if (getEncoding()) + printer << ", " << getEncoding(); + if (getMemorySpace()) + printer << ", " << getMemorySpace(); + if (getMutableMemory()) + printer << ", " << kMutableMemory; + printer << ">"; +} + //===----------------------------------------------------------------------===// // Triton Dialect //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp index 0d7bd5bdc2..5ffddee023 100644 --- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp @@ -121,7 +121,7 @@ static int createAsyncCopy(scf::ForOp forOp, tt::LoadOp loadOp, Value alloc, Value src = loadOp.getPtr(); Value mask = loadOp.getMask(); Value other = loadOp.getOther(); - tt::MemDescType allocTy = cast(alloc.getType()); + ttg::MemDescType allocTy = cast(alloc.getType()); auto convertBlockLayout = [&](Value src, ttg::BlockedEncodingAttr enc) { auto ty = cast(src.getType()); @@ -146,7 +146,7 @@ static int createAsyncCopy(scf::ForOp forOp, tt::LoadOp loadOp, Value alloc, copyOffsets[0] = insertIdx; Attribute sharedMemorySpace = triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); - tt::MemDescType subviewTy = tt::MemDescType::get( + ttg::MemDescType subviewTy = ttg::MemDescType::get( allocTy.getShape().drop_front(), allocTy.getElementType(), allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); auto view = builder.createWithStage( @@ -168,13 +168,13 @@ static int createAsyncCopy(scf::ForOp forOp, tt::LoadOp loadOp, Value alloc, loc, stageForFirstUse, clusterForFirstUse, subviewTy, alloc, loadOffsets); if (loadIsMMAv3Shared) { auto alloc = cast((*loadOp->getUsers().begin())); - replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); + tt::replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); alloc.erase(); } else { SmallVector allocsToErase; for (Operation *user : loadOp->getUsers()) { if (auto alloc = dyn_cast(user)) { - replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); + tt::replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); allocsToErase.push_back(alloc); } } @@ -227,15 +227,15 @@ createTMAAsyncCopy(scf::ForOp &forOp, tt::ExperimentalDescriptorLoadOp loadOp, auto [stageForFirstUse, clusterForFirstUse] = tt::getStageCluster(firstUse); Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); + 
ttg::SharedMemorySpaceAttr::get(forOp.getContext()); Value zero = builder.createWithStage( forOp.getLoc(), stage, clusterId, 0, 32); builder.setInsertionPoint(loadOp); Location loc = loadOp.getLoc(); - tt::MemDescType allocTy = cast(alloc.getType()); + ttg::MemDescType allocTy = cast(alloc.getType()); SmallVector copyOffsets(allocTy.getRank(), zero); copyOffsets[0] = insertIdx; - tt::MemDescType subviewTy = tt::MemDescType::get( + ttg::MemDescType subviewTy = ttg::MemDescType::get( allocTy.getShape().drop_front(), allocTy.getElementType(), allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); auto view = builder.createWithStage( @@ -259,13 +259,13 @@ createTMAAsyncCopy(scf::ForOp &forOp, tt::ExperimentalDescriptorLoadOp loadOp, loc, stageForFirstUse, clusterForFirstUse, subviewTy, alloc, loadOffsets); if (loadIsMMAv3Shared) { auto alloc = cast((*loadOp->getUsers().begin())); - replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); + tt::replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); alloc.erase(); } else { SmallVector allocsToErase; for (Operation *user : loadOp->getUsers()) { if (auto alloc = dyn_cast(user)) { - replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); + tt::replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); allocsToErase.push_back(alloc); } } @@ -525,15 +525,15 @@ static Value createAlloc(scf::ForOp &forOp, Operation *loadOp, ttg::SharedEncodingAttr sharedEnc, unsigned distance) { OpBuilder builder(forOp); Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); + ttg::SharedMemorySpaceAttr::get(forOp.getContext()); auto ty = cast(loadOp->getResultTypes()[0]); SmallVector bufferShape(ty.getShape().begin(), ty.getShape().end()); bufferShape.insert(bufferShape.begin(), distance); - Type memdescType = mlir::triton::MemDescType::get( - bufferShape, ty.getElementType(), sharedEnc, sharedMemorySpace, - /*mutableMemory*/ true); - Value alloc = builder.create( - loadOp->getLoc(), memdescType, Value()); + Type memdescType = ttg::MemDescType::get(bufferShape, ty.getElementType(), + sharedEnc, sharedMemorySpace, + /*mutableMemory*/ true); + Value alloc = + builder.create(loadOp->getLoc(), memdescType, Value()); return alloc; } @@ -541,7 +541,7 @@ static Value createAlloc(scf::ForOp &forOp, Operation *loadOp, static Value createBarrierAlloc(scf::ForOp &forOp, unsigned distance) { OpBuilder builder(forOp); Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); + ttg::SharedMemorySpaceAttr::get(forOp.getContext()); Location loc = forOp.getLoc(); auto context = forOp.getContext(); auto barrierCTALayout = @@ -549,14 +549,14 @@ static Value createBarrierAlloc(scf::ForOp &forOp, unsigned distance) { /*CTASplitNum=*/{1}, /*CTAOrder=*/{0}); auto barrierEncoding = ttg::SharedEncodingAttr::get(context, 1, 1, 1, {0}, barrierCTALayout); - Type barrierMemDescType = tt::MemDescType::get( + Type barrierMemDescType = ttg::MemDescType::get( {distance}, builder.getI64Type(), barrierEncoding, sharedMemorySpace, /*mutableMemory=*/true); Type singleBarrierMemDescType = - tt::MemDescType::get({1}, builder.getI64Type(), barrierEncoding, - sharedMemorySpace, /*mutableMemory=*/true); - Value barrierAlloc = builder.create( - loc, barrierMemDescType, Value()); + ttg::MemDescType::get({1}, builder.getI64Type(), barrierEncoding, + sharedMemorySpace, /*mutableMemory=*/true); + Value barrierAlloc = + builder.create(loc, barrierMemDescType, Value()); for (unsigned i = 0; i 
< distance; i++) { Value idx = builder.create(loc, i, 32); Value barrierView = builder.create( @@ -654,10 +654,10 @@ static void createTMABarrierAndWait( Location loc = forOp.getLoc(); OpBuilderWithStage builder(forOp); Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(builder.getContext()); - tt::MemDescType barrierTy = tt::MemDescType::get( + ttg::SharedMemorySpaceAttr::get(builder.getContext()); + ttg::MemDescType barrierTy = ttg::MemDescType::get( {1}, builder.getI64Type(), - cast(barrierAlloc.getType()).getEncoding(), + cast(barrierAlloc.getType()).getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); builder.setInsertionPoint(group[0]->loadOp); @@ -838,14 +838,14 @@ createAsyncOps(scf::ForOp &forOp, static void invalidateBarriers(OpBuilder &builder, SmallVector &barriers) { Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(builder.getContext()); + ttg::SharedMemorySpaceAttr::get(builder.getContext()); for (Value barrier : barriers) { - int numBarriers = cast(barrier.getType()).getShape()[0]; + int numBarriers = cast(barrier.getType()).getShape()[0]; for (int i = 0; i < numBarriers; i++) { Value idx = builder.create(barrier.getLoc(), i, 32); - tt::MemDescType barrierTy = tt::MemDescType::get( + ttg::MemDescType barrierTy = ttg::MemDescType::get( {1}, builder.getI64Type(), - cast(barrier.getType()).getEncoding(), + cast(barrier.getType()).getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); Value barrierView = builder.create( @@ -1092,7 +1092,7 @@ static void threadValuesThroughWait(ttng::WarpGroupDotWaitOp wait, for (ttng::WarpGroupDotOp dot : asyncDots) { for (Value operand : dot.getOperands()) { - if (isa(operand.getType())) { + if (isa(operand.getType())) { newOperands.insert(operand); } } @@ -1110,12 +1110,12 @@ static void threadValuesThroughWait(ttng::WarpGroupDotWaitOp wait, }; for (int i = 0; i < origNumOperands; i++) { Value operand = wait.getResult(i); - if (!isa(operand.getType())) + if (!isa(operand.getType())) operand.replaceAllUsesWith(newWait.getResult(i)); } for (int i = origNumOperands; i < newOperands.size(); i++) { Value operand = newWait.getOperand(i); - if (!isa(operand.getType())) + if (!isa(operand.getType())) operand.replaceUsesWithIf(newWait.getResult(i), dominatedByNewWait); } wait->erase(); @@ -1178,7 +1178,7 @@ static std::optional dotCanBeProperlyAsync(ttng::WarpGroupDotOp dotOp, // Rule 1: All shmem operands are multi-buffered. auto checkOperand = [&](Value operand) { if (!isa( - cast(operand.getType()).getEncoding())) { + cast(operand.getType()).getEncoding())) { // Rule 1a: Register operands must not be modified within the loop. // First, check for chained WGMMA as an exception. 
if (auto cvt = dyn_cast(operand.getDefiningOp())) { diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp index f90c6b7475..29e75ee535 100644 --- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp @@ -147,10 +147,10 @@ void mlir::triton::replaceUsesAndPropagateType(OpBuilder &builder, builder.setInsertionPoint(user); Value newVal; if (auto subview = dyn_cast(user)) { - triton::MemDescType oldType = subview.getType(); + triton::gpu::MemDescType oldType = subview.getType(); bool isMutable = - cast(val.getType()).getMutableMemory(); - Type newDstType = triton::MemDescType::get( + cast(val.getType()).getMutableMemory(); + Type newDstType = triton::gpu::MemDescType::get( oldType.getShape(), oldType.getElementType(), oldType.getEncoding(), oldType.getMemorySpace(), isMutable); newVal = builder.create( diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp index 1cc3df7ec3..b24ac95387 100644 --- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp @@ -42,8 +42,8 @@ static Value createAlloc(scf::ForOp &forOp, Attribute sharedMemorySpace = triton::gpu::SharedMemorySpaceAttr::get(ty.getContext()); Type memdescType = - tt::MemDescType::get(ty.getShape(), ty.getElementType(), encoding, - sharedMemorySpace, /*mutableMemory*/ true); + ttg::MemDescType::get(ty.getShape(), ty.getElementType(), encoding, + sharedMemorySpace, /*mutableMemory*/ true); Value alloc = builder.create(storeOp->getLoc(), memdescType, Value()); return alloc; diff --git a/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp b/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp index 2cbc00142b..46a55d550d 100644 --- a/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp @@ -114,7 +114,7 @@ Value Prefetcher::generatePrefetch(Value v, unsigned opIdx, bool isPrologue, std::optional offsetK, std::optional shapeK) { // opIdx: 0 => a, 1 => b - auto type = cast(v.getType()); + auto type = cast(v.getType()); SmallVector shape{type.getShape().begin(), type.getShape().end()}; SmallVector offset{0, 0}; Type elementType = type.getElementType(); @@ -136,8 +136,8 @@ Value Prefetcher::generatePrefetch(Value v, unsigned opIdx, bool isPrologue, builder.create(v.getLoc(), off, 32)); Value newSmem = builder.create( v.getLoc(), - triton::MemDescType::get(shape, elementType, type.getEncoding(), - type.getMemorySpace()), + triton::gpu::MemDescType::get(shape, elementType, type.getEncoding(), + type.getMemorySpace()), v, offsetsVal); auto dotOperandEnc = triton::gpu::DotOperandEncodingAttr::get( diff --git a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp index b1e296c1bb..af756c6d83 100644 --- a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp +++ b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp @@ -58,7 +58,7 @@ class TritonGPUReduceDataDuplicationPass } auto sharedMemorySpace = triton::gpu::SharedMemorySpaceAttr::get(srcType.getContext()); - auto tmpType = triton::MemDescType::get( + auto tmpType = triton::gpu::MemDescType::get( dstType.getShape(), dstType.getElementType(), triton::gpu::SharedEncodingAttr::get( mod.getContext(), dstDotOp, srcType.getShape(), sharedOrder, diff --git 
a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp index fa8ec2b926..b8f3abfcac 100644 --- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp @@ -944,7 +944,7 @@ getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible) { if (user->getNumResults() != 1) return std::nullopt; if (auto memDesc = - dyn_cast(user->getResult(0).getType())) { + dyn_cast(user->getResult(0).getType())) { // First time we find a shared encoding in the chain, save it and try to // use it if it is compatible with the other users. tempAttr = cast(memDesc.getEncoding()); @@ -955,10 +955,11 @@ getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible) { if (!isa(user)) return std::nullopt; auto dotOpEnc = dyn_cast( - cast(user->getResult(0).getType()).getEncoding()); + cast(user->getResult(0).getType()) + .getEncoding()); if (!dotOpEnc) return std::nullopt; - auto srcTy = cast(val.getType()); + auto srcTy = cast(val.getType()); auto CTALayout = ttg::getCTALayout(srcTy.getEncoding()); auto order = ttg::getOrder(srcTy.getEncoding()); unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth(); diff --git a/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp b/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp index 92d9b589a2..942eb5423d 100644 --- a/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp +++ b/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp @@ -42,8 +42,10 @@ mlir::LogicalResult WarpGroupDotOp::inferReturnTypes( inferredReturnTypes.push_back(accTy); // verify encodings - auto aEnc = cast(operands[0].getType()).getEncoding(); - auto bEnc = cast(operands[1].getType()).getEncoding(); + auto aEnc = + cast(operands[0].getType()).getEncoding(); + auto bEnc = + cast(operands[1].getType()).getEncoding(); auto retEnc = accTy.getEncoding(); if (aEnc) { assert(bEnc); @@ -62,10 +64,10 @@ void WarpGroupDotOp::getEffects( &effects) { auto &a = getAMutable(); auto &b = getBMutable(); - if (isa(a.get().getType())) + if (isa(a.get().getType())) effects.emplace_back(MemoryEffects::Read::get(), &a, mlir::triton::gpu::SharedMemory::get()); - if (isa(b.get().getType())) + if (isa(b.get().getType())) effects.emplace_back(MemoryEffects::Read::get(), &b, mlir::triton::gpu::SharedMemory::get()); } @@ -73,11 +75,12 @@ void WarpGroupDotOp::getEffects( bool WarpGroupDotOp::needsPartialAccumulator() { const auto &a = getA(); const auto &d = getD(); - auto aTensorTy = cast(a.getType()); - auto aElTy = cast(a.getType()).getElementType(); + auto aTensorTy = cast(a.getType()); + auto aElTy = cast(a.getType()).getElementType(); bool isFP8 = aElTy.isFloat8E5M2() || aElTy.isFloat8E4M3FN() || aElTy.isFloat8E5M2FNUZ() || aElTy.isFloat8E4M3FNUZ(); - bool accFP32 = cast(d.getType()).getElementType().isF32(); + bool accFP32 = + cast(d.getType()).getElementType().isF32(); uint32_t maxNumImpreciseAcc = getMaxNumImpreciseAcc(); return isFP8 && accFP32 && maxNumImpreciseAcc <= aTensorTy.getShape()[1]; } @@ -93,7 +96,8 @@ LogicalResult WarpGroupDotWaitOp::inferReturnTypes( return mlir::success(); } -static LogicalResult verifyBarrierType(Operation *op, MemDescType barrierType) { +static LogicalResult +verifyBarrierType(Operation *op, mlir::triton::gpu::MemDescType barrierType) { if (!barrierType.getElementType().isInteger(64) || barrierType.getShape() != ArrayRef({1})) return op->emitOpError( diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index e9ce077184..23c598ee16 100644 --- a/python/test/unit/language/test_core.py +++ 
b/python/test/unit/language/test_core.py @@ -5333,10 +5333,10 @@ def test_convert2d(M, N, src_layout, interm_layout, dst_layout, dtype, device, t %12 = triton_gpu.convert_layout %9 : tensor<{M}x{N}xi32, #src> -> tensor<{M}x{N}xi32, #dst> %13 = triton_gpu.convert_layout %11 : tensor<{M}x{N}xf16, #src> -> tensor<{M}x{N}xf16, #dst> """ if interm_layout is None else f""" - %15 = triton_gpu.local_alloc %9 : (tensor<{M}x{N}xi32, #src>) -> !tt.memdesc<{M}x{N}xi32, #interm, #triton_gpu.shared_memory> - %16 = triton_gpu.local_load %15 : !tt.memdesc<{M}x{N}xi32, #interm, #triton_gpu.shared_memory> -> tensor<{M}x{N}xi32, #src> - %17 = triton_gpu.local_alloc %11 : (tensor<{M}x{N}xf16, #src>) -> !tt.memdesc<{M}x{N}xf16, #interm, #triton_gpu.shared_memory> - %18 = triton_gpu.local_load %17 : !tt.memdesc<{M}x{N}xf16, #interm, #triton_gpu.shared_memory> -> tensor<{M}x{N}xf16, #src> + %15 = triton_gpu.local_alloc %9 : (tensor<{M}x{N}xi32, #src>) -> !triton_gpu.memdesc<{M}x{N}xi32, #interm, #triton_gpu.shared_memory> + %16 = triton_gpu.local_load %15 : !triton_gpu.memdesc<{M}x{N}xi32, #interm, #triton_gpu.shared_memory> -> tensor<{M}x{N}xi32, #src> + %17 = triton_gpu.local_alloc %11 : (tensor<{M}x{N}xf16, #src>) -> !triton_gpu.memdesc<{M}x{N}xf16, #interm, #triton_gpu.shared_memory> + %18 = triton_gpu.local_load %17 : !triton_gpu.memdesc<{M}x{N}xf16, #interm, #triton_gpu.shared_memory> -> tensor<{M}x{N}xf16, #src> %12 = triton_gpu.convert_layout %16 : tensor<{M}x{N}xi32, #src> -> tensor<{M}x{N}xi32, #dst> %13 = triton_gpu.convert_layout %18 : tensor<{M}x{N}xf16, #src> -> tensor<{M}x{N}xf16, #dst> diff --git a/test/Analysis/test-alias.mlir b/test/Analysis/test-alias.mlir index 109395ae04..e67e55fb1c 100644 --- a/test/Analysis/test-alias.mlir +++ b/test/Analysis/test-alias.mlir @@ -41,7 +41,7 @@ tt.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr, // CHECK-LABEL: alloc tt.func @alloc(%A : !tt.ptr) { // CHECK: %0 -> %0 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return } @@ -49,40 +49,40 @@ tt.func @alloc(%A : !tt.ptr) { tt.func @alloc_init(%A : !tt.ptr) { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> // CHECK: %0 -> %0 - %cst1 = triton_gpu.local_alloc %cst0 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst1 = triton_gpu.local_alloc %cst0 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> tt.return } // CHECK-LABEL: trans tt.func @trans(%A : !tt.ptr) { // CHECK: %0 -> %0 - %tensor = triton_gpu.local_alloc : () -> !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %tensor = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: %1 -> %0 - %b = triton_gpu.memdesc_trans %tensor {order=array} : !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.memdesc_trans %tensor {order=array} : !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> tt.return } // CHECK-LABEL: subview -tt.func @subview(%A : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { +tt.func @subview(%A : 
!triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { %index = arith.constant 0 : i32 // CHECK: %0 -> %0 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %1 -> %0 - %cst1 = triton_gpu.memdesc_subview %a[%index, %index, %index] : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.memdesc_subview %a[%index, %index, %index] : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return } // CHECK-LABEL: if_alias tt.func @if_alias(%i1 : i1) { // CHECK: %0 -> %0 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: %1 -> %1 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %2 -> %0,%1 - %cst2 = scf.if %i1 -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { - scf.yield %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = scf.if %i1 -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { + scf.yield %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } else { - scf.yield %b : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %b : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return } @@ -90,11 +90,11 @@ tt.func @if_alias(%i1 : i1) { // CHECK-LABEL: for tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { // CHECK: %0 -> %0 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: %1 -> %1 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: %2 -> %2 - %c = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %arg6 -> %0 // CHECK-NEXT: %arg7 -> %1 // CHECK-NEXT: %arg8 -> %2 @@ -102,8 +102,8 @@ tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !t // CHECK-NEXT: %3#1 -> %0,%1 // CHECK-NEXT: %3#2 -> %0,%1,%2 %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a, %b_shared = %b, %c_shared = %c) -> - (!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { - scf.yield %b_shared, %a_shared, %a_shared : !tt.memdesc<16x16xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + (!triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + scf.yield %b_shared, %a_shared, %a_shared : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return } @@ -111,11 +111,11 @@ tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !t // CHECK-LABEL: for_if tt.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: %0 -> %0 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %1 -> %1 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %2 -> %2 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %arg7 -> %0 // CHECK-NEXT: %arg8 -> %1 // CHECK-NEXT: %arg9 -> %2 @@ -123,14 +123,14 @@ tt.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : // CHECK-NEXT: %3#1 -> %0,%1 // CHECK-NEXT: %3#2 -> %0,%1,%2 %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> - (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { scf.if %i1 { %index = arith.constant 8 : i32 // CHECK-NEXT: %4 -> %0,%1 - %cst0 = triton_gpu.memdesc_subview %a_shared[%index, %index] : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.memdesc_subview %a_shared[%index, %index] : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> scf.yield } - scf.yield %b_shared, %a_shared, %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %b_shared, %a_shared, %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, 
#A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return } @@ -138,11 +138,11 @@ tt.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : // CHECK-LABEL: for_for_if tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: %0 -> %0 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %1 -> %1 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %2 -> %2 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %arg7 -> %0 // CHECK-NEXT: %arg8 -> %1 // CHECK-NEXT: %arg9 -> %2 @@ -150,23 +150,23 @@ tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, // CHECK-NEXT: %3#1 -> %1 // CHECK-NEXT: %3#2 -> %2,%6,%6 %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> - (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { // CHECK-NEXT: %arg11 -> %2,%6,%6 // CHECK-NEXT: %4 -> %2,%6,%6 - %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { // CHECK-NEXT: %5 -> %6,%6 - %c_shared_next_next = scf.if %i1 -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { + %c_shared_next_next = scf.if %i1 -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { // CHECK-NEXT: %6 -> %6 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } else { // CHECK-NEXT: %6 -> %6 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %cst0 : 
!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } - scf.yield %c_shared_next_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %c_shared_next_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } - scf.yield %a_shared, %b_shared, %c_shared_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %a_shared, %b_shared, %c_shared_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return } @@ -175,29 +175,29 @@ tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, tt.func @cf_for(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr, %arg4: !tt.ptr) { %idx = arith.constant 0 : i32 // CHECK: %0 -> %0 - %cst = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %1 -> %1 - %cst_0 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %2 -> %0 - %0 = triton_gpu.memdesc_subview %cst[%idx, %idx] : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.memdesc_subview %cst[%idx, %idx] : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> gpu.barrier // CHECK-NEXT: %3 -> %3 - %cst_1 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %5 -> %0,%1,%3 // CHECK-NEXT: %6 -> %0,%1,%3 // CHECK-NEXT: %7 -> %0,%1,%3 - cf.br ^bb1(%arg0, %cst, %cst_0, %cst_1 : index, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) -^bb1(%1: index, %2: !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, %3: !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, %4: !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>): // 2 preds: ^bb0, ^bb2 + cf.br ^bb1(%arg0, %cst, %cst_0, %cst_1 : index, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) +^bb1(%1: index, %2: !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, %3: !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, %4: !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>): // 2 preds: ^bb0, ^bb2 %5 = arith.cmpi slt, %1, %arg1 : index 
cf.cond_br %5, ^bb2, ^bb3 ^bb2: // pred: ^bb1 gpu.barrier %8 = arith.addi %1, %arg2 : index - cf.br ^bb1(%8, %4, %2, %3 : index, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) + cf.br ^bb1(%8, %4, %2, %3 : index, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) ^bb3: // pred: ^bb1 gpu.barrier // CHECK-NEXT: %10 -> %0 - %9 = triton_gpu.memdesc_subview %0[%idx, %idx] : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %9 = triton_gpu.memdesc_subview %0[%idx, %idx] : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return } diff --git a/test/Analysis/test-allocation.mlir b/test/Analysis/test-allocation.mlir index db2e2947b8..fe4da43ca9 100644 --- a/test/Analysis/test-allocation.mlir +++ b/test/Analysis/test-allocation.mlir @@ -95,47 +95,47 @@ tt.func @reusable(%A : !tt.ptr) { // CHECK-LABEL: preallocate tt.func @preallocate(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 4096, size = 1024 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst0 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst0 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 0, size = 1024 - %c = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst1 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst2 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst1 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst2 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 1024 - %cst4 = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst4 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 6144, size = 2048 - %e = triton_gpu.local_alloc : () -> !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %a : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %e = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %a : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 8192, size = 2048 - %d = triton_gpu.local_alloc : () -> !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %b : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %d = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %b : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 10240, size = 2048 - %f = triton_gpu.local_alloc : () -> !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst4 : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %c : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %f = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst4 : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %c : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 0, size = 2048 - %cst5 = triton_gpu.local_alloc : () -> !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst5 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 4096 - %g = triton_gpu.local_alloc : () -> !tt.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %e : !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %g = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %e : !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 4096 - %h = triton_gpu.local_alloc : () -> !tt.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %d : !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %h = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %d : !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 4096 - %i = triton_gpu.local_alloc : () -> !tt.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %f : !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, 
mutable> - triton_gpu.local_dealloc %cst5 : !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %i = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %f : !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst5 : !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 12288 } @@ -145,11 +145,11 @@ tt.func @preallocate(%A : !tt.ptr) { tt.func @unused(%A : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #AL> // CHECK: offset = 0, size = 1024 - %cst0 = triton_gpu.local_alloc %cst : (tensor<32x16xf16, #AL>) -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<32x16xf16, #AL>) -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK-NEXT: offset = 0, size = 512 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 0, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK: size = 1024 } @@ -158,33 +158,33 @@ tt.func @unused(%A : !tt.ptr) { // CHECK-LABEL: longlive tt.func @longlive(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst1 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst2 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst1 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst2 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst3 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst3 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 512 - %cst4 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, 
#A_SHARED, #triton_gpu.shared_memory, mutable> + %cst4 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 1024 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 512 - %cst5 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst5 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 512 - %cst6 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst6 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 1024 - %c = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst3 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst4 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst3 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst4 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 1024 - %d = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst0 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %d = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst0 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 4096 } @@ -193,43 +193,43 @@ tt.func @longlive(%A : !tt.ptr) { // CHECK-LABEL: multi_color tt.func @multi_color(%A : !tt.ptr) { // CHECK: offset = 0, size = 64 - %cst = triton_gpu.local_alloc : () -> !tt.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1536, size = 32 - %cst_0 = triton_gpu.local_alloc : () -> !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1664, size = 128 - %cst_1 = triton_gpu.local_alloc : () -> !tt.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK-NEXT: scratch offset = 128, size = 1152 %0 = triton_gpu.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL> - %1 = triton_gpu.local_load %cst : !tt.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> + %1 = triton_gpu.local_load %cst : !triton_gpu.memdesc<4x8xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> // CHECK-NEXT: offset = 0, size = 128 - %cst_3 = triton_gpu.local_alloc : () -> !tt.memdesc<4x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %2 = triton_gpu.local_load %cst_0 : !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> + %cst_3 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %2 = triton_gpu.local_load %cst_0 : !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> // CHECK-NEXT: scratch offset = 0, size = 1152 %3 = triton_gpu.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL> // CHECK-NEXT: offset = 0, size = 256 - %cst_4 = triton_gpu.local_alloc : () -> !tt.memdesc<4x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_4 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 256, size = 64 - %cst_5 = triton_gpu.local_alloc : () -> !tt.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %4 = triton_gpu.local_load %cst_5 : !tt.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> - %5 = triton_gpu.local_load %cst_5 : !tt.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> + %cst_5 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %4 = triton_gpu.local_load %cst_5 : !triton_gpu.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> + %5 = triton_gpu.local_load %cst_5 : !triton_gpu.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> // CHECK-NEXT: offset = 1024, size = 512 - %cst_6 = triton_gpu.local_alloc : () -> !tt.memdesc<8x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_6 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<8x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1792, size = 128 - %cst_7 = triton_gpu.local_alloc : () -> !tt.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %6 = triton_gpu.local_load %cst_0 : !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> + %cst_7 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %6 = triton_gpu.local_load %cst_0 : !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> // CHECK-NEXT: offset = 1024, size = 512 - %cst_8 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_8 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 256, size = 32 - %cst_9 = triton_gpu.local_alloc : () -> !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_9 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst_10 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %7 = triton_gpu.local_load %cst_1 : !tt.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x4xf16, #AL> - %8 = triton_gpu.local_load %cst_4 : !tt.memdesc<4x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> 
tensor<4x32xf16, #AL> + %cst_10 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %7 = triton_gpu.local_load %cst_1 : !triton_gpu.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x4xf16, #AL> + %8 = triton_gpu.local_load %cst_4 : !triton_gpu.memdesc<4x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x32xf16, #AL> // CHECK-NEXT: scratch offset = 0, size = 1152 %9 = triton_gpu.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL> %cst_11 = arith.constant dense<0.000000e+00> : tensor<4x4xf16, #AL> - %10 = triton_gpu.local_load %cst_7 : !tt.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<2x32xf16, #AL> + %10 = triton_gpu.local_load %cst_7 : !triton_gpu.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<2x32xf16, #AL> %cst_12 = arith.constant dense<0.000000e+00> : tensor<4x16xf16, #AL> %cst_13 = arith.constant dense<0.000000e+00> : tensor<8x32xf16, #AL> // CHECK-NEXT: size = 1920 @@ -240,25 +240,25 @@ tt.func @multi_color(%A : !tt.ptr) { // CHECK-LABEL: multi_color_multi_rounds tt.func @multi_color_multi_rounds(%arg0: !tt.ptr) { // CHECK: offset = 0, size = 32 - %cst = triton_gpu.local_alloc : () -> !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1280, size = 128 - %cst_0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 8192 - %cst_1 = triton_gpu.local_alloc : () -> !tt.memdesc<1024x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1024x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK-NEXT: scratch offset = 128, size = 1152 %0 = triton_gpu.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL> - %1 = triton_gpu.local_load %cst : !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> + %1 = triton_gpu.local_load %cst : !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> // CHECK-NEXT: offset = 1152, size = 128 - %cst_3 = triton_gpu.local_alloc : () -> !tt.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %2 = triton_gpu.local_load %cst : !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> + %cst_3 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %2 = triton_gpu.local_load %cst : !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> // CHECK-NEXT: offset = 0, size = 512 - %cst_4 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %3 = triton_gpu.local_load %cst_0 : !tt.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x4xf16, #AL> - %4 = triton_gpu.local_load %cst_1 : !tt.memdesc<1024x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<1024x4xf16, #AL> + %cst_4 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable> + %3 = triton_gpu.local_load %cst_0 : !triton_gpu.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x4xf16, #AL> + %4 = triton_gpu.local_load %cst_1 : !triton_gpu.memdesc<1024x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<1024x4xf16, #AL> // CHECK-NEXT: scratch offset = 0, size = 1152 %5 = triton_gpu.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL> - %6 = triton_gpu.local_load %cst_3 : !tt.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<2x32xf16, #AL> + %6 = triton_gpu.local_load %cst_3 : !triton_gpu.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<2x32xf16, #AL> // CHECK-NEXT: size = 10240 tt.return } @@ -267,10 +267,10 @@ tt.func @multi_color_multi_rounds(%arg0: !tt.ptr) { // CHECK-LABEL: alloc tt.func @alloc(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK-NEXT: offset = 0, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 512 } @@ -279,10 +279,10 @@ tt.func @alloc(%A : !tt.ptr) { // CHECK-LABEL: dealloc tt.func @dealloc(%A : !tt.ptr) { // CHECK: offset = 0, size = 1024 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: offset = 1024, size = 1024 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst0 : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst0 : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 2048 } @@ -303,8 +303,8 @@ tt.func @scratch() { // CHECK-LABEL: trans tt.func @trans(%A : !tt.ptr) { // CHECK: offset = 0, size = 1024 - %tensor = triton_gpu.local_alloc : () -> !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %b = triton_gpu.memdesc_trans %tensor {order=array} : !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> + %tensor = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.memdesc_trans %tensor {order=array} : !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> tt.return } @@ -312,9 +312,9 @@ tt.func @trans(%A : !tt.ptr) { // CHECK-LABEL: extract_slice tt.func @extract_slice(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> 
!triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %index = arith.constant 0 : i32 - %cst1 = triton_gpu.memdesc_subview %cst0[%index, %index, %index] : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.memdesc_subview %cst0[%index, %index, %index] : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 512 } @@ -326,9 +326,9 @@ tt.func @atomic_scalar(%arg3: !tt.ptr) -> i32 { // CHECK: size = 8196 %c0_i32 = arith.constant 0 : i32 %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> %4 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr, i32, i32) -> i32 - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return %4 : i32 } @@ -338,9 +338,9 @@ tt.func @atomic_scalar_no_use(%arg3: !tt.ptr) { // CHECK: size = 8192 %c0_i32 = arith.constant 0 : i32 %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> %4 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr, i32, i32) -> i32 - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } @@ -349,25 +349,25 @@ tt.func @atomic_scalar_no_use(%arg3: !tt.ptr) { // CHECK-LABEL: if tt.func @if(%i1 : i1) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> scf.if %i1 { // CHECK-NEXT: offset = 2048, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 1024 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst0 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst1 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, 
#A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst0 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst1 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } // CHECK-NEXT: offset = 0, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst3 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst3 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst2 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst3 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst2 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst3 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 3072 } @@ -377,28 +377,28 @@ tt.func @if(%i1 : i1) { // CHECK-LABEL: if_else tt.func @if_else(%i1 : i1) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> scf.if %i1 { // CHECK-NEXT: offset = 2048, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 1024 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } else { // CHECK-NEXT: offset = 2048, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 512 - %cst3 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst3 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 4096, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst2 : !tt.memdesc<1x16x16xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst3 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst2 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst3 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } // CHECK-NEXT: offset = 2048, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst0 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst1 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst0 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst1 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 5120 } @@ -408,13 +408,13 @@ tt.func @if_else(%i1 : i1) { // CHECK-LABEL: for tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { // CHECK: offset = 0, size = 8192 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 8192, size = 8192 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 16384, size = 8192 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { - scf.yield %b_shared, %a_shared, %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + scf.yield %b_shared, %a_shared, %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, 
!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return // CHECK-NEXT: size = 24576 @@ -423,18 +423,18 @@ tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !t // CHECK-LABEL: for_if_slice tt.func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: offset = 0, size = 8192 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 8192, size = 8192 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 16384, size = 8192 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { scf.if %i1 { %index = arith.constant 8 : i32 - %cst0 = triton_gpu.memdesc_subview %a_shared[%index, %index] : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.memdesc_subview %a_shared[%index, %index] : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> scf.yield } - scf.yield %b_shared, %a_shared, %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %b_shared, %a_shared, %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return // CHECK-NEXT: size = 24576 @@ -444,16 +444,16 @@ tt.func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr // CHECK-LABEL: for_use_ancestor tt.func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: offset = 0, size = 8192 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> 
!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 8192, size = 8192 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 16384, size = 8192 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %a_shared, %b_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { - %c0 = triton_gpu.memdesc_trans %c_shared_init {order=array} : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared, %b_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + %c0 = triton_gpu.memdesc_trans %c_shared_init {order=array} : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x128xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 24576, size = 8192 - %c1 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - scf.yield %b_shared, %a_shared: !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %b_shared, %a_shared: !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return // CHECK-NEXT: size = 32768 @@ -464,28 +464,28 @@ tt.func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr< // CHECK-LABEL: for_for_if tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: offset = 0, size = 8192 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 8192, size = 8192 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 16384, size = 8192 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { - %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { - %c_shared_next_next = scf.if %i1 -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + %c_shared_next_next = scf.if %i1 -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { // CHECK-NEXT: offset = 24576, size = 8192 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } else { // CHECK-NEXT: offset = 32768, size = 8192 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - scf.yield %cst1 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %cst1 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } - scf.yield %c_shared_next_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %c_shared_next_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } - scf.yield %a_shared, %b_shared, %c_shared_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %a_shared, %b_shared, %c_shared_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } // CHECK-NEXT: offset = 0, size = 8192 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 40960 } @@ -497,7 +497,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: alloc1 tt.func @alloc1(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> 
!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 512 } @@ -505,7 +505,7 @@ tt.func @alloc1(%A : !tt.ptr) { // CHECK-LABEL: alloc2 tt.func @alloc2(%A : !tt.ptr) { // CHECK: offset = 0, size = 1024 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 1024 } @@ -514,10 +514,10 @@ tt.func @alloc2(%A : !tt.ptr) { tt.func @alloc3(%cond : i1) { scf.if %cond { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } else { // CHECK-NEXT: offset = 0, size = 1024 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return // CHECK-NEXT: size = 1024 @@ -539,7 +539,7 @@ tt.func @alloc4(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: single_call tt.func @single_call(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK-NEXT: virtual offset = 0, size = 512 tt.call @alloc1(%A) : (!tt.ptr) -> () @@ -550,7 +550,7 @@ tt.func @single_call(%A : !tt.ptr) { // CHECK-LABEL: multiple_calls tt.func @multiple_calls(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: virtual offset = 0, size = 512 tt.call @alloc1(%A) : (!tt.ptr) -> () %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> @@ -565,9 +565,9 @@ tt.func @if_else_calls(%A : !tt.ptr, %cond : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> scf.if %cond { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 0, size = 1024 - %cst1 = triton_gpu.local_alloc %cst : (tensor<16x32xf16, #AL>) -> !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc %cst : (tensor<16x32xf16, #AL>) -> !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: virtual offset = 0, size = 512 tt.call @alloc1(%A) : (!tt.ptr) -> () } else { @@ -582,7 +582,7 @@ tt.func @if_else_calls(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: for_calls tt.func @for_calls(%A : !tt.ptr, %cond : i1) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = 
triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> %lb = arith.constant 0 : index %ub = arith.constant 10 : index @@ -598,7 +598,7 @@ tt.func @for_calls(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: call_graph_1 tt.func @call_graph_1(%A : !tt.ptr, %cond : i1) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: virtual offset = 0, size = 1024 tt.call @alloc3(%cond) : (i1) -> () tt.return @@ -608,7 +608,7 @@ tt.func @call_graph_1(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: call_graph_2 tt.func @call_graph_2(%A : !tt.ptr, %cond : i1) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: virtual offset = 0, size = 1024 tt.call @alloc4(%A, %cond) : (!tt.ptr, i1) -> () tt.return diff --git a/test/Analysis/test-membar.mlir b/test/Analysis/test-membar.mlir index 65d802d995..a2711ba98f 100644 --- a/test/Analysis/test-membar.mlir +++ b/test/Analysis/test-membar.mlir @@ -46,10 +46,10 @@ tt.func @raw_single_block(%A : !tt.ptr) { %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %0 = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> %1 = tt.load %0, %cst1, %cst2 : tensor<128x32x!tt.ptr, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } @@ -59,14 +59,14 @@ tt.func @war_single_block(%A : !tt.ptr) { %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %0 = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> %1 = tt.load %0, %cst1, %cst2 : tensor<128x32x!tt.ptr, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: triton_gpu.local_alloc // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> // CHECK: gpu.barrier // CHECK-NEXT: %4 = triton_gpu.local_alloc - %4 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %4 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> tt.return } @@ -76,25 +76,25 @@ tt.func @war_single_block_local_store(%A : !tt.ptr) 
{ %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %0 = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> %1 = tt.load %0, %cst1, %cst2 : tensor<128x32x!tt.ptr, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: triton_gpu.local_alloc // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #AL> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_store - triton_gpu.local_store %1, %2 : tensor<128x32xf16, #AL> -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %1, %2 : tensor<128x32xf16, #AL> -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return } // CHECK-LABEL: scratch tt.func @scratch(%arg: tensor<16x16xf16, #AL>) { - %cst0 = triton_gpu.local_alloc %arg : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %arg : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load // CHECK: gpu.barrier // CHECK: tt.reduce - %1 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> %2 = "tt.reduce" (%1) ({ ^bb0(%arg1: f16, %arg2: f16): %add = arith.addf %arg1, %arg2 : f16 @@ -105,34 +105,34 @@ tt.func @scratch(%arg: tensor<16x16xf16, #AL>) { // CHECK-LABEL: async_wait tt.func @async_wait(%arg: tensor<32x16xf16, #AL>) { - %cst0 = triton_gpu.local_alloc %arg : (tensor<32x16xf16, #AL>) -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %arg : (tensor<32x16xf16, #AL>) -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: triton_gpu.async_wait triton_gpu.async_wait {num = 4 : i32} // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %cst0 : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<32x16xf16, #AL> + %1 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<32x16xf16, #AL> tt.return } // CHECK-LABEL: subview tt.func @subview() { %cst0 = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #AL> - %a = triton_gpu.local_alloc %cst0 : (tensor<32x16xf16, #AL>) -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = triton_gpu.local_alloc %cst0 : (tensor<32x16xf16, #AL>) -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> %index = arith.constant 0 : i32 - %0 = triton_gpu.memdesc_subview %a[%index, %index] : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %0 = triton_gpu.memdesc_subview %a[%index, %index] : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, 
#triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> tt.return } // CHECK-LABEL: trans -tt.func @trans(%a: !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { +tt.func @trans(%a: !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK-NOT: gpu.barrier - %b = triton_gpu.memdesc_trans %a {order=array} : !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> !tt.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory> + %b = triton_gpu.memdesc_trans %a {order=array} : !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory> tt.return } @@ -142,31 +142,31 @@ tt.func @async_copy_global_to_local(%A : !tt.ptr, %i1 : i1) { %a_ptr = tt.splat %A : !tt.ptr -> tensor<16x16x!tt.ptr, #AL> %mask = tt.splat %i1 : i1 -> tensor<16x16xi1, #AL> %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %alloc = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %subview = triton_gpu.memdesc_subview %alloc[%index, %index, %index] : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.async_copy_global_to_local %a_ptr, %subview : tensor<16x16x!tt.ptr, #AL> -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %subview = triton_gpu.memdesc_subview %alloc[%index, %index, %index] : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.async_copy_global_to_local %a_ptr, %subview : tensor<16x16x!tt.ptr, #AL> -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %4 = triton_gpu.local_load %subview : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> + %4 = triton_gpu.local_load %subview : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> tt.return } // If branch inserted a barrier for %cst0, but else didn't, then the barrier should be inserted in the parent region // CHECK-LABEL: multi_blocks tt.func @multi_blocks(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> scf.if %i1 { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %cst0 : 
!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> scf.yield } else { - %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> scf.yield } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %2 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -174,21 +174,21 @@ tt.func @multi_blocks(%i1 : i1) { // CHECK-LABEL: multi_blocks_join_barrier tt.func @multi_blocks_join_barrier(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> scf.if %i1 { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> scf.yield } else { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> scf.yield } // CHECK-NOT: gpu.barrier // CHECK: tt.return - %a_ = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %a_ = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -196,25 +196,25 @@ tt.func @multi_blocks_join_barrier(%i1 : i1) { // CHECK-LABEL: multi_blocks_yield tt.func @multi_blocks_yield(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - %a = scf.if %i1 -> (!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = scf.if %i1 -> (!triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %1 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %0 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> 
!triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> } else { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %3 = triton_gpu.local_alloc %2 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %3 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %3 = triton_gpu.local_alloc %2 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %3 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> } - %a_ = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %a_ = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> // CHECK: triton_gpu.local_load // CHECK-NEXT: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %4 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %4 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -222,27 +222,27 @@ tt.func @multi_blocks_yield(%i1 : i1) { // CHECK-LABEL: multi_blocks_entry_no_shared tt.func @multi_blocks_entry_no_shared(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - %a = scf.if %i1 -> (!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = scf.if %i1 -> (!triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc // CHECK-NEXT: gpu.barrier // CHECK-NEXT: triton_gpu.local_load // CHECK-NEXT: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - %0 = triton_gpu.local_load %cst1 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %1 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %0 = triton_gpu.local_load %cst1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> } else { // CHECK-NOT: gpu.barrier // CHECK: triton_gpu.local_alloc - %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %cst1 : 
!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %cst1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %2 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -250,16 +250,16 @@ tt.func @multi_blocks_entry_no_shared(%i1 : i1) { // CHECK-LABEL: multi_blocks_noelse tt.func @multi_blocks_noelse(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> scf.if %i1 { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> scf.yield } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -267,39 +267,39 @@ tt.func @multi_blocks_noelse(%i1 : i1) { // CHECK-LABEL: multi_blocks_nested_scf tt.func @multi_blocks_nested_scf(%i1 : i1, %i2 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> scf.if %i1 { scf.if %i2 { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %0 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> scf.yield } scf.yield } else { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %1 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> scf.yield } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %2 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } // CHECK-LABEL: for tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> 
!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %a0 = triton_gpu.local_load %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b0 = triton_gpu.local_load %b_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - scf.yield %b_shared, %a_shared, %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a0 = triton_gpu.local_load %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b0 = triton_gpu.local_load %b_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + scf.yield %b_shared, %a_shared, %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } tt.return } @@ -309,24 +309,24 @@ tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !t // CHECK-LABEL: for_alias tt.func @for_alias(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load 
- %a0 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b0 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %0 = triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %a0 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b0 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %0 = triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %a1 = triton_gpu.local_load %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b1 = triton_gpu.local_load %b_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - scf.yield %c_shared, %a_shared, %b_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a1 = triton_gpu.local_load %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b1 = triton_gpu.local_load %b_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + scf.yield %c_shared, %a_shared, %b_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %r = triton_gpu.local_load %0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %r = triton_gpu.local_load %0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } @@ -335,63 +335,63 @@ tt.func @for_alias(%lb : index, %ub : index, %step : index, %A : !tt.ptr, % // CHECK-LABEL: for_reuse tt.func @for_reuse(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, 
#AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %a0 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b0 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %0 = triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %a0 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b0 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %0 = triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %a1 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b1 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %1 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a1 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b1 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %1 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %a2 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> 
tensor<128x32xf16, #AL> - %b2 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %c_shared, %a_shared, %b_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a2 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b2 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %2 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %c_shared, %a_shared, %b_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %r = triton_gpu.local_load %0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %r = triton_gpu.local_load %0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } // CHECK-LABEL: for_reuse_nested tt.func @for_reuse_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %a0 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b0 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %0 = triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %a0 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b0 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %0 
= triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %a1 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b1 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %1 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared_next, %b_shared_next, %c_shared_next = scf.for %ivv = %lb to %ub step %step iter_args(%a_shared_nested = %a_shared_init, %b_shared_nested = %b_shared_init, %c_shared_nested = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %a1 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b1 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %1 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared_next, %b_shared_next, %c_shared_next = scf.for %ivv = %lb to %ub step %step iter_args(%a_shared_nested = %a_shared_init, %b_shared_nested = %b_shared_init, %c_shared_nested = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %a2 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b2 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %a2 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %c_shared_nested, %a_shared_nested, %b_shared_nested : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a2 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b2 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %2 = triton_gpu.local_alloc %a2 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %c_shared_nested, %a_shared_nested, 
%b_shared_nested : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } - scf.yield %c_shared, %a_shared, %b_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %c_shared, %a_shared, %b_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %r = triton_gpu.local_load %0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %r = triton_gpu.local_load %0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } @@ -399,25 +399,25 @@ tt.func @for_reuse_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr< // CHECK-LABEL: for_for_if tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { - %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { - %c_shared_next_next = scf.if %i1 -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> { + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %c_shared_next_next = scf.if %i1 -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> { // CHECK: gpu.barrier // CHECK-NEXT: 
triton_gpu.local_alloc - %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } else { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } - scf.yield %c_shared_next_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %c_shared_next_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } - scf.yield %a_shared, %b_shared, %c_shared_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %a_shared, %b_shared, %c_shared_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } tt.return } @@ -426,30 +426,30 @@ tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, // CHECK-LABEL: for_if_for tt.func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier - %c_blocked = triton_gpu.local_load %c_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %c_blocked = triton_gpu.local_load %c_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { - %c_shared_next_next = 
scf.if %i1 -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> { + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %c_shared_next_next = scf.if %i1 -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } else { - %c_shared_ = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %c_shared_ = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %c_blocked_next = triton_gpu.local_load %c_shared_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - scf.yield %c_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_blocked_next = triton_gpu.local_load %c_shared_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + scf.yield %c_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } - scf.yield %c_shared_ : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %c_shared_ : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } // CHECK-NOT: gpu.barrier - %b_blocked_next = triton_gpu.local_load %b_shared: !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - scf.yield %a_shared, %b_shared, %c_shared_next_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_blocked_next = triton_gpu.local_load %b_shared: !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + scf.yield %a_shared, %b_shared, %c_shared_next_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } tt.return } @@ -457,65 +457,65 @@ tt.func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, // CHECK-LABEL: cf_if tt.func @cf_if(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %a = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> cf.cond_br %i1, ^bb1, ^bb2 ^bb1: // pred: ^bb0 // CHECK: gpu.barrier // CHECK-NEXT: 
triton_gpu.local_load - %0 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> cf.br ^bb2 ^bb2: // 2 preds: ^bb0, ^bb1 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } // CHECK-LABEL: cf_if_else tt.func @cf_if_else(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %a = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> cf.cond_br %i1, ^bb1, ^bb2 ^bb1: // pred: ^bb0 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - cf.br ^bb3(%1 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) + %0 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + cf.br ^bb3(%1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) ^bb2: // pred: ^bb0 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %3 = triton_gpu.local_alloc %2 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - cf.br ^bb3(%3 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) -^bb3(%arg: !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>): // 2 preds: ^bb1, ^bb2 + %2 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %3 = triton_gpu.local_alloc %2 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + cf.br ^bb3(%3 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) +^bb3(%arg: !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>): // 2 preds: ^bb1, ^bb2 cf.br ^bb4 ^bb4: // pred: ^bb3 // CHECK: triton_gpu.local_load - %4 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %4 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %5 = triton_gpu.local_load %arg : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %5 = triton_gpu.local_load %arg : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } // CHECK-LABEL: cf_if_else_return tt.func @cf_if_else_return(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %a = 
triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - %b = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %b = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> cf.cond_br %i1, ^bb1, ^bb2 ^bb1: // pred: ^bb0 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %1 = triton_gpu.local_load %b : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %b : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return ^bb2: // pred: ^bb0 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %3 = triton_gpu.local_load %b : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %2 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %3 = triton_gpu.local_load %b : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -524,9 +524,9 @@ tt.func @atomic_scalar(%arg3: !tt.ptr) -> i32 { // CHECK-NOT: gpu.barrier %c0_i32 = arith.constant 0 : i32 %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> %4 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr, i32, i32) -> i32 - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return %4 : i32 } @@ -534,11 +534,11 @@ tt.func @atomic_scalar(%arg3: !tt.ptr) -> i32 { tt.func @atomic_scalar_no_use(%arg3: !tt.ptr) { %c0_i32 = arith.constant 0 : i32 %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> %4 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr, i32, i32) -> i32 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } @@ -549,38 +549,38 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : // CHECK-LABEL: convert_layout1 tt.func 
@convert_layout1(%A : !tt.ptr) { // CHECK-NOT: gpu.barrier - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> tt.return } // CHECK-LABEL: convert_layout2 tt.func @convert_layout2(%A : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: triton_gpu.local_load // CHECK-NEXT: gpu.barrier // CHECK: triton_gpu.local_load - %3 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> - %4 = triton_gpu.local_load %1 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> + %3 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> + %4 = triton_gpu.local_load %1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> tt.return } // CHECK-LABEL: convert_layout3 tt.func @convert_layout3(%cond : i1) { scf.if %cond { - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x64xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x64xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: triton_gpu.local_load // CHECK-NOT: gpu.barrier - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x64xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #AL> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x64xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #AL> } else { - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: triton_gpu.local_load // CHECK-NEXT: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> + %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return } @@ -619,7 +619,7 @@ tt.func @single_call_no_sync(%A : !tt.ptr) { // CHECK-LABEL: multiple_calls tt.func @multiple_calls(%A : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : 
tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> tt.call @convert_layout1(%A) : (!tt.ptr) -> () %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> tt.call @convert_layout2(%A) : (!tt.ptr) -> () @@ -631,12 +631,12 @@ tt.func @if_else_calls(%A : !tt.ptr, %cond : i1) { scf.if %cond { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> %cst_ = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: tt.call // CHECK-NEXT: gpu.barrier tt.call @convert_layout1(%A) : (!tt.ptr) -> () - %cst1 = triton_gpu.local_alloc %cst_ : (tensor<16x32xf16, #AL>) -> !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst1 = triton_gpu.local_alloc %cst_ : (tensor<16x32xf16, #AL>) -> !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory> } else { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK: tt.call @@ -649,7 +649,7 @@ tt.func @if_else_calls(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: for_calls tt.func @for_calls(%A : !tt.ptr, %cond : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> %lb = arith.constant 0 : index %ub = arith.constant 10 : index @@ -665,7 +665,7 @@ tt.func @for_calls(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: call_graph_1 tt.func @call_graph_1(%A : !tt.ptr, %cond : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: tt.call tt.call @convert_layout3(%cond) : (i1) -> () tt.return @@ -677,7 +677,7 @@ tt.func @call_graph_2(%A : !tt.ptr, %cond : i1) { tt.call @convert_layout4(%A, %cond) : (!tt.ptr, i1) -> () // CHECK: tt.call // CHECK-NEXT: gpu.barrier - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> tt.return } @@ -694,8 +694,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.func public @kernel(%arg3: !tt.ptr, %arg4: !tt.ptr, %arg12: tensor<32x128xf16, #blocked>, %arg13: tensor<32x128xf32, #blocked>, %arg14: tensor<32x32xf16, #blocked1>) { %c0_i32 = arith.constant 0 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked> - %37 = triton_gpu.local_alloc %arg14 {allocation.offset = 0 : i32} : (tensor<32x32xf16, 
#blocked1>) -> !tt.memdesc<32x32xf16, #shared, #triton_gpu.shared_memory> - %58 = triton_gpu.local_alloc %arg12 : (tensor<32x128xf16, #blocked>) -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory> + %37 = triton_gpu.local_alloc %arg14 {allocation.offset = 0 : i32} : (tensor<32x32xf16, #blocked1>) -> !triton_gpu.memdesc<32x32xf16, #shared, #triton_gpu.shared_memory> + %58 = triton_gpu.local_alloc %arg12 : (tensor<32x128xf16, #blocked>) -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory> cf.br ^bb1 ^bb1: // 2 preds: ^bb0, ^bb1 %59 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr, i32, i32) -> i32 @@ -703,8 +703,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : cf.cond_br %60, ^bb1, ^bb2 ^bb2: // pred: ^bb1 %72 = triton_gpu.convert_layout %arg13 : tensor<32x128xf32, #blocked> -> tensor<32x128xf32, #mma> - %73 = triton_gpu.local_load %37 : !tt.memdesc<32x32xf16, #shared, #triton_gpu.shared_memory> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %74 = triton_gpu.local_load %58 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %73 = triton_gpu.local_load %37 : !triton_gpu.memdesc<32x32xf16, #shared, #triton_gpu.shared_memory> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %74 = triton_gpu.local_load %58 : !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %75 = tt.dot %73, %74, %72, inputPrecision = tf32 : tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<32x128xf32, #mma> %76 = triton_gpu.convert_layout %75 {allocation.offset = 0 : i32} : tensor<32x128xf32, #mma> -> tensor<32x128xf32, #blocked> %77 = arith.truncf %76 : tensor<32x128xf32, #blocked> to tensor<32x128xf16, #blocked> @@ -725,8 +725,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.func @tma_special_cases(%arg1: !tt.ptr) -> (tensor<256x64xf16, #blocked>){ %true = arith.constant 1 : i1 %c0 = arith.constant 0 : i32 - %barrier = triton_gpu.local_alloc : () -> !tt.memdesc<1xi64, #shared1, #triton_gpu.shared_memory, mutable> - %alloc = triton_gpu.local_alloc : () -> !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> + %barrier = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1xi64, #shared1, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> // CHECK: triton_nvidia_gpu.init_barrier // CHECK-NEXT: triton_nvidia_gpu.init_barrier triton_nvidia_gpu.init_barrier %barrier, 1 : <1xi64, #shared1, #triton_gpu.shared_memory, mutable> @@ -749,7 +749,7 @@ tt.func @tma_special_cases(%arg1: !tt.ptr) -> (tensor<256x64xf16, #blocke triton_nvidia_gpu.wait_barrier %barrier, %c0 : <1xi64, #shared1, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: triton_gpu.local_load - %t = triton_gpu.local_load %alloc : !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #blocked> + %t = triton_gpu.local_load %alloc : !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #blocked> // CHECK-NEXT: triton_nvidia_gpu.barrier_expect // CHECK-NEXT: gpu.barrier @@ -780,8 
+780,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.func @tma_special_cases_cf(%arg1: !tt.ptr, %i1 : i1, %arg2: tensor<256x64xf16, #blocked>) -> (tensor<256x64xf16, #blocked>){ %true = arith.constant 1 : i1 %c0 = arith.constant 0 : i32 - %barrier = triton_gpu.local_alloc : () -> !tt.memdesc<1xi64, #shared1, #triton_gpu.shared_memory, mutable> - %alloc = triton_gpu.local_alloc : () -> !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> + %barrier = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1xi64, #shared1, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> // CHECK: cf.cond_br scf.if %i1 { // CHECK-NOT: gpu.barrier @@ -797,12 +797,12 @@ tt.func @tma_special_cases_cf(%arg1: !tt.ptr, %i1 : i1, %arg2: tensor<256 // CHECK-NOT: gpu.barrier // CHECK: triton_gpu.local_store // CHECK-NEXT: cf.br - triton_gpu.local_store %arg2, %alloc : tensor<256x64xf16, #blocked> -> !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg2, %alloc : tensor<256x64xf16, #blocked> -> !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> scf.yield } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %t = triton_gpu.local_load %alloc : !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #blocked> + %t = triton_gpu.local_load %alloc : !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #blocked> tt.return %t : tensor<256x64xf16, #blocked> } } diff --git a/test/Conversion/amd/compute-base-ptr.mlir b/test/Conversion/amd/compute-base-ptr.mlir index 809e5a8699..c62f7bfb6c 100644 --- a/test/Conversion/amd/compute-base-ptr.mlir +++ b/test/Conversion/amd/compute-base-ptr.mlir @@ -7,10 +7,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: @local_load_offset tt.func @local_load_offset(%arg0: tensor<16x16xf16, #mma>) { %0 = triton_gpu.convert_layout %arg0 {allocation.offset = 0 : i32} : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #blocked> loc(#loc1) - %1 = triton_gpu.local_alloc %0 {allocation.offset = 0 : i32} : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> loc(#loc2) + %1 = triton_gpu.local_alloc %0 {allocation.offset = 0 : i32} : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> loc(#loc2) // This catches base ptr calculation in the computeBasePtr, checks if the gep has correct element type. 
// CHECK: llvm.getelementptr {{.*}} (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, f16 local_load:3:0 - %2 = triton_gpu.local_load %1 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> loc(#loc3) + %2 = triton_gpu.local_load %1 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> loc(#loc3) tt.return } } diff --git a/test/Conversion/amd/decompose-unsupported-conversions.mlir b/test/Conversion/amd/decompose-unsupported-conversions.mlir index 1bd288449f..9e6acf2e4b 100644 --- a/test/Conversion/amd/decompose-unsupported-conversions.mlir +++ b/test/Conversion/amd/decompose-unsupported-conversions.mlir @@ -8,7 +8,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx1130", "triton_gpu.threads-per-warp" = 32 : i32} { tt.func @wmma_to_wmma_dot_op(%arg0: tensor<16x16xf16, #mma>) { // CHECK: %[[SRC_BLOCKED:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<16x16xf16, #[[$WMMA]]> -> tensor<16x16xf16, #[[$BLOCKED]]> - // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !tt.memdesc<16x16xf16, #[[$SHARED]], #triton_gpu.shared_memory> + // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !triton_gpu.memdesc<16x16xf16, #[[$SHARED]], #triton_gpu.shared_memory> // CHECK-NEXT: %[[DST_DOT_OP:.+]] = triton_gpu.local_load %[[INT_SHARED]] : {{.*}} -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[$WMMA]], kWidth = 16}>> %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> tt.return @@ -25,7 +25,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { tt.func @wmma_to_wmma_dot3d_op(%arg0: tensor<2x16x16xf16, #mma>) { // CHECK: %[[SRC_BLOCKED:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<2x16x16xf16, #[[$WMMA]]> -> tensor<2x16x16xf16, #[[$BLOCKED]]> - // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !tt.memdesc<2x16x16xf16, #[[$SHARED]], #triton_gpu.shared_memory> + // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !triton_gpu.memdesc<2x16x16xf16, #[[$SHARED]], #triton_gpu.shared_memory> // CHECK-NEXT: %[[DST_DOT_OP:.+]] = triton_gpu.local_load %[[INT_SHARED]] : {{.*}} -> tensor<2x16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[$WMMA]], kWidth = 16}>> %0 = triton_gpu.convert_layout %arg0 : tensor<2x16x16xf16, #mma> -> tensor<2x16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> tt.return diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir index d9a37b5c75..98d97f5cce 100644 --- a/test/Conversion/amd/tritongpu_to_llvm.mlir +++ b/test/Conversion/amd/tritongpu_to_llvm.mlir @@ -44,19 +44,19 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: small_mfma_tensor_conversions tt.func public @small_mfma_tensor_conversions(%arg0: tensor<16x16xf16, #mfma>, %arg1: tensor<16x16x!tt.ptr, #mfma>) { // CHECK-NOT: triton_gpu.convert_layout - %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #mfma>) -> !tt.memdesc<16x16xf16, #shared, 
#triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #mfma>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> // CHECK-4: store {{.*}} vector<4xf16> - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop0> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop0> // CHECK-2: load {{.*}} vector<4xf16> - %2 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop1> + %2 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop1> // CHECK-8: load {{.*}} vector<1xf16> - %3 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #mfma> + %3 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #mfma> // CHECK-4: load {{.*}} vector<4xf16> %4 = tt.fp_to_fp %3 : tensor<16x16xf16, #mfma> -> tensor<16x16xf32, #mfma> %5 = tt.dot %1, %2, %4 : tensor<16x16xf16, #dotop0> * tensor<16x16xf16, #dotop1> -> tensor<16x16xf32, #mfma> // Store result to prevent DCE from removing all conversion related code - %6 = triton_gpu.local_alloc %5 : (tensor<16x16xf32, #mfma>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> + %6 = triton_gpu.local_alloc %5 : (tensor<16x16xf32, #mfma>) -> !triton_gpu.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> tt.return } } diff --git a/test/Conversion/amd/tritongpu_wmma_dot_to_llvm.mlir b/test/Conversion/amd/tritongpu_wmma_dot_to_llvm.mlir index 5eb856bb99..e7dcb873d0 100644 --- a/test/Conversion/amd/tritongpu_wmma_dot_to_llvm.mlir +++ b/test/Conversion/amd/tritongpu_wmma_dot_to_llvm.mlir @@ -5,22 +5,22 @@ #mma2 = #triton_gpu.amd_wmma<{version = 2, warpsPerCTA = [2, 2]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK-LABEL: wmma1_dot_operand - tt.func @wmma1_dot_operand(%arg0: !tt.memdesc<64x64xf16, #shared>) { + tt.func @wmma1_dot_operand(%arg0: !triton_gpu.memdesc<64x64xf16, #shared>) { // 2 CTA * 4 rep * load_per_thread_per_instr // CHECK-COUNT-8: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<16xf16> - %0 = triton_gpu.local_load %arg0 : !tt.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> + %0 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> // CHECK-COUNT-128: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<1xf16> - %1 = triton_gpu.local_load %arg0 : !tt.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> + %1 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> tt.return } // CHECK-LABEL: wmma2_dot_operand - tt.func @wmma2_dot_operand(%arg0: !tt.memdesc<64x64xf16, #shared>) { + tt.func @wmma2_dot_operand(%arg0: !triton_gpu.memdesc<64x64xf16, #shared>) { // 2 CTA * 4 rep * load_per_thread_per_instr // CHECK-COUNT-8: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<8xf16> - %0 = triton_gpu.local_load %arg0 : !tt.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma2, kWidth = 8}>> + %0 = 
triton_gpu.local_load %arg0 : !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma2, kWidth = 8}>> // CHECK-COUNT-64: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<1xf16> - %1 = triton_gpu.local_load %arg0 : !tt.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma2, kWidth = 8}>> + %1 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma2, kWidth = 8}>> tt.return } @@ -105,11 +105,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #mma1 = #triton_gpu.amd_wmma<{version = 1, warpsPerCTA = [2, 1, 4]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK-LABEL: wmma_dot_operand3d - tt.func @wmma_dot_operand3d(%arg0: !tt.memdesc<4x16x32xf16, #shared>) { + tt.func @wmma_dot_operand3d(%arg0: !triton_gpu.memdesc<4x16x32xf16, #shared>) { // CHECK-COUNT-4: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<16xf16> - %0 = triton_gpu.local_load %arg0 : !tt.memdesc<4x16x32xf16, #shared> -> tensor<4x16x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> + %0 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<4x16x32xf16, #shared> -> tensor<4x16x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> // CHECK-COUNT-32: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<1xf16> - %1 = triton_gpu.local_load %arg0 : !tt.memdesc<4x16x32xf16, #shared> -> tensor<4x16x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> + %1 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<4x16x32xf16, #shared> -> tensor<4x16x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> tt.return } diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir index b2b64014b0..3f2fd578da 100644 --- a/test/Conversion/tritongpu_to_llvm.mlir +++ b/test/Conversion/tritongpu_to_llvm.mlir @@ -447,7 +447,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: llvm.mlir.addressof @global_smem // CHECK-NEXT: llvm.getelementptr // CHECK-NEXT: llvm.mlir.constant - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory, mutable> tt.return } } @@ -477,8 +477,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: llvm.getelementptr %index = arith.constant 1 : i32 %zero = arith.constant 0 : i32 - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<128x16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.memdesc_subview %0[%index, %zero, %zero] : !tt.memdesc<128x16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.memdesc_subview %0[%index, %zero, %zero] : !triton_gpu.memdesc<128x16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> tt.return } } @@ -509,10 +509,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : %24 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice1d0> %59 = 
tt.addptr %58, %24 : tensor<64x!tt.ptr, #slice1d0>, tensor<64xi32, #slice1d0> %66 = tt.addptr %59, %cst_2 : tensor<64x!tt.ptr, #slice1d0>, tensor<64xi32, #slice1d0> - %71 = triton_gpu.local_alloc : () -> !tt.memdesc<2x64xi64, #shared2D, #triton_gpu.shared_memory, mutable> + %71 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<2x64xi64, #shared2D, #triton_gpu.shared_memory, mutable> %subview = triton_gpu.memdesc_subview %71[%c0_i32, %c0_i32] : - !tt.memdesc<2x64xi64, #shared2D, #triton_gpu.shared_memory, mutable> -> - !tt.memdesc<64xi64, #shared1D, #triton_gpu.shared_memory, mutable> + !triton_gpu.memdesc<2x64xi64, #shared2D, #triton_gpu.shared_memory, mutable> -> + !triton_gpu.memdesc<64xi64, #shared1D, #triton_gpu.shared_memory, mutable> // CHECK: llvm.inline_asm has_side_effects asm_dialect = att // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x8, 0x8 // CHECK: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x8, 0x8 @@ -523,7 +523,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x8, 0x8 // CHECK: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x8, 0x8 // CHECK: cp.async.commit_group - %73 = triton_gpu.async_copy_global_to_local %66, %subview : tensor<64x!tt.ptr, #slice1d0> -> !tt.memdesc<64xi64, #shared1D, #triton_gpu.shared_memory, mutable> + %73 = triton_gpu.async_copy_global_to_local %66, %subview : tensor<64x!tt.ptr, #slice1d0> -> !triton_gpu.memdesc<64xi64, #shared1D, #triton_gpu.shared_memory, mutable> triton_gpu.async_commit_group %73 tt.return } @@ -556,14 +556,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %off = arith.addi %broadcast_off0, %broadcast_off1 : tensor<16x64xi32, #AL> %a_init = tt.splat %arg0 : !tt.ptr -> tensor<16x64x!tt.ptr, #AL> %a_ptr = tt.addptr %a_init, %off : tensor<16x64x!tt.ptr, #AL>, tensor<16x64xi32, #AL> - %tensor = triton_gpu.local_alloc : () -> !tt.memdesc<16x64xf32, #A, #triton_gpu.shared_memory, mutable> + %tensor = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x64xf32, #A, #triton_gpu.shared_memory, mutable> %index = arith.constant 1 : i32 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "@${{.*}} cp.async.cg.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x10, 0x10;" // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "@${{.*}} cp.async.cg.shared.global [ ${{.*}} + 16 ], [ ${{.*}} + 0 ], 0x10, 0x10;" // CHECK: llvm.inline_asm has_side_effects asm_dialect = att // CHECK-SAME: cp.async.commit_group - %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<16x64x!tt.ptr, #AL> -> !tt.memdesc<16x64xf32, #A, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<16x64x!tt.ptr, #AL> -> !triton_gpu.memdesc<16x64xf32, #A, #triton_gpu.shared_memory, mutable> triton_gpu.async_commit_group tt.return } @@ -596,7 +596,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %off = arith.addi %broadcast_off0, %broadcast_off1 : tensor<16x32xi32, #AL> %a_init = tt.splat %arg0 : !tt.ptr -> tensor<16x32x!tt.ptr, #AL> %a_ptr = tt.addptr %a_init, %off : tensor<16x32x!tt.ptr, #AL>, tensor<16x32xi32, #AL> - %tensor = triton_gpu.local_alloc : () -> !tt.memdesc<16x32xf32, #A, #triton_gpu.shared_memory, mutable> + %tensor = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x32xf32, #A, #triton_gpu.shared_memory, 
mutable> %index = arith.constant 1 : i32 // CHECK: llvm.inline_asm @@ -609,7 +609,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.commit_group - %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<16x32x!tt.ptr, #AL> -> !tt.memdesc<16x32xf32, #A, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<16x32x!tt.ptr, #AL> -> !triton_gpu.memdesc<16x32xf32, #A, #triton_gpu.shared_memory, mutable> triton_gpu.async_commit_group tt.return } @@ -641,7 +641,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %off = arith.addi %broadcast_off0, %broadcast_off1 : tensor<32x32xi32, #AL> %a_init = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> %a_ptr = tt.addptr %a_init, %off : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> - %tensor = triton_gpu.local_alloc : () -> !tt.memdesc<32x32xf32, #A, #triton_gpu.shared_memory, mutable> + %tensor = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x32xf32, #A, #triton_gpu.shared_memory, mutable> %index = arith.constant 1 : i32 // CHECK: llvm.mlir.constant(0 : i32) : i32 @@ -665,7 +665,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.commit_group - %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<32x32x!tt.ptr, #AL> -> !tt.memdesc<32x32xf32, #A, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<32x32x!tt.ptr, #AL> -> !triton_gpu.memdesc<32x32xf32, #A, #triton_gpu.shared_memory, mutable> triton_gpu.async_commit_group tt.return } @@ -773,14 +773,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: convert_dot tt.func @convert_dot(%A: tensor<16x16xf16, #blocked0>, %B: tensor<16x16xf16, #blocked0>) { - %AA = triton_gpu.local_alloc %A : (tensor<16x16xf16, #blocked0>) -> !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> - %BB = triton_gpu.local_alloc %B : (tensor<16x16xf16, #blocked0>) -> !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> + %AA = triton_gpu.local_alloc %A : (tensor<16x16xf16, #blocked0>) -> !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> + %BB = triton_gpu.local_alloc %B : (tensor<16x16xf16, #blocked0>) -> !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> // CHECK: llvm.inline_asm // CHECK: ldmatrix.sync.aligned.m8n8.x4 // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4 - %AA_DOT = triton_gpu.local_load %AA : !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_a> - %BB_DOT = triton_gpu.local_load %BB : !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_b> + %AA_DOT = triton_gpu.local_load %AA : !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_a> + %BB_DOT = triton_gpu.local_load %BB : !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_b> %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma0> // CHECK: llvm.inline_asm 
@@ -812,12 +812,12 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: convert_dot_fp8 tt.func @convert_dot_fp8(%A: tensor<16x16xf8E5M2, #blocked0>, %B: tensor<16x16xf8E5M2, #blocked0>) { - %AA = triton_gpu.local_alloc %A : (tensor<16x16xf8E5M2, #blocked0>) -> !tt.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> - %BB = triton_gpu.local_alloc %B : (tensor<16x16xf8E5M2, #blocked0>) -> !tt.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> + %AA = triton_gpu.local_alloc %A : (tensor<16x16xf8E5M2, #blocked0>) -> !triton_gpu.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> + %BB = triton_gpu.local_alloc %B : (tensor<16x16xf8E5M2, #blocked0>) -> !triton_gpu.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4 - %AA_DOT = triton_gpu.local_load %AA : !tt.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf8E5M2, #dot_operand_a> - %BB_DOT = triton_gpu.local_load %BB : !tt.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf8E5M2, #dot_operand_b> + %AA_DOT = triton_gpu.local_load %AA : !triton_gpu.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf8E5M2, #dot_operand_a> + %BB_DOT = triton_gpu.local_load %BB : !triton_gpu.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf8E5M2, #dot_operand_b> %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma0> // CHECK: llvm.inline_asm @@ -1054,7 +1054,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK-SAME: !llvm.ptr<3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr<3> - %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !tt.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !triton_gpu.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> tt.return } } @@ -1111,11 +1111,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma, kWidth=2}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func @matmul_kernel_dot_operand_layout(%ptr:!tt.ptr {tt.divisibility = 16 : i32}, - %a:!tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory>, %b:!tt.memdesc<32x256xf16, #shared, #triton_gpu.shared_memory>) { + %a:!triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory>, %b:!triton_gpu.memdesc<32x256xf16, #shared, #triton_gpu.shared_memory>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma> // CHECK: ldmatrix.sync.aligned.m8n8.x4.shared.b16 - %a_mat = triton_gpu.local_load %a : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x32xf16, #dot_operand_a> - %b_mat = triton_gpu.local_load %b : !tt.memdesc<32x256xf16, #shared, #triton_gpu.shared_memory> -> tensor<32x256xf16, #dot_operand_b> + %a_mat = triton_gpu.local_load %a : !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x32xf16, #dot_operand_a> + %b_mat = triton_gpu.local_load %b : !triton_gpu.memdesc<32x256xf16, #shared, #triton_gpu.shared_memory> -> tensor<32x256xf16, #dot_operand_b> %28 = tt.dot %a_mat, %b_mat, %cst : tensor<128x32xf16, #dot_operand_a> * tensor<32x256xf16, #dot_operand_b> -> tensor<128x256xf32, #mma> 
%38 = triton_gpu.convert_layout %28 : tensor<128x256xf32, #mma> -> tensor<128x256xf32, #blocked> @@ -1135,11 +1135,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#blocked}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func @matmul_fmadot(%ptr:!tt.ptr {tt.divisibility = 16 : i32}, - %a:!tt.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory>, %b:!tt.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory>) { + %a:!triton_gpu.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory>, %b:!triton_gpu.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory>) { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> // CHECK: llvm.intr.fmuladd - %a_mat = triton_gpu.local_load %a : !tt.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory> -> tensor<32x16xf32, #dot_operand_a> - %b_mat = triton_gpu.local_load %b : !tt.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory> -> tensor<16x32xf32, #dot_operand_b> + %a_mat = triton_gpu.local_load %a : !triton_gpu.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory> -> tensor<32x16xf32, #dot_operand_a> + %b_mat = triton_gpu.local_load %b : !triton_gpu.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory> -> tensor<16x32xf32, #dot_operand_b> %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = ieee : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #blocked> %30 = tt.splat %ptr : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> @@ -1159,7 +1159,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: matmul_tf32dot tt.func @matmul_tf32dot(%ptr:!tt.ptr {tt.divisibility = 16 : i32}, - %a:!tt.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory>, %b:!tt.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory>) { + %a:!triton_gpu.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory>, %b:!triton_gpu.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory>) { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16 @@ -1167,8 +1167,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16 // CHECK-SAME: (i32, i32, i32, i32) - %a_mat = triton_gpu.local_load %a : !tt.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory> -> tensor<32x16xf32, #dot_operand_a> - %b_mat = triton_gpu.local_load %b : !tt.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory> -> tensor<16x32xf32, #dot_operand_b> + %a_mat = triton_gpu.local_load %a : !triton_gpu.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory> -> tensor<32x16xf32, #dot_operand_a> + %b_mat = triton_gpu.local_load %b : !triton_gpu.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory> -> tensor<16x32xf32, #dot_operand_b> // CHECK: llvm.inline_asm // CHECK-SAME: mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 @@ -1391,8 +1391,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK-LABEL: test_base_index_cache tt.func @test_base_index_cache(%arg0: tensor<128x32xf32, #blocked0>) { // CHECK: nvvm.read.ptx.sreg.tid.x - %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !tt.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> - %1 = 
triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !tt.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !triton_gpu.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !triton_gpu.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> tt.return } } @@ -1404,10 +1404,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK-LABEL: test_index_cache_different_block tt.func @test_index_cache_different_block(%arg0: tensor<128x32xf32, #blocked0>, %arg1: i1) { // CHECK: nvvm.read.ptx.sreg.tid.x - %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !tt.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !triton_gpu.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> cf.cond_br %arg1, ^bb1, ^bb2 ^bb1: // pred: ^bb0 - %1 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !tt.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !triton_gpu.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> cf.br ^bb2 ^bb2: // 2 preds: ^bb0, ^bb1 tt.return @@ -1648,16 +1648,16 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : tt.func @i16_mma_layout(%f16_inp: tensor<16x16xf16, #blocked0>, %i16_inp: tensor<16x16xi16, #blocked0>) { // CHECK-LABEL: @i16_mma_layout - %f16_shared = triton_gpu.local_alloc %f16_inp : (tensor<16x16xf16, #blocked0>) -> !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> - %i16_shared = triton_gpu.local_alloc %i16_inp : (tensor<16x16xi16, #blocked0>) -> !tt.memdesc<16x16xi16, #shared0, #triton_gpu.shared_memory> + %f16_shared = triton_gpu.local_alloc %f16_inp : (tensor<16x16xf16, #blocked0>) -> !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> + %i16_shared = triton_gpu.local_alloc %i16_inp : (tensor<16x16xi16, #blocked0>) -> !triton_gpu.memdesc<16x16xi16, #shared0, #triton_gpu.shared_memory> // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4 // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4 - %f16_dot = triton_gpu.local_load %f16_shared : !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_a> - %i16_dot = triton_gpu.local_load %i16_shared : !tt.memdesc<16x16xi16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xi16, #dot_operand_b> + %f16_dot = triton_gpu.local_load %f16_shared : !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_a> + %i16_dot = triton_gpu.local_load %i16_shared : !triton_gpu.memdesc<16x16xi16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xi16, #dot_operand_b> // CHECK: llvm.sitofp %{{.*}} : i16 to f16 @@ -1720,8 +1720,8 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: llvm.load // CHECK-SAME: {alignment = 8 : i64} : !llvm.ptr<3> -> vector<8xi8> // CHECK-NOT: llvm.load - tt.func public @vectorize_shmem_load(%shmem : !tt.memdesc<16x16xi8, #shared, #triton_gpu.shared_memory>) { - %0 = triton_gpu.local_load %shmem : !tt.memdesc<16x16xi8, #shared, #triton_gpu.shared_memory> -> tensor<16x16xi8, #blocked> + tt.func public @vectorize_shmem_load(%shmem : !triton_gpu.memdesc<16x16xi8, #shared, #triton_gpu.shared_memory>) { + %0 = 
triton_gpu.local_load %shmem : !triton_gpu.memdesc<16x16xi8, #shared, #triton_gpu.shared_memory> -> tensor<16x16xi8, #blocked> tt.return } } @@ -1736,7 +1736,7 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK-SAME: {alignment = 64 : i64} : vector<16xi32>, !llvm.ptr<3> // CHECK-NOT: llvm.store tt.func public @vectorize_shmem_store(%block : tensor<64x64xi32, #blocked>) { - %0 = triton_gpu.local_alloc %block : (tensor<64x64xi32, #blocked>) -> !tt.memdesc<64x64xi32, #shared, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %block : (tensor<64x64xi32, #blocked>) -> !triton_gpu.memdesc<64x64xi32, #shared, #triton_gpu.shared_memory> tt.return } } @@ -1761,9 +1761,9 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // CHECK: llvm.extractelement {{.*}} : vector<8xbf16> tt.func public @test_local_load_bf16() { %c0_i32 = arith.constant 0 : i32 - %19 = triton_gpu.local_alloc : () -> !tt.memdesc<1x1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> - %22 = triton_gpu.memdesc_subview %19[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> - %39 = triton_gpu.local_load %22 : !tt.memdesc<1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<1x2048xbf16, #blocked> + %19 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> + %22 = triton_gpu.memdesc_subview %19[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> + %39 = triton_gpu.local_load %22 : !triton_gpu.memdesc<1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<1x2048xbf16, #blocked> %40 = arith.extf %39 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> tt.return } @@ -1777,8 +1777,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: llvm.store tt.func public @test_local_store(%arg0: tensor<1xf32, #blocked>) { %c0_i32 = arith.constant 0 : i32 - %0 = triton_gpu.local_alloc {allocation.offset = 0 : i32} : () -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %arg0, %0 : tensor<1xf32, #blocked> -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc {allocation.offset = 0 : i32} : () -> !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg0, %0 : tensor<1xf32, #blocked> -> !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> tt.return } } @@ -1791,9 +1791,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: llvm.store tt.func public @test_local_store_subview(%arg0: tensor<1xf32, #blocked>) { %c0_i32 = arith.constant 0 : i32 - %0 = triton_gpu.local_alloc {allocation.offset = 0 : i32} : () -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> - %sv = triton_gpu.memdesc_subview %0[%c0_i32] : !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %arg0, %sv : tensor<1xf32, #blocked> -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc {allocation.offset = 0 : i32} : () -> !triton_gpu.memdesc<1xf32, #shared, 
#triton_gpu.shared_memory, mutable> + %sv = triton_gpu.memdesc_subview %0[%c0_i32] : !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg0, %sv : tensor<1xf32, #blocked> -> !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> tt.return } } diff --git a/test/Conversion/tritongpu_to_llvm_hopper.mlir b/test/Conversion/tritongpu_to_llvm_hopper.mlir index a8603f4a85..1f35d8fdd6 100644 --- a/test/Conversion/tritongpu_to_llvm_hopper.mlir +++ b/test/Conversion/tritongpu_to_llvm_hopper.mlir @@ -5,7 +5,7 @@ #shared1 = #triton_gpu.shared<{vec = 16, perPhase = 4, maxPhase = 2, order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], hasLeadingOffset = true}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32} { // CHECK-LABEL: @dot_high_precision_acc - tt.func @dot_high_precision_acc(%a: !tt.memdesc<128x128xf8E5M2, #shared>, %b: !tt.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { + tt.func @dot_high_precision_acc(%a: !triton_gpu.memdesc<128x128xf8E5M2, #shared>, %b: !triton_gpu.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { // CHECK: nvgpu.wgmma // CHECK-COUNT-128: llvm.fadd // CHECK: nvgpu.wgmma @@ -16,7 +16,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK-COUNT-128: llvm.fadd %m = triton_nvidia_gpu.warp_group_dot %a, %b, %c {maxNumImpreciseAcc = 32 : i32, inputPrecision = 0 : i32} : - !tt.memdesc<128x128xf8E5M2, #shared> * !tt.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> + !triton_gpu.memdesc<128x128xf8E5M2, #shared> * !triton_gpu.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> tt.return } } @@ -28,7 +28,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : #shared1 = #triton_gpu.shared<{vec = 16, perPhase = 4, maxPhase = 2, order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], hasLeadingOffset = true}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32} { // CHECK-LABEL: @dot_low_precision_acc - tt.func @dot_low_precision_acc(%a: !tt.memdesc<128x128xf8E5M2, #shared>, %b: !tt.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { + tt.func @dot_low_precision_acc(%a: !triton_gpu.memdesc<128x128xf8E5M2, #shared>, %b: !triton_gpu.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { // CHECK: nvgpu.wgmma // CHECK-NOT: llvm.fadd // CHECK: nvgpu.wgmma @@ -40,7 +40,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK: llvm.return %m = triton_nvidia_gpu.warp_group_dot %a, %b, %c {maxNumImpreciseAcc = 129 : i32, inputPrecision = 0 : i32} : - !tt.memdesc<128x128xf8E5M2, #shared> * !tt.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> + !triton_gpu.memdesc<128x128xf8E5M2, #shared> * !triton_gpu.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> tt.return } } @@ -52,7 +52,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : #shared1 = #triton_gpu.shared<{vec = 16, perPhase = 4, maxPhase = 2, order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], hasLeadingOffset = true}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32} { // CHECK-LABEL: @dot_mix_precision_acc - tt.func @dot_mix_precision_acc(%a: !tt.memdesc<128x128xf8E5M2, 
#shared>, %b: !tt.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { + tt.func @dot_mix_precision_acc(%a: !triton_gpu.memdesc<128x128xf8E5M2, #shared>, %b: !triton_gpu.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { // CHECK: nvgpu.wgmma // CHECK-NOT: llvm.fadd // CHECK: nvgpu.wgmma @@ -64,7 +64,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK: llvm.return %m = triton_nvidia_gpu.warp_group_dot %a, %b, %c {maxNumImpreciseAcc = 64 : i32, inputPrecision = 0 : i32} : - !tt.memdesc<128x128xf8E5M2, #shared> * !tt.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> + !triton_gpu.memdesc<128x128xf8E5M2, #shared> * !triton_gpu.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> tt.return } } @@ -78,10 +78,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: @dot_zero_acc // Generate a wgmma with 2 sources. // CHECK: nvgpu.wgmma %{{.*}}, %{{.*}} { - tt.func @dot_zero_acc(%a: !tt.memdesc<128x64xf16, #shared>, %b: !tt.memdesc<64x64xf16, #shared1>) { + tt.func @dot_zero_acc(%a: !triton_gpu.memdesc<128x64xf16, #shared>, %b: !triton_gpu.memdesc<64x64xf16, #shared1>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> %m = triton_nvidia_gpu.warp_group_dot %a, %b, %cst {inputPrecision = 0 : i32, maxNumImpreciseAcc = 0 : i32} : - !tt.memdesc<128x64xf16, #shared> * !tt.memdesc<64x64xf16, #shared1> -> tensor<128x64xf32, #mma> + !triton_gpu.memdesc<128x64xf16, #shared> * !triton_gpu.memdesc<64x64xf16, #shared1> -> tensor<128x64xf32, #mma> tt.return } } @@ -95,11 +95,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // Generate a wgmma where the first operand is a struct. // CHECK: nvgpu.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: nvgpu.wgmma_wait_group %{{.*}} {pendings = 0 : i32} : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> - tt.func @dot_reg_operand_A(%a: tensor<128x64xf16, #mma>, %b: !tt.memdesc<64x64xf16, #shared>) { + tt.func @dot_reg_operand_A(%a: tensor<128x64xf16, #mma>, %b: !triton_gpu.memdesc<64x64xf16, #shared>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> %opA = triton_gpu.convert_layout %a : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> %m = triton_nvidia_gpu.warp_group_dot %opA, %b, %cst { inputPrecision = 0 : i32 }: - tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> tt.return } } @@ -114,10 +114,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // Generate a wgmma where the first operand is a struct. 
// CHECK: nvgpu.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: nvgpu.wgmma_wait_group %{{.*}} {pendings = 0 : i32} - tt.func @dot_reg_operand_A_fp8(%a: tensor<128x128xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %b: !tt.memdesc<128x256xf8E5M2, #shared>) { + tt.func @dot_reg_operand_A_fp8(%a: tensor<128x128xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %b: !triton_gpu.memdesc<128x256xf8E5M2, #shared>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma1> %m = triton_nvidia_gpu.warp_group_dot %a, %b, %cst { maxNumImpreciseAcc = 1073741824 : i32, inputPrecision = 0 : i32 } : - tensor<128x128xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * !tt.memdesc<128x256xf8E5M2, #shared> -> tensor<128x256xf32, #mma1> + tensor<128x128xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * !triton_gpu.memdesc<128x256xf8E5M2, #shared> -> tensor<128x256xf32, #mma1> tt.return } } @@ -128,10 +128,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 64, 16]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], hasLeadingOffset = true}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - tt.func @dot_reg_operand_upcast(%a_desc: !tt.memdesc<128x64xi8, #shared>, %b: !tt.memdesc<64x64xf16, #shared>, %acc: tensor<128x64xf32, #mma>) { - %a_dotop = triton_gpu.local_load %a_desc : !tt.memdesc<128x64xi8, #shared> -> tensor<128x64xi8, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + tt.func @dot_reg_operand_upcast(%a_desc: !triton_gpu.memdesc<128x64xi8, #shared>, %b: !triton_gpu.memdesc<64x64xf16, #shared>, %acc: tensor<128x64xf32, #mma>) { + %a_dotop = triton_gpu.local_load %a_desc : !triton_gpu.memdesc<128x64xi8, #shared> -> tensor<128x64xi8, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> %a_casted = arith.sitofp %a_dotop : tensor<128x64xi8, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> to tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %res = triton_nvidia_gpu.warp_group_dot %a_casted, %b, %acc : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + %res = triton_nvidia_gpu.warp_group_dot %a_casted, %b, %acc : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> tt.return } } @@ -220,10 +220,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, 
"triton_gpu.num-warps" = 4 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: dot_zero_acc_operand // CHECK-COUNT-128: llvm.fadd - tt.func @dot_zero_acc_operand(%a: !tt.memdesc<128x128xf8E5M2, #shared>, %b: !tt.memdesc<128x128xf8E5M2, #shared1>) { + tt.func @dot_zero_acc_operand(%a: !triton_gpu.memdesc<128x128xf8E5M2, #shared>, %b: !triton_gpu.memdesc<128x128xf8E5M2, #shared1>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> %m = triton_nvidia_gpu.warp_group_dot %a, %b, %cst {maxNumImpreciseAcc = 64 : i32, inputPrecision = 0 : i32} : - !tt.memdesc<128x128xf8E5M2, #shared> * !tt.memdesc<128x128xf8E5M2, #shared1> -> tensor<128x128xf32, #mma> + !triton_gpu.memdesc<128x128xf8E5M2, #shared> * !triton_gpu.memdesc<128x128xf8E5M2, #shared1> -> tensor<128x128xf32, #mma> tt.return } } @@ -238,7 +238,7 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : tt.func @distribute_to_shared_st_matrix(%a: tensor<128x128xf16, #mma>) { // CHECK-COUNT-16: nvgpu.stmatrix // CHECK: llvm.return - %b = triton_gpu.local_alloc %a {allocation.offset = 0 : i32} : (tensor<128x128xf16, #mma>) -> !tt.memdesc<128x128xf16, #shared, mutable> + %b = triton_gpu.local_alloc %a {allocation.offset = 0 : i32} : (tensor<128x128xf16, #mma>) -> !triton_gpu.memdesc<128x128xf16, #shared, mutable> tt.return } } diff --git a/test/Conversion/tritonnvidiagpu_to_llvm.mlir b/test/Conversion/tritonnvidiagpu_to_llvm.mlir index 0bcab369f7..c7cc5fa5db 100644 --- a/test/Conversion/tritonnvidiagpu_to_llvm.mlir +++ b/test/Conversion/tritonnvidiagpu_to_llvm.mlir @@ -3,9 +3,9 @@ #shared0 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: init_barrier - tt.func @init_barrier(%alloc: !tt.memdesc<1xi64, #shared0>) { + tt.func @init_barrier(%alloc: !triton_gpu.memdesc<1xi64, #shared0>) { // CHECK: "@$0 mbarrier.init.shared::cta.b64 [$1], 1;", "b,r" %{{.*}}, %{{.*}} : (i1, !llvm.ptr<3>) -> !llvm.void - triton_nvidia_gpu.init_barrier %alloc, 1 : !tt.memdesc<1xi64, #shared0> + triton_nvidia_gpu.init_barrier %alloc, 1 : !triton_gpu.memdesc<1xi64, #shared0> tt.return } } @@ -15,11 +15,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #shared0 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: wait_barrier - tt.func @wait_barrier(%alloc: !tt.memdesc<1xi64, #shared0>, %phase: i32) { + tt.func @wait_barrier(%alloc: !triton_gpu.memdesc<1xi64, #shared0>, %phase: i32) { // CHECK: waitLoop: // CHECK: mbarrier.try_wait.parity.shared.b64 // CHECK: @!P1 bra.uni waitLoop - triton_nvidia_gpu.wait_barrier %alloc, %phase : !tt.memdesc<1xi64, #shared0> + triton_nvidia_gpu.wait_barrier %alloc, %phase : !triton_gpu.memdesc<1xi64, #shared0> tt.return } } @@ -35,8 +35,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: "@$0 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$1], [$2, {$3, $4}], [$5];", "b,r,l,r,r,r" {{.*}} : (i1, !llvm.ptr<3>, !llvm.ptr<1>, i32, i32, !llvm.ptr<3>) -> !llvm.void // CHECK-NOT: cp.async.bulk.tensor.2d.shared // CHECK: return - tt.func @tma_copy_global_to_local(%tma: !tt.ptr, %alloc: 
!tt.memdesc<128x128xf32, #shared1, mutable>, %x: i32, %barrier: !tt.memdesc<1xi64, #shared0>, %pred: i1) { - triton_nvidia_gpu.async_tma_copy_global_to_local %tma[%x, %x] %alloc, %barrier, %pred : !tt.ptr, !tt.memdesc<1xi64, #shared0> -> !tt.memdesc<128x128xf32, #shared1, mutable> + tt.func @tma_copy_global_to_local(%tma: !tt.ptr, %alloc: !triton_gpu.memdesc<128x128xf32, #shared1, mutable>, %x: i32, %barrier: !triton_gpu.memdesc<1xi64, #shared0>, %pred: i1) { + triton_nvidia_gpu.async_tma_copy_global_to_local %tma[%x, %x] %alloc, %barrier, %pred : !tt.ptr, !triton_gpu.memdesc<1xi64, #shared0> -> !triton_gpu.memdesc<128x128xf32, #shared1, mutable> tt.return } } @@ -50,7 +50,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: "@$0 cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$1, {$2, $3}], [$4];", "b,l,r,r,r" {{.*}} : (i1, !llvm.ptr<1>, i32, i32, !llvm.ptr<3>) -> !llvm.void // CHECK-NOT: cp.async.bulk.tensor.2d.global.shared::cta.bulk_group // CHECK: cp.async.bulk.commit_group - tt.func @tma_copy_local_to_global(%tma: !tt.ptr, %alloc: !tt.memdesc<128x128xf32, #shared1>, %x: i32) { + tt.func @tma_copy_local_to_global(%tma: !tt.ptr, %alloc: !triton_gpu.memdesc<128x128xf32, #shared1>, %x: i32) { triton_nvidia_gpu.async_tma_copy_local_to_global %tma[%x, %x] %alloc : , <128x128xf32, #shared1> tt.return } @@ -74,7 +74,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: expect_barrier // CHECK: @$0 mbarrier.arrive.expect_tx.shared.b64 _, [$1], 16384; - tt.func @expect_barrier(%barrier: !tt.memdesc<1xi64, #shared0, mutable>, %pred: i1) { + tt.func @expect_barrier(%barrier: !triton_gpu.memdesc<1xi64, #shared0, mutable>, %pred: i1) { triton_nvidia_gpu.barrier_expect %barrier, 16384, %pred : <1xi64, #shared0, mutable> tt.return } diff --git a/test/Triton/invalid.mlir b/test/Triton/invalid.mlir index d121d285d3..c7fb41707e 100644 --- a/test/Triton/invalid.mlir +++ b/test/Triton/invalid.mlir @@ -277,9 +277,9 @@ tt.func public @fn(%arg0: tensor<2x4x8x16xf32, #blocked>, %arg1: tensor<16x32x64 #shared2 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], CTAsPerCGA = [1, 2], CTASplitNum = [2, 4], CTAOrder = [0, 1], hasLeadingOffset = true}> #shared3 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], CTAsPerCGA = [2, 1], CTASplitNum = [4, 2], CTAOrder = [1, 0], hasLeadingOffset = true}> module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 8 : i32, "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { -tt.func public @fn(%arg0: !tt.memdesc<2x4x8x16xf32, #shared>, %arg1: !tt.memdesc<16x32xf32, #shared2>) { - %a = triton_gpu.memdesc_trans %arg0 {order = array} : !tt.memdesc<2x4x8x16xf32, #shared> -> !tt.memdesc<4x16x8x2xf32, #shared1> - %b = triton_gpu.memdesc_trans %arg1 {order = array} : !tt.memdesc<16x32xf32, #shared2> -> !tt.memdesc<32x16xf32, #shared3> +tt.func public @fn(%arg0: !triton_gpu.memdesc<2x4x8x16xf32, #shared>, %arg1: !triton_gpu.memdesc<16x32xf32, #shared2>) { + %a = triton_gpu.memdesc_trans %arg0 {order = array} : !triton_gpu.memdesc<2x4x8x16xf32, #shared> -> !triton_gpu.memdesc<4x16x8x2xf32, #shared1> + %b = triton_gpu.memdesc_trans %arg1 {order = array} : !triton_gpu.memdesc<16x32xf32, #shared2> -> !triton_gpu.memdesc<32x16xf32, #shared3> tt.return } } // end module diff --git 
a/test/TritonGPU/accumulator-init.mlir b/test/TritonGPU/accumulator-init.mlir index 3026c5b162..e73934818a 100644 --- a/test/TritonGPU/accumulator-init.mlir +++ b/test/TritonGPU/accumulator-init.mlir @@ -11,14 +11,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: @constant_init // CHECK-DAG: %[[FALSE:.+]] = arith.constant false // CHECK: triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, %[[FALSE]] - tt.func @constant_init(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @constant_init(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %cst_2 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %cst_2 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> scf.yield %acc: tensor<128x16xf32, #mma1> } tt.return %17 : tensor<128x16xf32, #mma1> @@ -27,14 +27,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: @constant_init_integer // CHECK-DAG: %[[FALSE:.+]] = arith.constant false // CHECK: triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, %[[FALSE]] - tt.func @constant_init_integer(%A: !tt.memdesc<128x64xi8, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xi8, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xi32, #mma1> { + tt.func @constant_init_integer(%A: !triton_gpu.memdesc<128x64xi8, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xi8, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xi32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0> : tensor<128x16xi32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xi32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %cst_2 : !tt.memdesc<128x64xi8, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xi8, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xi32, 
#mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %cst_2 : !triton_gpu.memdesc<128x64xi8, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xi8, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xi32, #mma1> scf.yield %acc: tensor<128x16xi32, #mma1> } tt.return %17 : tensor<128x16xi32, #mma1> @@ -53,14 +53,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: else // CHECK: scf.yield %[[ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[USE_ACC_NEXT]] - tt.func @if_after_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @if_after_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { scf.yield %cst_2 : tensor<128x16xf32, #mma1> } else { @@ -84,14 +84,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: else // CHECK: scf.yield %[[ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[USE_ACC_NEXT]] - tt.func @if_after_mma_invert(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @if_after_mma_invert(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> 
tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { scf.yield %acc : tensor<128x16xf32, #mma1> } else { @@ -115,7 +115,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: scf.yield %[[ACC]] // CHECK: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, %[[ACC_CND]], %[[USE_ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[TRUE]] - tt.func @if_before_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @if_before_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 @@ -127,7 +127,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } else { scf.yield %arg4 : tensor<128x16xf32, #mma1> } - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> scf.yield %acc: tensor<128x16xf32, #mma1> } tt.return %17 : tensor<128x16xf32, #mma1> @@ -146,7 +146,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: scf.yield %[[ACC]] // CHECK: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, %[[ACC_CND]], %[[USE_ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[TRUE]] - tt.func @if_before_mma_invert(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @if_before_mma_invert(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 @@ -158,7 +158,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } else { scf.yield %cst_2 : tensor<128x16xf32, #mma1> } - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !tt.memdesc<128x64xf16, 
#shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> scf.yield %acc: tensor<128x16xf32, #mma1> } tt.return %17 : tensor<128x16xf32, #mma1> @@ -173,14 +173,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, %[[ACC]], %[[USE_ACC]] // CHECK: %[[USE_ACC_NEXT:.*]] = arith.select %[[CND]], %[[FALSE]], %[[TRUE]] // CHECK: scf.yield {{.*}}, %[[USE_ACC_NEXT]] - tt.func @sel_after_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @sel_after_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = arith.select %cnd, %cst_2, %acc : tensor<128x16xf32, #mma1> scf.yield %acc_: tensor<128x16xf32, #mma1> } @@ -196,7 +196,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[USE_ACC_NEXT:.*]] = arith.select %[[CND]], %[[FALSE]], %[[USE_ACC]] // CHECK: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, %[[ACC]], %[[USE_ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[TRUE]] - tt.func @sel_before_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @sel_before_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 @@ 
-204,7 +204,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 %acc_ = arith.select %cnd, %cst_2, %arg4 : tensor<128x16xf32, #mma1> - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> scf.yield %acc: tensor<128x16xf32, #mma1> } tt.return %17 : tensor<128x16xf32, #mma1> @@ -230,7 +230,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: else // CHECK: scf.yield %[[ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[TRUE]] - tt.func @if_before_and_after_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @if_before_and_after_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 @@ -242,7 +242,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } else { scf.yield %arg4 : tensor<128x16xf32, #mma1> } - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_0 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_0 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_1 = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { scf.yield %cst_2 : tensor<128x16xf32, #mma1> } else { @@ -270,14 +270,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: else // CHECK: scf.yield %[[ACC_CND]] // CHECK: scf.yield {{.*}}, %[[USE_ACC_NEXT]] - tt.func @two_ifs_after_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @two_ifs_after_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = 
arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_0 = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { scf.yield %cst_2 : tensor<128x16xf32, #mma1> } else { @@ -296,15 +296,15 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // Check that we bail out in unsupported cases // CHECK-LABEL: @non_zero_init -// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !tt.memdesc - tt.func @non_zero_init(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { +// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !triton_gpu.memdesc + tt.func @non_zero_init(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<1.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = arith.select %cnd, %cst_2, %acc : tensor<128x16xf32, #mma1> scf.yield %acc_: tensor<128x16xf32, #mma1> } @@ -312,15 +312,15 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // CHECK-LABEL: @zero_init_dist_2 -// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !tt.memdesc - tt.func @zero_init_dist_2(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { +// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, 
{{.*}}, {{.*}}, {{.*}} : !triton_gpu.memdesc + tt.func @zero_init_dist_2(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17:2 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %cst_2) -> (tensor<128x16xf32, #mma1>, tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg5 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg5 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = arith.select %cnd, %cst_2, %acc : tensor<128x16xf32, #mma1> scf.yield %acc_, %arg4: tensor<128x16xf32, #mma1>, tensor<128x16xf32, #mma1> } @@ -328,8 +328,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // CHECK-LABEL: @if_defines_alternative -// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !tt.memdesc - tt.func @if_defines_alternative(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { +// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !triton_gpu.memdesc + tt.func @if_defines_alternative(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %cst_3 = arith.constant dense<1.000000e+00> : tensor<128x16xf32, #mma1> @@ -337,7 +337,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { scf.yield %cst_2 : tensor<128x16xf32, #mma1> } else { @@ -350,15 +350,15 @@ module attributes 
{"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // CHECK-LABEL: @non_cond_override -// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !tt.memdesc - tt.func @non_cond_override(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { +// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !triton_gpu.memdesc + tt.func @non_cond_override(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %cst_3 = arith.constant dense<1.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = arith.addf %acc, %cst_3 : tensor<128x16xf32, #mma1> scf.yield %acc_: tensor<128x16xf32, #mma1> } @@ -367,15 +367,15 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // If the condition is a tensor skip the optimization. 
// CHECK-LABEL: @negative_sel_tensor -// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !tt.memdesc - tt.func @negative_sel_tensor(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %cnd: tensor<128x16xi1, #mma1>) -> tensor<128x16xf32, #mma1> { +// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !triton_gpu.memdesc + tt.func @negative_sel_tensor(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %cnd: tensor<128x16xi1, #mma1>) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %acc_ = arith.select %cnd, %cst_2, %arg4 : tensor<128x16xi1, #mma1>, tensor<128x16xf32, #mma1> - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> scf.yield %acc: tensor<128x16xf32, #mma1> } tt.return %17 : tensor<128x16xf32, #mma1> diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir index 5dfd0f2a5f..51353d31c3 100644 --- a/test/TritonGPU/amd/amd-reorder-instructions.mlir +++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir @@ -34,10 +34,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : %54:1 = scf.for %arg21 = %c0_i32 to %arg20 step %c128_i32 iter_args(%arg26 = %c0_i64) -> (i64) : i32 { %73 = tt.splat %3 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked2> %74 = tt.load %73 : tensor<128x128x!tt.ptr, #blocked2> - %75 = triton_gpu.local_alloc %45 : (tensor<256x128xf16, #blocked1>) -> !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> - %76 = triton_gpu.local_load %75 : !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> - %77 = triton_gpu.local_alloc %74 : (tensor<128x128xf16, #blocked2>) -> !tt.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> - %78 = triton_gpu.local_load %77 : !tt.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> + %75 = triton_gpu.local_alloc %45 : (tensor<256x128xf16, #blocked1>) -> !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> + %76 = triton_gpu.local_load %75 : !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> + %77 = triton_gpu.local_alloc %74 : (tensor<128x128xf16, #blocked2>) -> !triton_gpu.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], 
hasLeadingOffset = false}>, #triton_gpu.shared_memory> + %78 = triton_gpu.local_load %77 : !triton_gpu.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> %79 = tt.dot %76, %78, %cst_2 : tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> * tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> -> tensor<256x128xf32, #mfma> %107 = arith.addi %arg26, %c128_i64 : i64 scf.yield %107 : i64 @@ -78,10 +78,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : %cst_2 = arith.constant dense<0.000000e+00> : tensor<256x128xf32, #mfma> %73 = tt.splat %3 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked2> %74 = tt.load %73 : tensor<128x128x!tt.ptr, #blocked2> - %75 = triton_gpu.local_alloc %45 : (tensor<256x128xf16, #blocked1>) -> !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> - %76 = triton_gpu.local_load %75 : !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> - %77 = triton_gpu.local_alloc %74 : (tensor<128x128xf16, #blocked2>) -> !tt.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> - %78 = triton_gpu.local_load %77 : !tt.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> + %75 = triton_gpu.local_alloc %45 : (tensor<256x128xf16, #blocked1>) -> !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> + %76 = triton_gpu.local_load %75 : !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> + %77 = triton_gpu.local_alloc %74 : (tensor<128x128xf16, #blocked2>) -> !triton_gpu.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> + %78 = triton_gpu.local_load %77 : !triton_gpu.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> %79 = tt.dot %76, %78, %cst_2 : tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> * tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> -> tensor<256x128xf32, #mfma> %107 = arith.addi %arg26, %c128_i64 : i64 scf.yield %107 : i64 @@ -104,10 +104,10 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war tt.func public @order_load_alloc_local_load_local_store(%arg0: tensor<32x32x!tt.ptr, #blocked>) attributes {noinline = false} { %9 = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %10 = triton_gpu.local_alloc : () -> !tt.memdesc<32x32xf32, #shared, mutable> - triton_gpu.local_store %9, %10 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared, mutable> + %10 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x32xf32, #shared, mutable> + triton_gpu.local_store %9, %10 : tensor<32x32xf32, 
#blocked> -> !triton_gpu.memdesc<32x32xf32, #shared, mutable> %cst_0 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %11 = triton_gpu.local_load %10 : !tt.memdesc<32x32xf32, #shared, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %11 = triton_gpu.local_load %10 : !triton_gpu.memdesc<32x32xf32, #shared, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> %12 = tt.dot %11, %cst_0, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<32x32xf32, #mma> %13 = triton_gpu.convert_layout %12 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> tt.store %arg0, %13 : tensor<32x32x!tt.ptr, #blocked> @@ -222,22 +222,22 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %7 = tt.expand_dims %6 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> %8 = tt.broadcast %7 : tensor<1x128xi32, #blocked> -> tensor<32x128xi32, #blocked> %9 = tt.addptr %5, %8 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> - %10 = triton_gpu.local_alloc : () -> !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %11 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %10 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %11 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> %12 = arith.cmpi slt, %arg0, %arg1 : index %13 = tt.splat %12 : i1 -> tensor<128x32xi1, #blocked1> %14 = tt.load %4, %13 : tensor<128x32x!tt.ptr, #blocked1> %15 = tt.splat %12 : i1 -> tensor<32x128xi1, #blocked> %16 = tt.load %9, %15, %cst_3 : tensor<32x128x!tt.ptr, #blocked> - %17 = triton_gpu.memdesc_subview %10[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %14, %17 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %18 = triton_gpu.memdesc_subview %11[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %16, %18 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - %19:6 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %4, %arg7 = %9, %arg8 = %cst_2, %arg9 = %c0_i32, %arg10 = %17, %arg11 = %18) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + %17 = triton_gpu.memdesc_subview %10[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %14, %17 : tensor<128x32xf16, #blocked1> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %18 = triton_gpu.memdesc_subview %11[%c0_i32, %c0_i32, %c0_i32] 
: !triton_gpu.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %16, %18 : tensor<32x128xf16, #blocked> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %19:6 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %4, %arg7 = %9, %arg8 = %cst_2, %arg9 = %c0_i32, %arg10 = %17, %arg11 = %18) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %20 = arith.subi %arg1, %arg2 : index %21 = arith.cmpi slt, %arg5, %20 : index - %22 = triton_gpu.local_load %arg10 : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %23 = triton_gpu.local_load %arg11 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %22 = triton_gpu.local_load %arg10 : !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %23 = triton_gpu.local_load %arg11 : !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %24 = arith.mulf %23, %cst : tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %25 = tt.dot %22, %24, %arg8 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> %26 = tt.addptr %arg6, %cst_1 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> @@ -249,14 +249,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %32 = arith.addi %arg9, %c1_i32 : i32 %33 = arith.cmpi slt, %32, %c1_i32 : i32 %34 = arith.select %33, %32, %c0_i32 : i32 - %35 = triton_gpu.memdesc_subview %10[%34, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %29, %35 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %36 = triton_gpu.memdesc_subview %11[%34, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %31, %36 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - scf.yield %26, %27, %25, %34, %35, %36 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %35 = triton_gpu.memdesc_subview %10[%34, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %29, %35 : tensor<128x32xf16, #blocked1> -> !triton_gpu.memdesc<128x32xf16, #shared, 
#triton_gpu.shared_memory, mutable> + %36 = triton_gpu.memdesc_subview %11[%34, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %31, %36 : tensor<32x128xf16, #blocked> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + scf.yield %26, %27, %25, %34, %35, %36 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> } - triton_gpu.local_dealloc %10 : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %11 : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %10 : !triton_gpu.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %11 : !triton_gpu.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> tt.return %19#2 : tensor<128x128xf32, #mma> } @@ -313,8 +313,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %7 = tt.expand_dims %6 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> %8 = tt.broadcast %7 : tensor<1x128xi32, #blocked> -> tensor<32x128xi32, #blocked> %9 = tt.addptr %5, %8 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> - %10 = triton_gpu.local_alloc : () -> !tt.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %11 = triton_gpu.local_alloc : () -> !tt.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %10 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %11 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> %12 = arith.cmpi slt, %arg0, %arg1 : index %13 = tt.splat %12 : i1 -> tensor<128x32xi1, #blocked1> %14 = tt.load %4, %13 : tensor<128x32x!tt.ptr, #blocked1> @@ -328,16 +328,16 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %22 = tt.load %19, %21 : tensor<128x32x!tt.ptr, #blocked1> %23 = tt.splat %18 : i1 -> tensor<32x128xi1, #blocked> %24 = tt.load %20, %23, %cst_3 : tensor<32x128x!tt.ptr, #blocked> - %25 = triton_gpu.memdesc_subview %10[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %14, %25 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %26 = triton_gpu.memdesc_subview %11[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %16, %26 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - %27:8 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %19, %arg7 = %20, %arg8 = %cst_2, %arg9 = %c0_i32, %arg10 = %25, %arg11 = %26, %arg12 = %22, %arg13 = %24) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, 
!tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>, tensor<128x32xf16, #blocked1>, tensor<32x128xf16, #blocked>) { + %25 = triton_gpu.memdesc_subview %10[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %14, %25 : tensor<128x32xf16, #blocked1> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %26 = triton_gpu.memdesc_subview %11[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %16, %26 : tensor<32x128xf16, #blocked> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %27:8 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %19, %arg7 = %20, %arg8 = %cst_2, %arg9 = %c0_i32, %arg10 = %25, %arg11 = %26, %arg12 = %22, %arg13 = %24) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>, tensor<128x32xf16, #blocked1>, tensor<32x128xf16, #blocked>) { %28 = arith.muli %arg2, %c2 : index %29 = arith.subi %arg1, %28 : index %30 = arith.cmpi slt, %arg5, %29 : index - %31 = triton_gpu.local_load %arg10 : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %32 = triton_gpu.local_load %arg11 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %31 = triton_gpu.local_load %arg10 : !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %32 = triton_gpu.local_load %arg11 : !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %33 = arith.mulf %32, %cst : tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %34 = tt.dot %31, %33, %arg8 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> %35 = tt.addptr %arg6, %cst_1 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> @@ -349,14 +349,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %41 = arith.addi %arg9, %c1_i32 : i32 %42 = arith.cmpi slt, %41, %c2_i32 : i32 %43 = arith.select %42, %41, %c0_i32 : i32 - %44 = triton_gpu.memdesc_subview %10[%43, %c0_i32, %c0_i32] : !tt.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %arg12, %44 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %45 = triton_gpu.memdesc_subview %11[%43, %c0_i32, %c0_i32] : !tt.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %arg13, %45 : tensor<32x128xf16, #blocked> -> 
!tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - scf.yield %35, %36, %34, %43, %44, %45, %38, %40 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>, tensor<128x32xf16, #blocked1>, tensor<32x128xf16, #blocked> + %44 = triton_gpu.memdesc_subview %10[%43, %c0_i32, %c0_i32] : !triton_gpu.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg12, %44 : tensor<128x32xf16, #blocked1> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %45 = triton_gpu.memdesc_subview %11[%43, %c0_i32, %c0_i32] : !triton_gpu.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg13, %45 : tensor<32x128xf16, #blocked> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + scf.yield %35, %36, %34, %43, %44, %45, %38, %40 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>, tensor<128x32xf16, #blocked1>, tensor<32x128xf16, #blocked> } - triton_gpu.local_dealloc %10 : !tt.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %11 : !tt.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %10 : !triton_gpu.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %11 : !triton_gpu.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> tt.return %27#2 : tensor<128x128xf32, #mma> } @@ -404,8 +404,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %c0 = arith.constant 0 : index %c1_i32 = arith.constant 1 : i32 %cst_0 = arith.constant dense<1> : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> %2 = arith.cmpi sgt, %arg1, %c0 : index %3 = tt.splat %2 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> %4 = tt.load %arg3, %3 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> @@ -421,17 +421,17 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %14 = tt.load %12, %13 : tensor<16x16x!tt.ptr, #blocked> %15 = tt.splat %5 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> %16 = tt.load %6, %15 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %17 = triton_gpu.memdesc_subview %0[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %8, %17 : 
tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %18 = triton_gpu.memdesc_subview %1[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %14, %18 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %19:7 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst, %arg8 = %arg2, %arg9 = %6, %arg10 = %c0_i32, %arg11 = %17, %arg12 = %18, %arg13 = %16) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) { + %17 = triton_gpu.memdesc_subview %0[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %8, %17 : tensor<16x16xf16, #blocked1> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %18 = triton_gpu.memdesc_subview %1[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %14, %18 : tensor<16x16xf16, #blocked> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %19:7 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst, %arg8 = %arg2, %arg9 = %6, %arg10 = %c0_i32, %arg11 = %17, %arg12 = %18, %arg13 = %16) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) { %20 = arith.subi %arg1, %c2 : index %21 = arith.cmpi slt, %arg6, %20 : index %22 = arith.subi %arg1, %c1 : index %23 = arith.cmpi slt, %arg6, %22 : index - %24 = triton_gpu.local_load %arg11 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %25 = triton_gpu.local_load %arg12 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_load %arg11 : !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %25 = triton_gpu.local_load %arg12 : !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %26 = tt.dot %24, %25, %arg7 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma> %27 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> %28 = tt.addptr %arg9, %cst_0 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, 
tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> @@ -448,14 +448,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %39 = arith.addi %arg10, %c1_i32 : i32 %40 = arith.cmpi slt, %39, %c1_i32 : i32 %41 = arith.select %40, %39, %c0_i32 : i32 - %42 = triton_gpu.memdesc_subview %0[%41, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %30, %42 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %43 = triton_gpu.memdesc_subview %1[%41, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %36, %43 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - scf.yield %26, %27, %28, %41, %42, %43, %38 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %42 = triton_gpu.memdesc_subview %0[%41, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %30, %42 : tensor<16x16xf16, #blocked1> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %43 = triton_gpu.memdesc_subview %1[%41, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %36, %43 : tensor<16x16xf16, #blocked> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + scf.yield %26, %27, %28, %41, %42, %43, %38 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> } - triton_gpu.local_dealloc %0 : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %1 : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %0 : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %1 : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> tt.return %19#0 : tensor<16x16xf32, #mma> } } @@ -463,18 +463,18 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- // CHECK-LABEL: sink_convert_dealloc -// CHECK-COUNT-2: triton_gpu.local_dealloc %{{.+}} : !tt.memdesc<4x128x64xf16, #shared, mutable> +// CHECK-COUNT-2: triton_gpu.local_dealloc %{{.+}} : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> // CHECK: triton_gpu.convert_layout %arg0 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1> #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> 
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [1, 0]}>
 #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1]}>
 module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
   tt.func public @sink_convert_dealloc(%arg0: tensor<32x32xf32, #blocked>) attributes {noinline = false} {
-    %0 = triton_gpu.local_alloc : () -> !tt.memdesc<4x128x64xf16, #shared, mutable>
-    %1 = triton_gpu.local_alloc : () -> !tt.memdesc<4x128x64xf16, #shared, mutable>
+    %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
+    %1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
     %2 = triton_gpu.convert_layout %arg0 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1>
-    triton_gpu.local_dealloc %0 : !tt.memdesc<4x128x64xf16, #shared, mutable>
-    triton_gpu.local_dealloc %1 : !tt.memdesc<4x128x64xf16, #shared, mutable>
+    triton_gpu.local_dealloc %0 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
+    triton_gpu.local_dealloc %1 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
     %3 = arith.addf %2, %2 : tensor<32x32xf32, #blocked1>
     tt.return
   }
 }
@@ -490,12 +490,12 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war
 #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1]}>
 module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
   tt.func public @anchor_barrier(%arg0: tensor<32x32x!tt.ptr, #blocked>) attributes {noinline = false} {
-    %0 = triton_gpu.local_alloc : () -> !tt.memdesc<4x128x64xf16, #shared, mutable>
+    %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
     gpu.barrier
     %2 = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked>
-    %1 = triton_gpu.local_alloc %2 : (tensor<32x32xf16, #blocked>) -> !tt.memdesc<4x128x64xf16, #shared, mutable>
-    triton_gpu.local_dealloc %0 : !tt.memdesc<4x128x64xf16, #shared, mutable>
-    triton_gpu.local_dealloc %1 : !tt.memdesc<4x128x64xf16, #shared, mutable>
+    %1 = triton_gpu.local_alloc %2 : (tensor<32x32xf16, #blocked>) -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
+    triton_gpu.local_dealloc %0 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
+    triton_gpu.local_dealloc %1 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
     tt.return
   }
 }
diff --git a/test/TritonGPU/amd/amd-sched-2nd-load.mlir b/test/TritonGPU/amd/amd-sched-2nd-load.mlir
index bcf769320e..09c71215f9 100644
--- a/test/TritonGPU/amd/amd-sched-2nd-load.mlir
+++ b/test/TritonGPU/amd/amd-sched-2nd-load.mlir
@@ -30,18 +30,18 @@
 // CHECK-NEXT: triton_gpu.local_store %[[tileA]]
 // CHECK-NEXT: triton_gpu.local_store %[[tileB]]
 module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
-  tt.func public @sink_2nd_load_256x256x128(%A_ptr: tensor<256x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) {
+  tt.func public @sink_2nd_load_256x256x128(%A_ptr: tensor<256x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) {
%c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x256xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<256x128x!tt.ptr, #blocked> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x128xf16, #dotOp0> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x128xf16, #dotOp0> %5 = tt.load %B_ptr : tensor<128x256x!tt.ptr, #blocked1> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x256xf16, #dotOp1> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x256xf16, #dotOp1> %3 = tt.dot %1, %2, %arg1 : tensor<256x128xf16, #dotOp0> * tensor<128x256xf16, #dotOp1> -> tensor<256x256xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<256x128xf16, #blocked> -> !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %5, %B_LDS : tensor<128x256xf16, #blocked1> -> !tt.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<256x128xf16, #blocked> -> !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %5, %B_LDS : tensor<128x256xf16, #blocked1> -> !triton_gpu.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<256x256xf32, #mma> } tt.store %C_ptr, %0#0: tensor<256x256x!tt.ptr, #mma> @@ -69,18 +69,18 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war // CHECK-NEXT: triton_gpu.local_store %[[tileA]] // CHECK-NEXT: triton_gpu.local_store %[[tileB]] module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @sink_2nd_load_256x256x64(%A_ptr: tensor<256x64x!tt.ptr, #blocked>, %B_ptr: tensor<64x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + tt.func public @sink_2nd_load_256x256x64(%A_ptr: tensor<256x64x!tt.ptr, #blocked>, %B_ptr: tensor<64x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x256xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<256x64x!tt.ptr, #blocked> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #dotOp0> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #dotOp0> %5 = tt.load %B_ptr : tensor<64x256x!tt.ptr, #blocked1> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x256xf16, #dotOp1> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<64x256xf16, #shared1, 
#triton_gpu.shared_memory, mutable> -> tensor<64x256xf16, #dotOp1> %3 = tt.dot %1, %2, %arg1 : tensor<256x64xf16, #dotOp0> * tensor<64x256xf16, #dotOp1> -> tensor<256x256xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<256x64xf16, #blocked> -> !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %5, %B_LDS : tensor<64x256xf16, #blocked1> -> !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<256x64xf16, #blocked> -> !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %5, %B_LDS : tensor<64x256xf16, #blocked1> -> !triton_gpu.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<256x256xf32, #mma> } tt.store %C_ptr, %0#0: tensor<256x256x!tt.ptr, #mma> @@ -108,18 +108,18 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war // CHECK-NEXT: triton_gpu.local_store %[[tileA]] // CHECK-NEXT: triton_gpu.local_store %[[tileB]] module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @sink_2nd_load_256x64x128(%A_ptr: tensor<256x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x64x!tt.ptr, #blocked1>, %C_ptr: tensor<256x64x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + tt.func public @sink_2nd_load_256x64x128(%A_ptr: tensor<256x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x64x!tt.ptr, #blocked1>, %C_ptr: tensor<256x64x!tt.ptr, #mma>, %A_LDS: !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<256x64xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x64xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<256x128x!tt.ptr, #blocked> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x128xf16, #dotOp0> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x128xf16, #dotOp0> %5 = tt.load %B_ptr : tensor<128x64x!tt.ptr, #blocked1> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x64xf16, #dotOp1> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x64xf16, #dotOp1> %3 = tt.dot %1, %2, %arg1 : tensor<256x128xf16, #dotOp0> * tensor<128x64xf16, #dotOp1> -> tensor<256x64xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<256x128xf16, #blocked> -> !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %5, %B_LDS : tensor<128x64xf16, #blocked1> -> !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<256x128xf16, #blocked> -> !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %5, %B_LDS : tensor<128x64xf16, #blocked1> -> !triton_gpu.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<256x64xf32, #mma> } tt.store %C_ptr, %0#0: tensor<256x64x!tt.ptr, #mma> @@ -147,18 +147,18 @@ 
module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war // CHECK-NEXT: triton_gpu.local_store %[[tileA]] // CHECK-NEXT: triton_gpu.local_store %[[tileB]] module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @sink_2nd_load_256x256x32(%A_ptr: tensor<256x32x!tt.ptr, #blocked>, %B_ptr: tensor<32x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + tt.func public @sink_2nd_load_256x256x32(%A_ptr: tensor<256x32x!tt.ptr, #blocked>, %B_ptr: tensor<32x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !triton_gpu.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x256xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<256x32x!tt.ptr, #blocked> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x32xf16, #dotOp0> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x32xf16, #dotOp0> %5 = tt.load %B_ptr : tensor<32x256x!tt.ptr, #blocked1> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x256xf16, #dotOp1> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x256xf16, #dotOp1> %3 = tt.dot %1, %2, %arg1 : tensor<256x32xf16, #dotOp0> * tensor<32x256xf16, #dotOp1> -> tensor<256x256xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<256x32xf16, #blocked> -> !tt.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %5, %B_LDS : tensor<32x256xf16, #blocked1> -> !tt.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<256x32xf16, #blocked> -> !triton_gpu.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %5, %B_LDS : tensor<32x256xf16, #blocked1> -> !triton_gpu.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<256x256xf32, #mma> } tt.store %C_ptr, %0#0: tensor<256x256x!tt.ptr, #mma> @@ -188,18 +188,18 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war // CHECK-NEXT: tt.dot // CHECK-NEXT: triton_gpu.local_store %[[tileA]] module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @sink_2nd_load_128x128x128_user_before_dot(%A_ptr: tensor<128x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x128x!tt.ptr, #blocked>, %B_ptr2: tensor<128x128x!tt.ptr, #blocked>, %C_ptr: tensor<128x128x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<128x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + tt.func public @sink_2nd_load_128x128x128_user_before_dot(%A_ptr: tensor<128x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x128x!tt.ptr, #blocked>, %B_ptr2: tensor<128x128x!tt.ptr, #blocked>, %C_ptr: tensor<128x128x!tt.ptr, #mma>, %A_LDS: 
!triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<128x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<128x128xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<128x128x!tt.ptr, #blocked> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp0> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp0> %5 = tt.load %B_ptr : tensor<128x128x!tt.ptr, #blocked> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<128x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp1> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<128x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp1> tt.store %B_ptr, %5 : tensor<128x128x!tt.ptr, #blocked> %3 = tt.dot %1, %2, %arg1 : tensor<128x128xf16, #dotOp0> * tensor<128x128xf16, #dotOp1> -> tensor<128x128xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<128x128xf16, #blocked> -> !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<128x128xf16, #blocked> -> !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<128x128xf32, #mma> } tt.store %C_ptr, %0#0: tensor<128x128x!tt.ptr, #mma> @@ -229,19 +229,19 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war #dotOp0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}> #dotOp1 = #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}> module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @sink_2nd_load_256x256x64_two_dot(%A_ptr: tensor<256x64x!tt.ptr, #blocked>, %B_ptr: tensor<64x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + tt.func public @sink_2nd_load_256x256x64_two_dot(%A_ptr: tensor<256x64x!tt.ptr, #blocked>, %B_ptr: tensor<64x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x256xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<256x64x!tt.ptr, #blocked> %5 = tt.load %B_ptr : tensor<64x256x!tt.ptr, #blocked1> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #dotOp0> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x256xf16, #dotOp1> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #dotOp0> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, 
mutable> -> tensor<64x256xf16, #dotOp1> %3 = tt.dot %1, %2, %arg1 : tensor<256x64xf16, #dotOp0> * tensor<64x256xf16, #dotOp1> -> tensor<256x256xf32, #mma> %6 = tt.dot %1, %2, %3 : tensor<256x64xf16, #dotOp0> * tensor<64x256xf16, #dotOp1> -> tensor<256x256xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<256x64xf16, #blocked> -> !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %5, %B_LDS : tensor<64x256xf16, #blocked1> -> !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<256x64xf16, #blocked> -> !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %5, %B_LDS : tensor<64x256xf16, #blocked1> -> !triton_gpu.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<256x256xf32, #mma> } tt.store %C_ptr, %0#0: tensor<256x256x!tt.ptr, #mma> diff --git a/test/TritonGPU/amd/optimize-lds-usage.mlir b/test/TritonGPU/amd/optimize-lds-usage.mlir index 5cd34aab27..61d1861b29 100644 --- a/test/TritonGPU/amd/optimize-lds-usage.mlir +++ b/test/TritonGPU/amd/optimize-lds-usage.mlir @@ -13,9 +13,9 @@ #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @alloc_convert_load(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x128xf32, #blocked>) attributes {noinline = false} { - %1 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> %2 = triton_gpu.convert_layout %arg1 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #mma> - %3 = triton_gpu.local_load %1 : !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> + %3 = triton_gpu.local_load %1 : !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> tt.return } } @@ -35,9 +35,9 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @alloc_convert_small_load(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x128xf16, #blocked>) attributes {noinline = false} { - %1 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> %2 = triton_gpu.convert_layout %arg1 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #mma> - %3 = triton_gpu.local_load %1 : !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> + %3 = triton_gpu.local_load %1 : !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> tt.return } } @@ -57,9 +57,9 @@ module attributes 
{"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1, 2], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @alloc_convert_3d_load(%arg0: tensor<1x128x128xf16, #blocked>, %arg1: tensor<1x128x128xf16, #blocked>) attributes {noinline = false} { - %1 = triton_gpu.local_alloc %arg0 : (tensor<1x128x128xf16, #blocked>) -> !tt.memdesc<1x128x128xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<1x128x128xf16, #blocked>) -> !triton_gpu.memdesc<1x128x128xf16, #shared, #triton_gpu.shared_memory> %2 = triton_gpu.convert_layout %arg1 : tensor<1x128x128xf16, #blocked> -> tensor<1x128x128xf16, #mma> - %3 = triton_gpu.local_load %1 : !tt.memdesc<1x128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<1x128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> + %3 = triton_gpu.local_load %1 : !triton_gpu.memdesc<1x128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<1x128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> tt.return } } @@ -81,9 +81,9 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @alloc_convert_32k_limit(%arg0: tensor<64x128xf16, #blocked>, %arg1: tensor<64x128xf16, #blocked>) attributes {noinline = false} { - %1 = triton_gpu.local_alloc %arg0 : (tensor<64x128xf16, #blocked>) -> !tt.memdesc<64x128xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<64x128xf16, #blocked>) -> !triton_gpu.memdesc<64x128xf16, #shared, #triton_gpu.shared_memory> %2 = triton_gpu.convert_layout %arg1 : tensor<64x128xf16, #blocked> -> tensor<64x128xf16, #mma> - %3 = triton_gpu.local_load %1 : !tt.memdesc<64x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, kWidth = 4, parent = #mma}>> + %3 = triton_gpu.local_load %1 : !triton_gpu.memdesc<64x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, kWidth = 4, parent = #mma}>> tt.return } } @@ -98,11 +98,11 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war // CHECK-DAG: [[SHARED:#[a-z0-9]*]] = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> // CHECK: tt.func public @mfma_dot_shortcut([[ARG_0:%[a-z0-9]*]]: {{.*}}, [[ARG_1:%[a-z0-9]*]]: {{.*}}, [[ARG_2:%[a-z0-9]*]]: {{.*}}) -// CHECK: [[ALLOC:%[0-9]+]] = triton_gpu.local_alloc [[ARG_0]] : (tensor<128x128xf16, [[BLOCKED_1]]>) -> !tt.memdesc<128x128xf16, [[SHARED]], #triton_gpu.shared_memory> +// CHECK: [[ALLOC:%[0-9]+]] = triton_gpu.local_alloc [[ARG_0]] : (tensor<128x128xf16, [[BLOCKED_1]]>) -> !triton_gpu.memdesc<128x128xf16, [[SHARED]], #triton_gpu.shared_memory> // CHECK: [[INTERMEDIATE_CONV:%[0-9]+]] = triton_gpu.convert_layout [[ARG_1]] : tensor<128x128xf32, [[BLOCKED_1]]> -> tensor<128x128xf32, [[BLOCKED_2]]> // CHECK: [[CONVERT_1:%[0-9]+]] = triton_gpu.convert_layout [[INTERMEDIATE_CONV]] : tensor<128x128xf32, [[BLOCKED_2]]> -> tensor<128x128xf32, [[MMA_2]]> // CHECK: [[CONVERT_2:%[0-9]+]] = triton_gpu.convert_layout [[ARG_2]] : tensor<256x128xf16, [[MMA_1]]> -> tensor<256x128xf16, 
#triton_gpu.dot_op<{opIdx = 0, parent = [[MMA_1]], kWidth = 4}>> -// CHECK: [[LOAD:%[0-9]+]] = triton_gpu.local_load [[ALLOC]] : !tt.memdesc<128x128xf16, [[SHARED]], #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = [[MMA_2]], kWidth = 4}>> +// CHECK: [[LOAD:%[0-9]+]] = triton_gpu.local_load [[ALLOC]] : !triton_gpu.memdesc<128x128xf16, [[SHARED]], #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = [[MMA_2]], kWidth = 4}>> #blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}> #mma1 = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}> #mma2 = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [32, 32], isTransposed = true}> @@ -111,10 +111,10 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @mfma_dot_shortcut(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x128xf32, #blocked>, %arg2: tensor<256x128xf16, #mma2>) attributes {noinline = false} { - %alloc = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> + %alloc = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> %convert_1 = triton_gpu.convert_layout %arg1 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #mma1> %convert_2 = triton_gpu.convert_layout %arg2 : tensor<256x128xf16, #mma2> -> tensor<256x128xf16, #dotop2> - %load = triton_gpu.local_load %alloc : !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #dotop1> + %load = triton_gpu.local_load %alloc : !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #dotop1> tt.return } } @@ -131,9 +131,9 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @convert_1d(%arg0: tensor<128xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>, %arg1: tensor<128x128xf32, #mma>) attributes {noinline = false} { - %alloc = triton_gpu.local_alloc %arg1 : (tensor<128x128xf32, #mma>) -> !tt.memdesc<128x128xf32, #shared, #triton_gpu.shared_memory> + %alloc = triton_gpu.local_alloc %arg1 : (tensor<128x128xf32, #mma>) -> !triton_gpu.memdesc<128x128xf32, #shared, #triton_gpu.shared_memory> %1 = triton_gpu.convert_layout %arg0 : tensor<128xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>> -> tensor<128xf32, #blocked> - %load = triton_gpu.local_load %alloc : !tt.memdesc<128x128xf32, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf32, #mma> + %load = triton_gpu.local_load %alloc : !triton_gpu.memdesc<128x128xf32, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf32, #mma> tt.return } } diff --git a/test/TritonGPU/canonicalize.mlir b/test/TritonGPU/canonicalize.mlir index 9422bb0f85..64385d9297 100644 --- a/test/TritonGPU/canonicalize.mlir +++ b/test/TritonGPU/canonicalize.mlir @@ -71,8 +71,8 @@ tt.func 
@test_canonicalize_convert_histogram(%arg0: tensor<256xi32, #blocked1>) #shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.compute-capability" = 80} { tt.func @test_canonicalize_convert_local_load() -> tensor<256xi32, #blocked1> { - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<256xi32, #shared, mutable> - %1 = triton_gpu.local_load %0 : !tt.memdesc<256xi32, #shared, mutable> -> tensor<256xi32, #blocked> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<256xi32, #shared, mutable> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<256xi32, #shared, mutable> -> tensor<256xi32, #blocked> gpu.barrier %2 = triton_gpu.convert_layout %1 : tensor<256xi32, #blocked> -> tensor<256xi32, #blocked1> tt.return %2 : tensor<256xi32, #blocked1> @@ -85,15 +85,15 @@ tt.func @test_canonicalize_convert_local_load() -> tensor<256xi32, #blocked1> { #shared = #triton_gpu.shared<{vec = 1, perPhase=2, maxPhase=8, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: local_alloc_nofold1 - tt.func @local_alloc_nofold1(%arg0: tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> { + tt.func @local_alloc_nofold1(%arg0: tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> { // CHECK: %[[ARG:.+]] = triton_gpu.local_alloc // CHECK-NEXT: %[[ARG2:.+]] = triton_gpu.local_load %[[ARG]] // CHECK-NEXT: %[[ARG3:.+]] = triton_gpu.local_alloc %[[ARG2]] // CHECK-NEXT: tt.return %[[ARG3]] - %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #blocked> - %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> - tt.return %2 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #blocked> + %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + tt.return %2 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> } } // end module @@ -105,15 +105,15 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : #shared1 = #triton_gpu.shared<{vec = 1, perPhase=1, maxPhase=1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: local_alloc_nofold2 - tt.func @local_alloc_nofold2(%arg0: tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> { + tt.func @local_alloc_nofold2(%arg0: tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> { // CHECK: %[[ARG:.+]] = triton_gpu.local_alloc // CHECK-NEXT: %[[ARG2:.+]] = triton_gpu.local_load %[[ARG]] // CHECK-NEXT: %[[ARG3:.+]] = triton_gpu.local_alloc 
%[[ARG2]] // CHECK-NEXT: tt.return %[[ARG3]] - %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #blocked> - %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> - tt.return %2 : !tt.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #blocked> + %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> + tt.return %2 : !triton_gpu.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> } } // end module @@ -123,13 +123,13 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> #shared = #triton_gpu.shared<{vec = 1, perPhase=2, maxPhase=8, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { - tt.func @local_alloc_fold(%arg0: tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> { + tt.func @local_alloc_fold(%arg0: tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> { // CHECK-LABEL: local_alloc_fold // CHECK-NEXT: %[[ARG:.+]] = triton_gpu.local_alloc // CHECK-NEXT: tt.return %[[ARG]] - %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #blocked> - %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> - tt.return %2 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #blocked> + %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + tt.return %2 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> } } // end module diff --git a/test/TritonGPU/combine.mlir b/test/TritonGPU/combine.mlir index 2ec11a24f1..5e1cad52af 100644 --- a/test/TritonGPU/combine.mlir +++ b/test/TritonGPU/combine.mlir @@ -1561,9 +1561,9 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war %20 = triton_gpu.convert_layout %16 : tensor<32x32x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked4> %21 = tt.load %20 : tensor<32x32x!tt.ptr, #blocked4> %22 = triton_gpu.convert_layout %21 : tensor<32x32xf16, #blocked4> -> tensor<32x32xf16, #blocked> - %23 = triton_gpu.local_alloc %22 : (tensor<32x32xf16, #blocked>) -> !tt.memdesc<32x32xf16, #shared> - %24 = triton_gpu.memdesc_trans %23 
{order=array} : !tt.memdesc<32x32xf16, #shared> -> !tt.memdesc<32x32xf16, #shared1> - %25 = triton_gpu.local_load %24 : !tt.memdesc<32x32xf16, #shared1> -> tensor<32x32xf16, #blocked> + %23 = triton_gpu.local_alloc %22 : (tensor<32x32xf16, #blocked>) -> !triton_gpu.memdesc<32x32xf16, #shared> + %24 = triton_gpu.memdesc_trans %23 {order=array} : !triton_gpu.memdesc<32x32xf16, #shared> -> !triton_gpu.memdesc<32x32xf16, #shared1> + %25 = triton_gpu.local_load %24 : !triton_gpu.memdesc<32x32xf16, #shared1> -> tensor<32x32xf16, #blocked> %26 = triton_gpu.convert_layout %19 : tensor<32x32xf16, #blocked> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked5}>> %27 = triton_gpu.convert_layout %25 : tensor<32x32xf16, #blocked> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked5}>> %28 = triton_gpu.convert_layout %cst : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked5> @@ -1993,10 +1993,10 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : %67 = tt.load %66 : tensor<32x64x!tt.ptr, #blocked> %68 = tt.addptr %17, %65 : tensor<256x64x!tt.ptr, #blocked>, tensor<256x64xi32, #blocked> %69 = tt.load %68 : tensor<256x64x!tt.ptr, #blocked> - %70 = triton_gpu.local_alloc %69 : (tensor<256x64xf16, #blocked>) -> !tt.memdesc<256x64xf16, #shared> - %71 = triton_gpu.memdesc_trans %70 {order=array} : !tt.memdesc<256x64xf16, #shared> -> !tt.memdesc<64x256xf16, #shared1> + %70 = triton_gpu.local_alloc %69 : (tensor<256x64xf16, #blocked>) -> !triton_gpu.memdesc<256x64xf16, #shared> + %71 = triton_gpu.memdesc_trans %70 {order=array} : !triton_gpu.memdesc<256x64xf16, #shared> -> !triton_gpu.memdesc<64x256xf16, #shared1> %72 = triton_gpu.convert_layout %67 : tensor<32x64xf16, #blocked> -> tensor<32x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked3}>> - %73 = triton_gpu.local_load %71 : !tt.memdesc<64x256xf16, #shared1> -> tensor<64x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked3}>> + %73 = triton_gpu.local_load %71 : !triton_gpu.memdesc<64x256xf16, #shared1> -> tensor<64x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked3}>> %74 = triton_gpu.convert_layout %arg8 : tensor<32x256xf32, #blocked3> -> tensor<32x256xf32, #mma> %75 = triton_gpu.convert_layout %72 : tensor<32x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked3}>> -> tensor<32x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> %76 = triton_gpu.convert_layout %73 : tensor<64x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked3}>> -> tensor<64x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> diff --git a/test/TritonGPU/dot-operands.mlir b/test/TritonGPU/dot-operands.mlir index 2d562b9587..911cf4fb40 100644 --- a/test/TritonGPU/dot-operands.mlir +++ b/test/TritonGPU/dot-operands.mlir @@ -162,10 +162,10 @@ tt.func @update_kwidth_slice( module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK: tt.func @mma_v3_reg_operand_A // CHECK: %[[A:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> -// CHECK: triton_nvidia_gpu.warp_group_dot %[[A]], {{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> -tt.func @mma_v3_reg_operand_A(%arg0: tensor<128x64xf16, #mma>, %arg1: !tt.memdesc<64x64xf16, #shared>, %arg2: 
tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ - %A = triton_gpu.local_alloc %arg0 : (tensor<128x64xf16, #mma>) -> !tt.memdesc<128x64xf16, #shared1> - %r = triton_nvidia_gpu.warp_group_dot %A, %arg1, %arg2 : !tt.memdesc<128x64xf16, #shared1> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> +// CHECK: triton_nvidia_gpu.warp_group_dot %[[A]], {{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> +tt.func @mma_v3_reg_operand_A(%arg0: tensor<128x64xf16, #mma>, %arg1: !triton_gpu.memdesc<64x64xf16, #shared>, %arg2: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ + %A = triton_gpu.local_alloc %arg0 : (tensor<128x64xf16, #mma>) -> !triton_gpu.memdesc<128x64xf16, #shared1> + %r = triton_nvidia_gpu.warp_group_dot %A, %arg1, %arg2 : !triton_gpu.memdesc<128x64xf16, #shared1> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> tt.return %r : tensor<128x64xf32, #mma> } } @@ -178,10 +178,10 @@ tt.func @mma_v3_reg_operand_A(%arg0: tensor<128x64xf16, #mma>, %arg1: !tt.memdes module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK: tt.func @mma_v3_reg_operand_A_fp8 // CHECK: %[[A:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<128x64xf8E5M2, #mma> -> tensor<128x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> -// CHECK: triton_nvidia_gpu.warp_group_dot %[[A]], {{.*}} : tensor<128x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * !tt.memdesc<64x64xf8E5M2, #shared> -> tensor<128x64xf32, #mma> -tt.func @mma_v3_reg_operand_A_fp8(%arg0: tensor<128x64xf8E5M2, #mma>, %arg1: !tt.memdesc<64x64xf8E5M2, #shared>, %arg2: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ - %A = triton_gpu.local_alloc %arg0 : (tensor<128x64xf8E5M2, #mma>) -> !tt.memdesc<128x64xf8E5M2, #shared1> - %r = triton_nvidia_gpu.warp_group_dot %A, %arg1, %arg2 : !tt.memdesc<128x64xf8E5M2, #shared1> * !tt.memdesc<64x64xf8E5M2, #shared> -> tensor<128x64xf32, #mma> +// CHECK: triton_nvidia_gpu.warp_group_dot %[[A]], {{.*}} : tensor<128x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * !triton_gpu.memdesc<64x64xf8E5M2, #shared> -> tensor<128x64xf32, #mma> +tt.func @mma_v3_reg_operand_A_fp8(%arg0: tensor<128x64xf8E5M2, #mma>, %arg1: !triton_gpu.memdesc<64x64xf8E5M2, #shared>, %arg2: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ + %A = triton_gpu.local_alloc %arg0 : (tensor<128x64xf8E5M2, #mma>) -> !triton_gpu.memdesc<128x64xf8E5M2, #shared1> + %r = triton_nvidia_gpu.warp_group_dot %A, %arg1, %arg2 : !triton_gpu.memdesc<128x64xf8E5M2, #shared1> * !triton_gpu.memdesc<64x64xf8E5M2, #shared> -> tensor<128x64xf32, #mma> tt.return %r : tensor<128x64xf32, #mma> } } @@ -220,12 +220,12 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: %[[A_BLOCK:.*]] = tt.load %{{.*}} : tensor<128x64x!tt.ptr, #blocked> // CHECK: %[[A_DOTOP:.*]] = triton_gpu.convert_layout %[[A_BLOCK]] : tensor<128x64xbf16, #blocked> -> tensor<128x64xbf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> // CHECK: %[[A_CASTED:.*]] = tt.fp_to_fp %[[A_DOTOP]] : tensor<128x64xbf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> -// CHECK: %[[R:.*]] = triton_nvidia_gpu.warp_group_dot %[[A_CASTED]], %{{.*}}, 
%{{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> - tt.func @mma_v3_reg_push_elementwise(%pa: tensor<128x64x!tt.ptr, #blocked>, %dotb: !tt.memdesc<64x64xf16, #shared>, %dotc: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ +// CHECK: %[[R:.*]] = triton_nvidia_gpu.warp_group_dot %[[A_CASTED]], %{{.*}}, %{{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + tt.func @mma_v3_reg_push_elementwise(%pa: tensor<128x64x!tt.ptr, #blocked>, %dotb: !triton_gpu.memdesc<64x64xf16, #shared>, %dotc: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ %a_bf16 = tt.load %pa : tensor<128x64x!tt.ptr, #blocked> %a = tt.fp_to_fp %a_bf16 : tensor<128x64xbf16, #blocked> -> tensor<128x64xf16, #blocked> - %dota = triton_gpu.local_alloc %a: (tensor<128x64xf16, #blocked>) -> !tt.memdesc<128x64xf16, #shared1> - %r = triton_nvidia_gpu.warp_group_dot %dota, %dotb, %dotc : !tt.memdesc<128x64xf16, #shared1> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + %dota = triton_gpu.local_alloc %a: (tensor<128x64xf16, #blocked>) -> !triton_gpu.memdesc<128x64xf16, #shared1> + %r = triton_nvidia_gpu.warp_group_dot %dota, %dotb, %dotc : !triton_gpu.memdesc<128x64xf16, #shared1> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> tt.return %r : tensor<128x64xf32, #mma> } } @@ -244,15 +244,15 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: %[[A_CASTED:.*]] = arith.sitofp %[[A_DOTOP]] : tensor<128x64xi8, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> to tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> // CHECK: %[[A_SCALED:.*]] = arith.mulf %[[A_CASTED]], %[[CST_DOTOP]] : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> // CHECK: %[[A_NEGATED:.*]] = arith.negf %[[A_SCALED]] : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> -// CHECK: %[[R:.*]] = triton_nvidia_gpu.warp_group_dot %[[A_NEGATED]], %{{.*}}, %{{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> - tt.func @mma_v3_reg_push_elementwise_chained(%pa: tensor<128x64x!tt.ptr, #blocked>, %dotb: !tt.memdesc<64x64xf16, #shared>, %dotc: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ +// CHECK: %[[R:.*]] = triton_nvidia_gpu.warp_group_dot %[[A_NEGATED]], %{{.*}}, %{{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + tt.func @mma_v3_reg_push_elementwise_chained(%pa: tensor<128x64x!tt.ptr, #blocked>, %dotb: !triton_gpu.memdesc<64x64xf16, #shared>, %dotc: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf16, #blocked> %a_i8 = tt.load %pa : tensor<128x64x!tt.ptr, #blocked> %a_f16 = arith.sitofp %a_i8 : tensor<128x64xi8, #blocked> to tensor<128x64xf16, #blocked> %a_scaled = arith.mulf %a_f16, %cst : tensor<128x64xf16, #blocked> %a_negated = arith.negf %a_scaled : tensor<128x64xf16, #blocked> - %dota = triton_gpu.local_alloc %a_negated: (tensor<128x64xf16, #blocked>) -> !tt.memdesc<128x64xf16, #shared1> - %r = triton_nvidia_gpu.warp_group_dot %dota, %dotb, %dotc : !tt.memdesc<128x64xf16, #shared1> * !tt.memdesc<64x64xf16, #shared> 
-> tensor<128x64xf32, #mma> + %dota = triton_gpu.local_alloc %a_negated: (tensor<128x64xf16, #blocked>) -> !triton_gpu.memdesc<128x64xf16, #shared1> + %r = triton_nvidia_gpu.warp_group_dot %dota, %dotb, %dotc : !triton_gpu.memdesc<128x64xf16, #shared1> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> tt.return %r : tensor<128x64xf32, #mma> } } diff --git a/test/TritonGPU/fence-inserstion.mlir b/test/TritonGPU/fence-inserstion.mlir index 9ed3646d92..f83acb21f1 100644 --- a/test/TritonGPU/fence-inserstion.mlir +++ b/test/TritonGPU/fence-inserstion.mlir @@ -9,10 +9,10 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK-LABEL: matmul_like_fence tt.func public @matmul_like_fence(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x64xf16, #blocked2>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> - %0 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared> - %1 = triton_gpu.local_alloc %arg1 : (tensor<128x64xf16, #blocked2>) -> !tt.memdesc<128x64xf16, #shared1> + %0 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared> + %1 = triton_gpu.local_alloc %arg1 : (tensor<128x64xf16, #blocked2>) -> !triton_gpu.memdesc<128x64xf16, #shared1> // CHECK: triton_nvidia_gpu.fence_async_shared - %2 = triton_nvidia_gpu.warp_group_dot %0, %1, %cst : !tt.memdesc<128x128xf16, #shared> * !tt.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> + %2 = triton_nvidia_gpu.warp_group_dot %0, %1, %cst : !triton_gpu.memdesc<128x128xf16, #shared> * !triton_gpu.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> tt.return } } @@ -31,15 +31,15 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %c64_i32 = arith.constant 64 : i32 %c0_i32 = arith.constant 0 : i32 %c32_i32 = arith.constant 32 : i32 - %0 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared> - %1 = triton_gpu.local_alloc %arg1 : (tensor<128x64xf16, #blocked>) -> !tt.memdesc<128x64xf16, #shared1> + %0 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared> + %1 = triton_gpu.local_alloc %arg1 : (tensor<128x64xf16, #blocked>) -> !triton_gpu.memdesc<128x64xf16, #shared1> // CHECK: triton_nvidia_gpu.fence_async_shared // CHECK: scf.for // CHECK-NOT: triton_nvidia_gpu.fence_async_shared // CHECK: triton_nvidia_gpu.warp_group_dot scf.for %iv0 = %c0_i32 to %c64_i32 step %c32_i32 : i32 { scf.for %iv1 = %c0_i32 to %c64_i32 step %c32_i32 : i32 { - %2 = triton_nvidia_gpu.warp_group_dot %0, %1, %cst : !tt.memdesc<128x128xf16, #shared> * !tt.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> + %2 = triton_nvidia_gpu.warp_group_dot %0, %1, %cst : !triton_gpu.memdesc<128x128xf16, #shared> * !triton_gpu.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> } } tt.return diff --git a/test/TritonGPU/invalid.mlir b/test/TritonGPU/invalid.mlir index f9e265f3ee..5a91a3cc0c 100644 --- a/test/TritonGPU/invalid.mlir +++ b/test/TritonGPU/invalid.mlir @@ -1,45 +1,45 @@ // RUN: triton-opt --split-input-file %s --verify-diagnostics -tt.func public @subview_element_ty(%arg0: !tt.memdesc<8x16xf32>) { +tt.func public @subview_element_ty(%arg0: !triton_gpu.memdesc<8x16xf32>) { %zero = arith.constant 0 : i32 // expected-error @+1 {{element type}} - %a = triton_gpu.memdesc_subview %arg0[%zero, %zero] : !tt.memdesc<8x16xf32> -> !tt.memdesc<8x16xf16> + %a = 
triton_gpu.memdesc_subview %arg0[%zero, %zero] : !triton_gpu.memdesc<8x16xf32> -> !triton_gpu.memdesc<8x16xf16> tt.return } // ----- -tt.func public @too_many_offsets(%arg0: !tt.memdesc<8x16xf32>) { +tt.func public @too_many_offsets(%arg0: !triton_gpu.memdesc<8x16xf32>) { %zero = arith.constant 0 : i32 // expected-error @+1 {{offsets}} - %a = triton_gpu.memdesc_subview %arg0[%zero, %zero, %zero] : !tt.memdesc<8x16xf32> -> !tt.memdesc + %a = triton_gpu.memdesc_subview %arg0[%zero, %zero, %zero] : !triton_gpu.memdesc<8x16xf32> -> !triton_gpu.memdesc tt.return } // ----- -tt.func public @too_few_offsets(%arg0: !tt.memdesc<8x16xf32>) { +tt.func public @too_few_offsets(%arg0: !triton_gpu.memdesc<8x16xf32>) { %zero = arith.constant 0 : i32 // expected-error @+1 {{offsets}} - %a = triton_gpu.memdesc_subview %arg0[%zero] : !tt.memdesc<8x16xf32> -> !tt.memdesc + %a = triton_gpu.memdesc_subview %arg0[%zero] : !triton_gpu.memdesc<8x16xf32> -> !triton_gpu.memdesc tt.return } // ----- -tt.func public @result_rank_too_large(%arg0: !tt.memdesc<8x16xf32>) { +tt.func public @result_rank_too_large(%arg0: !triton_gpu.memdesc<8x16xf32>) { %zero = arith.constant 0 : i32 // expected-error @+1 {{result rank}} - %a = triton_gpu.memdesc_subview %arg0[%zero, %zero] : !tt.memdesc<8x16xf32> -> !tt.memdesc<3x8x16xf32> + %a = triton_gpu.memdesc_subview %arg0[%zero, %zero] : !triton_gpu.memdesc<8x16xf32> -> !triton_gpu.memdesc<3x8x16xf32> tt.return } // ----- -tt.func public @result_dim_too_large(%arg0: !tt.memdesc<8x16xf32>) { +tt.func public @result_dim_too_large(%arg0: !triton_gpu.memdesc<8x16xf32>) { %zero = arith.constant 0 : i32 // expected-error @+1 {{result shape}} - %a = triton_gpu.memdesc_subview %arg0[%zero, %zero] : !tt.memdesc<8x16xf32> -> !tt.memdesc<32xf32> + %a = triton_gpu.memdesc_subview %arg0[%zero, %zero] : !triton_gpu.memdesc<8x16xf32> -> !triton_gpu.memdesc<32xf32> tt.return } diff --git a/test/TritonGPU/loop-pipeline-cuda.mlir b/test/TritonGPU/loop-pipeline-cuda.mlir index 3cb8511b0b..fe8f45e92f 100644 --- a/test/TritonGPU/loop-pipeline-cuda.mlir +++ b/test/TritonGPU/loop-pipeline-cuda.mlir @@ -49,9 +49,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %25 = triton_gpu.memdesc_trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %25 = triton_gpu.memdesc_trans %24 {order=array} : !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> + %26 = triton_gpu.local_load %25 : !triton_gpu.memdesc<16x64xf16, #shared1, 
#triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> } @@ -140,9 +140,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 { %70 = tt.load %59 : tensor<32x64x!tt.ptr, #blocked1> %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> - %73 = triton_gpu.memdesc_trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> - %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !triton_gpu.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> + %73 = triton_gpu.memdesc_trans %72 {order=array} : !triton_gpu.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> + %74 = triton_gpu.local_load %73 : !triton_gpu.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> @@ -169,9 +169,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: @matmul_tma -// CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3x128x64xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> -// CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3x64x256xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> -// CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3xi64, #{{.+}}, #triton_gpu.shared_memory, mutable> +// CHECK-DAG: triton_gpu.local_alloc : () -> !triton_gpu.memdesc<3x128x64xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> +// CHECK-DAG: triton_gpu.local_alloc : () -> !triton_gpu.memdesc<3x64x256xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> +// CHECK-DAG: triton_gpu.local_alloc : () -> !triton_gpu.memdesc<3xi64, #{{.+}}, #triton_gpu.shared_memory, mutable> // CHECK-COUNT-3: triton_nvidia_gpu.init_barrier // CHECK-COUNT-4: triton_nvidia_gpu.async_tma_copy_global_to_local // CHECK: scf.for @@ -187,10 +187,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %cst = arith.constant dense<0.000000e+00> : 
tensor<128x256xf32, #mma> %0:2 = scf.for %arg3 = %c0_i32 to %c256_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %c0_i32) -> (tensor<128x256xf32, #mma>, i32) : i32 { %1 = tt.experimental_descriptor_load %arg0[%c0_i32, %arg5] : !tt.tensordesc> -> tensor<128x64xf16, #blocked> - %2 = triton_gpu.local_alloc %1 : (tensor<128x64xf16, #blocked>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x64xf16, #blocked>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> %3 = tt.experimental_descriptor_load %arg1[%arg5, %c0_i32] : !tt.tensordesc> -> tensor<64x256xf16, #blocked1> - %4 = triton_gpu.local_alloc %3 : (tensor<64x256xf16, #blocked1>) -> !tt.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> - %5 = triton_nvidia_gpu.warp_group_dot %2, %4, %arg4 { inputPrecision = 0 : i32 } : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x256xf32, #mma> + %4 = triton_gpu.local_alloc %3 : (tensor<64x256xf16, #blocked1>) -> !triton_gpu.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> + %5 = triton_nvidia_gpu.warp_group_dot %2, %4, %arg4 { inputPrecision = 0 : i32 } : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x256xf32, #mma> %6 = arith.addi %arg5, %c64_i32 : i32 scf.yield %5, %6 : tensor<128x256xf32, #mma>, i32 } diff --git a/test/TritonGPU/loop-pipeline-hip.mlir b/test/TritonGPU/loop-pipeline-hip.mlir index 4ad94615c8..6ca0897578 100644 --- a/test/TritonGPU/loop-pipeline-hip.mlir +++ b/test/TritonGPU/loop-pipeline-hip.mlir @@ -47,9 +47,9 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %25 = triton_gpu.memdesc_trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %25 = triton_gpu.memdesc_trans %24 {order=array} : !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> + %26 = triton_gpu.local_load %25 : !triton_gpu.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> scf.yield %21, 
%27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> } @@ -139,9 +139,9 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 { %70 = tt.load %59 : tensor<32x64x!tt.ptr, #blocked1> %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> - %73 = triton_gpu.memdesc_trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> - %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !triton_gpu.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> + %73 = triton_gpu.memdesc_trans %72 {order=array} : !triton_gpu.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> + %74 = triton_gpu.local_load %73 : !triton_gpu.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> @@ -252,9 +252,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> %0 = scf.for %arg2 = %c0_i32 to %arg1 step %c1_i32 iter_args(%arg3 = %cst) -> (tensor<32x32xf32, #blocked>) : i32 { %2 = tt.load %arg4 : tensor<32x32x!tt.ptr, #blocked1> - %3 = triton_gpu.local_alloc %2 : (tensor<32x32xf32, #blocked1>) -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory> - %4 = triton_gpu.memdesc_trans %3 {order = array} : !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory> - %5 = triton_gpu.local_load %4 : !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>> + %3 = triton_gpu.local_alloc %2 : (tensor<32x32xf32, #blocked1>) -> !triton_gpu.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory> + %4 = triton_gpu.memdesc_trans %3 {order = array} : !triton_gpu.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory> + %5 = triton_gpu.local_load %4 : !triton_gpu.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>> %6 = triton_gpu.convert_layout %2 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> %7 = tt.dot %6, %5, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 
1, parent = #blocked}>> -> tensor<32x32xf32, #blocked> scf.yield %7 : tensor<32x32xf32, #blocked> diff --git a/test/TritonGPU/loop-pipeline-hopper.mlir b/test/TritonGPU/loop-pipeline-hopper.mlir index f3784fbe8c..d358be4d97 100644 --- a/test/TritonGPU/loop-pipeline-hopper.mlir +++ b/test/TritonGPU/loop-pipeline-hopper.mlir @@ -19,7 +19,7 @@ // CHECK: %[[BBUFFER:.*]] = triton_gpu.local_alloc // CHECK-DAG: %[[LOOP_COND_0:.*]] = arith.cmpi slt, %[[LB:.*]], %[[UB:.*]] // CHECK-DAG: %[[LOOP_COND_0_SPLAT_A:.*]] = tt.splat %[[LOOP_COND_0]] -// CHECK-DAG: %[[ASUB:.*]] = triton_gpu.memdesc_subview %[[ABUFFER]][%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]]] : !tt.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> +// CHECK-DAG: %[[ASUB:.*]] = triton_gpu.memdesc_subview %[[ABUFFER]][%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]]] : !triton_gpu.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> // CHECK: %[[T_A0:.*]] = triton_gpu.async_copy_global_to_local %{{.*}}, %[[ASUB]] mask %[[LOOP_COND_0_SPLAT_A]] : tensor<128x32x!tt.ptr, #blocked1> -> <128x32xf16, #shared, #triton_gpu.shared_memory, mutable> // CHECK-DAG: %[[LOOP_COND_0_SPLAT_B:.*]] = tt.splat %[[LOOP_COND_0]] // CHECK-DAG: %[[BSUB:.*]] = triton_gpu.memdesc_subview %[[BBUFFER]][%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]]] @@ -333,8 +333,8 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, // %a = tt.load %a_tileptr : !tt.ptr, 1> // %b = tt.load %b_tileptr : !tt.ptr, 1> // -// %sa = triton_gpu.local_alloc %a : (tensor<128x32xf16, #BA>) -> !tt.memdesc<128x32xf16, #SA, #triton_gpu.shared_memory> -// %sb = triton_gpu.local_alloc %b : (tensor<32x128xf16, #BB>) -> !tt.memdesc<32x128xf16, #SB, #triton_gpu.shared_memory> +// %sa = triton_gpu.local_alloc %a : (tensor<128x32xf16, #BA>) -> !triton_gpu.memdesc<128x32xf16, #SA, #triton_gpu.shared_memory> +// %sb = triton_gpu.local_alloc %b : (tensor<32x128xf16, #BB>) -> !triton_gpu.memdesc<32x128xf16, #SB, #triton_gpu.shared_memory> // %c = triton_nvidia_gpu.warp_group_dot %sa, %sb, %prev_c : tensor<128x32xf16, #SA> * tensor<32x128xf16, #SB> -> tensor<128x128xf32, #C> // // %a_tileptr_next = tt.advance %a_tileptr, [%c0, %c32_i32] : !tt.ptr, 1> @@ -393,13 +393,13 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: scf.yield %17:2 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_3, %arg5 = %16) -> (tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>) : i32 { %18 = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %21 = triton_nvidia_gpu.warp_group_dot %19, %20, %cst_2 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %21 = triton_nvidia_gpu.warp_group_dot %19, %20, %cst_2 : 
!triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %22 = arith.truncf %21 : tensor<128x16xf32, #mma1> to tensor<128x16xf16, #mma1> - %23 = triton_gpu.memdesc_trans %20 {order=array} : !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> + %23 = triton_gpu.memdesc_trans %20 {order=array} : !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> %24 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma1> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> - %25 = triton_nvidia_gpu.warp_group_dot %24, %23, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> + %25 = triton_nvidia_gpu.warp_group_dot %24, %23, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> %26 = tt.addptr %arg5, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> scf.yield %25, %26 : tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked> } @@ -445,9 +445,9 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> %18 = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %cnd = arith.cmpi slt, %arg3, %ext : i32 %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> @@ -502,24 +502,24 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %19 = 
triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> // CHECK: %[[ALLOC1:.+]] = triton_gpu.local_alloc // CHECK: %[[ALLOC2:.+]] = triton_gpu.local_alloc // CHECK: %[[R:.+]]:{{.+}} = scf.for // CHECK: %[[DOT1:.+]] = triton_nvidia_gpu.warp_group_dot{{.*}} // CHECK: triton_gpu.async_wait {{.*}} {num = 1 : i32} - // CHECK: %[[TRANS:.+]] = triton_gpu.memdesc_trans{{.*}} : !tt.memdesc + // CHECK: %[[TRANS:.+]] = triton_gpu.memdesc_trans{{.*}} : !triton_gpu.memdesc // CHECK: %[[DOT2:.+]] = triton_nvidia_gpu.warp_group_dot{{.*}} %[[TRANS]] // CHECK: triton_nvidia_gpu.warp_group_dot_wait %[[DOT1]], %[[DOT2]], %[[ALLOC1]], %[[ALLOC2]], %[[TRANS]] {pendings = 2 : i32} // CHECK: scf.yield // CHECK: %{{.*}}:2 = triton_nvidia_gpu.warp_group_dot_wait %[[R]]#{{.+}}, %[[R]]#{{.+}} {pendings = 0 : i32} : tensor<128x16xf32, #{{.*}}>, tensor<128x64xf32, #{{.*}}> %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_3, %arg5 = %16, %arg6 = %cst_2) -> (tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x16xf32, #mma1>) : i32 { - %21 = triton_nvidia_gpu.warp_group_dot %19, %20, %arg6 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %21 = triton_nvidia_gpu.warp_group_dot %19, %20, %arg6 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %l = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %c = triton_gpu.local_alloc %l : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %23 = triton_gpu.memdesc_trans %c {order=array} : !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> - %25 = triton_nvidia_gpu.warp_group_dot %cst_4, %23, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> + %c = triton_gpu.local_alloc %l : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %23 = triton_gpu.memdesc_trans %c {order=array} : !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> + %25 = triton_nvidia_gpu.warp_group_dot %cst_4, %23, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> %26 = tt.addptr %arg5, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> scf.yield %25, %26, %21 : tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x16xf32, #mma1> } @@ -577,13 +577,13 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %22:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %12, %arg6 = %21) -> (tensor<128x256xf32, #mma>, tensor<128x64x!tt.ptr, #blocked>, tensor<64x256x!tt.ptr, #blocked1>) : i32 { %35 = tt.load %arg5 : tensor<128x64x!tt.ptr, #blocked> %36 = tt.load %arg6 : tensor<64x256x!tt.ptr, #blocked1> - %37 = triton_gpu.local_alloc %35 : 
(tensor<128x64xf8E5M2, #blocked>) -> !tt.memdesc<128x64xf8E5M2, #shared, #triton_gpu.shared_memory> - %38 = triton_gpu.local_alloc %36 : (tensor<64x256xf8E5M2, #blocked1>) -> !tt.memdesc<64x256xf8E5M2, #shared1, #triton_gpu.shared_memory> + %37 = triton_gpu.local_alloc %35 : (tensor<128x64xf8E5M2, #blocked>) -> !triton_gpu.memdesc<128x64xf8E5M2, #shared, #triton_gpu.shared_memory> + %38 = triton_gpu.local_alloc %36 : (tensor<64x256xf8E5M2, #blocked1>) -> !triton_gpu.memdesc<64x256xf8E5M2, #shared1, #triton_gpu.shared_memory> // CHECK: triton_gpu.local_alloc // CHECK: scf.for // CHECK: triton_nvidia_gpu.warp_group_dot // CHECK-NEXT: triton_nvidia_gpu.warp_group_dot_wait - %39 = triton_nvidia_gpu.warp_group_dot %37, %38, %arg4 {maxNumImpreciseAcc = 1073741824 : i32} : !tt.memdesc<128x64xf8E5M2, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x256xf8E5M2, #shared1, #triton_gpu.shared_memory> -> tensor<128x256xf32, #mma> + %39 = triton_nvidia_gpu.warp_group_dot %37, %38, %arg4 {maxNumImpreciseAcc = 1073741824 : i32} : !triton_gpu.memdesc<128x64xf8E5M2, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x256xf8E5M2, #shared1, #triton_gpu.shared_memory> -> tensor<128x256xf32, #mma> %40 = tt.addptr %arg5, %cst_6 : tensor<128x64x!tt.ptr, #blocked>, tensor<128x64xi32, #blocked> %41 = tt.addptr %arg6, %cst_5 : tensor<64x256x!tt.ptr, #blocked1>, tensor<64x256xi32, #blocked1> scf.yield %39, %40, %41 : tensor<128x256xf32, #mma>, tensor<128x64x!tt.ptr, #blocked>, tensor<64x256x!tt.ptr, #blocked1> @@ -657,8 +657,8 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> // CHECK: %[[LOOP:[^ :]+]]{{.*}} scf.for {{.*}} iter_args(%[[PREV_DOT2:[^ ]+]] // CHECK-NOT: triton_nvidia_gpu.warp_group_dot_wait // CHECK: %[[DOT0:.+]] = triton_nvidia_gpu.warp_group_dot @@ -675,17 +675,17 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: triton_nvidia_gpu.warp_group_dot_wait %[[LOOP]]#3, %[[LOOP]]#0 {pendings = 0 : i32} %17:4 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%prev_dot2 = %cst_3, %arg5 = %16, %prev_dot1 = %cst_2, %prev_dot0 = %cst_2) -> (tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x16xf32, #mma1>, tensor<128x16xf32, #mma1>) : i32 { // This one can be async. 
- %dot0 = triton_nvidia_gpu.warp_group_dot %19, %20, %prev_dot1 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %dot0 = triton_nvidia_gpu.warp_group_dot %19, %20, %prev_dot1 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> // This can't be async because its result is modified before it's yielded. - %dot1 = triton_nvidia_gpu.warp_group_dot %19, %20, %prev_dot1 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %dot1 = triton_nvidia_gpu.warp_group_dot %19, %20, %prev_dot1 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %dot1.1 = arith.addf %dot1, %dot1 : tensor<128x16xf32, #mma1> %l = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %c = triton_gpu.local_alloc %l : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %23 = triton_gpu.memdesc_trans %c {order=array} : !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> + %c = triton_gpu.local_alloc %l : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %23 = triton_gpu.memdesc_trans %c {order=array} : !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> // This dot can be async even though %prev_dot2 is not used directly by an // async dot, because that use follows the synchronous dot above. 
%prev_dot2.1 = arith.addf %prev_dot2, %prev_dot2 : tensor<128x64xf32, #mma> - %dot2 = triton_nvidia_gpu.warp_group_dot %cst_4, %23, %prev_dot2.1 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> + %dot2 = triton_nvidia_gpu.warp_group_dot %cst_4, %23, %prev_dot2.1 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> %26 = tt.addptr %arg5, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> scf.yield %dot2, %26, %dot1.1, %dot0 : tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x16xf32, #mma1>, tensor<128x16xf32, #mma1> } @@ -719,7 +719,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: tma_multiple_store_pipeline tt.func public @tma_multiple_store_pipeline(%arg0: tensor<1xf32, #blocked>, %arg1: !tt.tensordesc>, %arg2: i32, %arg3: i32) attributes {noinline = false} { %c0_i32 = arith.constant 0 : i32 - // CHECK: %[[ALLOC:.+]] = triton_gpu.local_alloc : () -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> + // CHECK: %[[ALLOC:.+]] = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> // CHECK: scf.for scf.for %arg4 = %c0_i32 to %arg3 step %arg2 : i32 { %1 = arith.divsi %arg4, %arg2 : i32 @@ -780,10 +780,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : %11 = tt.broadcast %10 : tensor<128x1xi32, #blocked1> -> tensor<128x128xi32, #blocked1> %12 = tt.addptr %1, %11 : tensor<128x128x!tt.ptr, #blocked1>, tensor<128x128xi32, #blocked1> %13 = tt.load %arg0 : tensor<128x128x!tt.ptr, #blocked> - %14 = triton_gpu.local_alloc %13 : (tensor<128x128xf8E4M3FNUZ, #blocked>) -> !tt.memdesc<128x128xf8E4M3FNUZ, #shared> + %14 = triton_gpu.local_alloc %13 : (tensor<128x128xf8E4M3FNUZ, #blocked>) -> !triton_gpu.memdesc<128x128xf8E4M3FNUZ, #shared> %15 = tt.load %12 : tensor<128x128x!tt.ptr, #blocked1> - %16 = triton_gpu.local_alloc %15 : (tensor<128x128xf8E4M3FNUZ, #blocked1>) -> !tt.memdesc<128x128xf8E4M3FNUZ, #shared1> - %17 = triton_nvidia_gpu.warp_group_dot %14, %16, %arg9 {inputPrecision = 0 : i32, maxNumImpreciseAcc = 1073741824 : i32} : !tt.memdesc<128x128xf8E4M3FNUZ, #shared> * !tt.memdesc<128x128xf8E4M3FNUZ, #shared1> -> tensor<128x128xf32, #mma> + %16 = triton_gpu.local_alloc %15 : (tensor<128x128xf8E4M3FNUZ, #blocked1>) -> !triton_gpu.memdesc<128x128xf8E4M3FNUZ, #shared1> + %17 = triton_nvidia_gpu.warp_group_dot %14, %16, %arg9 {inputPrecision = 0 : i32, maxNumImpreciseAcc = 1073741824 : i32} : !triton_gpu.memdesc<128x128xf8E4M3FNUZ, #shared> * !triton_gpu.memdesc<128x128xf8E4M3FNUZ, #shared1> -> tensor<128x128xf32, #mma> %18 = tt.splat %7 : f32 -> tensor<128x128xf32, #mma> %19 = arith.mulf %17, %18 : tensor<128x128xf32, #mma> %20 = scf.if %6 -> (tensor<128x128xf32, #mma>) { @@ -852,9 +852,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : scf.yield %arg5 : tensor<64x16x!tt.ptr, #blocked> } %18 = tt.load %inc_ptr : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %acc = 
triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> scf.yield %acc_zero : tensor<128x16xf32, #mma1> @@ -917,9 +917,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> %18 = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %cnd = arith.cmpi slt, %arg3, %ext : i32 %if_ret:2 = scf.if %cnd -> (tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked>) { %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> @@ -988,8 +988,8 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %b_block = tt.load %arg6 : tensor<64x16x!tt.ptr, #blocked> %a_dotop = triton_gpu.convert_layout %a_block : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> %a_dotop_mul = arith.mulf %a_dotop, %cst_4 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %b_smem = triton_gpu.local_alloc %b_block : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %21 = triton_nvidia_gpu.warp_group_dot %a_dotop_mul, %b_smem, %arg4 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma> + %b_smem = triton_gpu.local_alloc %b_block : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %21 = triton_nvidia_gpu.warp_group_dot %a_dotop_mul, %b_smem, %arg4 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, 
parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma> %25 = tt.addptr %arg5, %cst_3 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> %26 = tt.addptr %arg6, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> scf.yield %21, %25, %26 : tensor<128x16xf32, #mma>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64x16x!tt.ptr, #blocked> diff --git a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir index 973b35defb..bb7e102c90 100644 --- a/test/TritonGPU/loop-pipeline.mlir +++ b/test/TritonGPU/loop-pipeline.mlir @@ -930,9 +930,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %25 = triton_gpu.memdesc_trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %25 = triton_gpu.memdesc_trans %24 {order=array} : !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> + %26 = triton_gpu.local_load %25 : !triton_gpu.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> } @@ -1041,7 +1041,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: scf.for // CHECK: %[[NEXT_BUFFER_1:.*]] = tt.addptr %{{.*}}, {{.*}} // CHECK: triton_gpu.async_copy_global_to_local %[[NEXT_BUFFER_1]] -// CHECK: %[[IND_BUFFER_0:.*]] = triton_gpu.memdesc_subview {{.*}} : !tt.memdesc<1x16xi64, #[[$SHARED_LAYOUT]], #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16xi64, #[[$SHARED_LAYOUT]], #triton_gpu.shared_memory, mutable> +// CHECK: %[[IND_BUFFER_0:.*]] = triton_gpu.memdesc_subview {{.*}} : !triton_gpu.memdesc<1x16xi64, #[[$SHARED_LAYOUT]], #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16xi64, #[[$SHARED_LAYOUT]], #triton_gpu.shared_memory, mutable> // CHECK: %[[IND_BUFFER_1:.*]] = triton_gpu.local_load %[[IND_BUFFER_0]] // CHECK: %[[IND_BUFFER_2:.*]] = tt.expand_dims %[[IND_BUFFER_1]] {axis = 1 : i32} // CHECK: %[[IND_BUFFER_3:.*]] = tt.broadcast %[[IND_BUFFER_2]] @@ -1361,9 +1361,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : %9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr, 
#blocked>, tensor<16x16xi32, #blocked> scf.for %arg1 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { %10 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> - %11 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> - %12 = triton_gpu.memdesc_trans %11 {order = array} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> - %13 = triton_gpu.local_load %12 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %11 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !triton_gpu.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> + %12 = triton_gpu.memdesc_trans %11 {order = array} : !triton_gpu.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> + %13 = triton_gpu.local_load %12 : !triton_gpu.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> scf.for %arg2 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { %14 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> %15 = triton_gpu.convert_layout %14 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> diff --git a/test/TritonGPU/pipeline-hopper-remove-wait.mlir b/test/TritonGPU/pipeline-hopper-remove-wait.mlir index 5b0149c32b..cd93be2c47 100644 --- a/test/TritonGPU/pipeline-hopper-remove-wait.mlir +++ b/test/TritonGPU/pipeline-hopper-remove-wait.mlir @@ -108,11 +108,11 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %110 = tt.broadcast %109 : tensor<64x128xi64, #blocked> -> tensor<64x128xi64, #blocked> %111 = tt.addptr %101, %110 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi64, #blocked> %112 = tt.load %111 : tensor<64x128x!tt.ptr, #blocked> - %113 = triton_gpu.local_alloc %38 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared> - %114 = triton_gpu.local_alloc %90 : (tensor<128x64xf16, #blocked2>) -> !tt.memdesc<128x64xf16, #shared1> - %115 = triton_nvidia_gpu.warp_group_dot %113, %114, %cst :!tt.memdesc<128x128xf16, #shared> * !tt.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> + %113 = triton_gpu.local_alloc %38 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared> + %114 = triton_gpu.local_alloc %90 : (tensor<128x64xf16, #blocked2>) -> !triton_gpu.memdesc<128x64xf16, #shared1> + %115 = triton_nvidia_gpu.warp_group_dot %113, %114, %cst :!triton_gpu.memdesc<128x128xf16, #shared> * !triton_gpu.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> %116 = arith.truncf %115 : tensor<128x64xf32, #mma> to tensor<128x64xf16, #mma> - %117 = triton_gpu.local_alloc %112 : (tensor<64x128xf16, #blocked>) -> !tt.memdesc<64x128xf16, #shared> + %117 = triton_gpu.local_alloc %112 : (tensor<64x128xf16, #blocked>) -> !triton_gpu.memdesc<64x128xf16, #shared> %118 = triton_gpu.convert_layout %116 : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> // The first dot gets converted to dot-async + wait. The second one // doesn't have a wait because the first wait is sufficient. 
@@ -121,7 +121,7 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: triton_nvidia_gpu.warp_group_dot // CHECK-NOT: triton_nvidia_gpu.warp_group_dot_wait // CHECK: scf.yield - %119 = triton_nvidia_gpu.warp_group_dot %118, %117, %arg23 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x128xf16, #shared> -> tensor<128x128xf32, #mma1> + %119 = triton_nvidia_gpu.warp_group_dot %118, %117, %arg23 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x128xf16, #shared> -> tensor<128x128xf32, #mma1> %120 = arith.mulf %arg24, %arg25 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> %121 = arith.addf %120, %arg25 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> %122 = arith.extsi %c0_i32 : i32 to i64 diff --git a/test/TritonGPU/prefetch.mlir b/test/TritonGPU/prefetch.mlir index 9fbc540b92..1c0eeeb666 100644 --- a/test/TritonGPU/prefetch.mlir +++ b/test/TritonGPU/prefetch.mlir @@ -48,24 +48,24 @@ tt.func @matmul_loop_mixed(%lb : index, %ub : index, %step : index, %A : !tt.ptr %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> %a_ = tt.load %a_ptr_init, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %a_init = triton_gpu.local_alloc %a_ : (tensor<128x32xf8E5M2, #AL>) -> !tt.memdesc<128x32xf8E5M2, #A> + %a_init = triton_gpu.local_alloc %a_ : (tensor<128x32xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x32xf8E5M2, #A> %b_ = tt.load %b_ptr_init, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b_init = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !tt.memdesc<32x128xf16, #B> + %b_init = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !triton_gpu.memdesc<32x128xf16, #B> - %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !tt.memdesc<128x32xf8E5M2, #A>, !tt.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C>) { - %a_op_ = triton_gpu.local_load %a : !tt.memdesc<128x32xf8E5M2, #A> -> tensor<128x32xf8E5M2, #A_OP> + %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x32xf8E5M2, #A>, !triton_gpu.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C>) { + %a_op_ = triton_gpu.local_load %a : !triton_gpu.memdesc<128x32xf8E5M2, #A> -> tensor<128x32xf8E5M2, #A_OP> %a_op = tt.fp_to_fp %a_op_ : tensor<128x32xf8E5M2, #A_OP> -> tensor<128x32xf16, #A_OP> - %b_op = triton_gpu.local_load %b : !tt.memdesc<32x128xf16, #B> -> tensor<32x128xf16, #B_OP> + %b_op = triton_gpu.local_load %b : !triton_gpu.memdesc<32x128xf16, #B> -> tensor<32x128xf16, #B_OP> %c = tt.dot %a_op, %b_op, %prev_c : tensor<128x32xf16, #A_OP> * tensor<32x128xf16, #B_OP> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> %next_a_ = tt.load %next_a_ptr, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x32xf8E5M2, #AL>) -> !tt.memdesc<128x32xf8E5M2, #A> + %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x32xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x32xf8E5M2, #A> %next_b_ = tt.load %next_b_ptr, %b_mask, %b_other : 
tensor<32x128x!tt.ptr, #BL> - %next_b = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !tt.memdesc<32x128xf16, #B> + %next_b = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !triton_gpu.memdesc<32x128xf16, #B> - scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !tt.memdesc<128x32xf8E5M2, #A>, !tt.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C> + scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x32xf8E5M2, #A>, !triton_gpu.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C> } tt.return %loop#4 : tensor<128x128xf32, #C> } @@ -103,24 +103,24 @@ tt.func @matmul_loop_mixed(%lb : index, %ub : index, %step : index, %A : !tt.ptr %b_off = arith.constant dense<4> : tensor<16x128xi32, #BL> %a_ = tt.load %a_ptr_init, %a_mask, %a_other : tensor<128x16x!tt.ptr, #AL> - %a_init = triton_gpu.local_alloc %a_ : (tensor<128x16xf8E5M2, #AL>) -> !tt.memdesc<128x16xf8E5M2, #A> + %a_init = triton_gpu.local_alloc %a_ : (tensor<128x16xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x16xf8E5M2, #A> %b_ = tt.load %b_ptr_init, %b_mask, %b_other : tensor<16x128x!tt.ptr, #BL> - %b_init = triton_gpu.local_alloc %b_ : (tensor<16x128xf16, #BL>) -> !tt.memdesc<16x128xf16, #B> + %b_init = triton_gpu.local_alloc %b_ : (tensor<16x128xf16, #BL>) -> !triton_gpu.memdesc<16x128xf16, #B> - %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x16x!tt.ptr, #AL>, tensor<16x128x!tt.ptr, #BL>, !tt.memdesc<128x16xf8E5M2, #A>, !tt.memdesc<16x128xf16, #B>, tensor<128x128xf32, #C>) { - %a_op_ = triton_gpu.local_load %a : !tt.memdesc<128x16xf8E5M2, #A> -> tensor<128x16xf8E5M2, #A_OP> + %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x16x!tt.ptr, #AL>, tensor<16x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x16xf8E5M2, #A>, !triton_gpu.memdesc<16x128xf16, #B>, tensor<128x128xf32, #C>) { + %a_op_ = triton_gpu.local_load %a : !triton_gpu.memdesc<128x16xf8E5M2, #A> -> tensor<128x16xf8E5M2, #A_OP> %a_op = tt.fp_to_fp %a_op_ : tensor<128x16xf8E5M2, #A_OP> -> tensor<128x16xf16, #A_OP> - %b_op = triton_gpu.local_load %b : !tt.memdesc<16x128xf16, #B> -> tensor<16x128xf16, #B_OP> + %b_op = triton_gpu.local_load %b : !triton_gpu.memdesc<16x128xf16, #B> -> tensor<16x128xf16, #B_OP> %c = tt.dot %a_op, %b_op, %prev_c : tensor<128x16xf16, #A_OP> * tensor<16x128xf16, #B_OP> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x16x!tt.ptr, #AL>, tensor<128x16xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<16x128x!tt.ptr, #BL>, tensor<16x128xi32, #BL> %next_a_ = tt.load %next_a_ptr, %a_mask, %a_other : tensor<128x16x!tt.ptr, #AL> - %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x16xf8E5M2, #AL>) -> !tt.memdesc<128x16xf8E5M2, #A> + %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x16xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x16xf8E5M2, #A> %next_b_ = tt.load %next_b_ptr, %b_mask, %b_other : tensor<16x128x!tt.ptr, #BL> - %next_b = triton_gpu.local_alloc %b_ : (tensor<16x128xf16, #BL>) -> !tt.memdesc<16x128xf16, #B> + %next_b = triton_gpu.local_alloc %b_ : (tensor<16x128xf16, #BL>) -> !triton_gpu.memdesc<16x128xf16, #B> - scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x16x!tt.ptr, #AL>, tensor<16x128x!tt.ptr, 
#BL>, !tt.memdesc<128x16xf8E5M2, #A>, !tt.memdesc<16x128xf16, #B>, tensor<128x128xf32, #C> + scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x16x!tt.ptr, #AL>, tensor<16x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x16xf8E5M2, #A>, !triton_gpu.memdesc<16x128xf16, #B>, tensor<128x128xf32, #C> } tt.return %loop#4 : tensor<128x128xf32, #C> } @@ -221,24 +221,24 @@ tt.func @matmul_loop_mixed_amd(%lb : index, %ub : index, %step : index, %A : !tt %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> %a_ = tt.load %a_ptr_init, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %a_init = triton_gpu.local_alloc %a_ : (tensor<128x32xf8E5M2, #AL>) -> !tt.memdesc<128x32xf8E5M2, #A> + %a_init = triton_gpu.local_alloc %a_ : (tensor<128x32xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x32xf8E5M2, #A> %b_ = tt.load %b_ptr_init, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b_init = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !tt.memdesc<32x128xf16, #B> + %b_init = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !triton_gpu.memdesc<32x128xf16, #B> - %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !tt.memdesc<128x32xf8E5M2, #A>, !tt.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C>) { - %a_op_ = triton_gpu.local_load %a : !tt.memdesc<128x32xf8E5M2, #A> -> tensor<128x32xf8E5M2, #A_OP> + %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x32xf8E5M2, #A>, !triton_gpu.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C>) { + %a_op_ = triton_gpu.local_load %a : !triton_gpu.memdesc<128x32xf8E5M2, #A> -> tensor<128x32xf8E5M2, #A_OP> %a_op = tt.fp_to_fp %a_op_ : tensor<128x32xf8E5M2, #A_OP> -> tensor<128x32xf16, #A_OP> - %b_op = triton_gpu.local_load %b : !tt.memdesc<32x128xf16, #B> -> tensor<32x128xf16, #B_OP> + %b_op = triton_gpu.local_load %b : !triton_gpu.memdesc<32x128xf16, #B> -> tensor<32x128xf16, #B_OP> %c = tt.dot %a_op, %b_op, %prev_c : tensor<128x32xf16, #A_OP> * tensor<32x128xf16, #B_OP> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> %next_a_ = tt.load %next_a_ptr, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x32xf8E5M2, #AL>) -> !tt.memdesc<128x32xf8E5M2, #A> + %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x32xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x32xf8E5M2, #A> %next_b_ = tt.load %next_b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %next_b = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !tt.memdesc<32x128xf16, #B> + %next_b = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !triton_gpu.memdesc<32x128xf16, #B> - scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !tt.memdesc<128x32xf8E5M2, #A>, !tt.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C> + scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x32xf8E5M2, #A>, !triton_gpu.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C> } tt.return %loop#4 : 
tensor<128x128xf32, #C> } diff --git a/test/TritonGPU/reduce-data-duplication.mlir b/test/TritonGPU/reduce-data-duplication.mlir index 9fca92c9b0..67bf5bdbcc 100644 --- a/test/TritonGPU/reduce-data-duplication.mlir +++ b/test/TritonGPU/reduce-data-duplication.mlir @@ -2,7 +2,7 @@ // CHECK: #[[$SHARED:.*]] = #triton_gpu.shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1], hasLeadingOffset = false} // CHECK-LABEL: apply_swizzle -// CHECK: %{{.*}} = triton_gpu.local_alloc %{{.*}} : (tensor<16x256xf16, #{{.*}}>) -> !tt.memdesc<16x256xf16, #[[$SHARED]], #triton_gpu.shared_memory> +// CHECK: %{{.*}} = triton_gpu.local_alloc %{{.*}} : (tensor<16x256xf16, #{{.*}}>) -> !triton_gpu.memdesc<16x256xf16, #[[$SHARED]], #triton_gpu.shared_memory> #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}> diff --git a/test/TritonGPU/reorder-instructions.mlir b/test/TritonGPU/reorder-instructions.mlir index dff1e6b60f..28f8d385cf 100644 --- a/test/TritonGPU/reorder-instructions.mlir +++ b/test/TritonGPU/reorder-instructions.mlir @@ -13,8 +13,8 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %cst_0 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %9 = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %10 = triton_gpu.local_alloc %9 : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %11 = triton_gpu.local_load %10 : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %10 = triton_gpu.local_alloc %9 : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %11 = triton_gpu.local_load %10 : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> %12 = tt.dot %11, %cst_0, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> %13 = triton_gpu.convert_layout %12 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> tt.store %arg0, %13 : tensor<32x32x!tt.ptr, #blocked> @@ -26,20 +26,20 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war // CHECK-LABEL: sink_convert_dealloc // CHECK: triton_gpu.async_wait {num = 0 : i32} -// CHECK: triton_gpu.local_dealloc %0 : !tt.memdesc<4x128x64xf16, #shared, mutable> -// CHECK: triton_gpu.local_dealloc %1 : !tt.memdesc<4x128x64xf16, #shared, mutable> +// CHECK: triton_gpu.local_dealloc %0 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> +// CHECK: triton_gpu.local_dealloc %1 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> // CHECK: %3 = triton_gpu.convert_layout %arg0 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1> #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [1, 0]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1]}> module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { tt.func public @sink_convert_dealloc(%arg0: 
tensor<32x32xf32, #blocked>) attributes {noinline = false} { - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<4x128x64xf16, #shared, mutable> - %1 = triton_gpu.local_alloc : () -> !tt.memdesc<4x128x64xf16, #shared, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> + %1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> %2 = triton_gpu.convert_layout %arg0 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1> triton_gpu.async_wait {num = 0 : i32} - triton_gpu.local_dealloc %0 : !tt.memdesc<4x128x64xf16, #shared, mutable> - triton_gpu.local_dealloc %1 : !tt.memdesc<4x128x64xf16, #shared, mutable> + triton_gpu.local_dealloc %0 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> + triton_gpu.local_dealloc %1 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> %3 = arith.addf %2, %2 : tensor<32x32xf32, #blocked1> tt.return } @@ -48,8 +48,8 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war // ----- // CHECK-LABEL: sink_convert_idx_1 -// CHECK: triton_gpu.local_load %{{.*}} : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> -// CHECK: triton_gpu.local_load %{{.*}} : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> +// CHECK: triton_gpu.local_load %{{.*}} : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> +// CHECK: triton_gpu.local_load %{{.*}} : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> // CHECK: tt.dot #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2]}> @@ -58,12 +58,12 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war tt.func public @sink_convert_idx_1(%arg0: tensor<32x32x!tt.ptr, #blocked>) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %B = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %BS = triton_gpu.local_alloc %B : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %BD = triton_gpu.local_load %BS : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %BS = triton_gpu.local_alloc %B : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %BD = triton_gpu.local_load %BS : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %cst_0 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %A = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %AS = triton_gpu.local_alloc %A : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %AD = triton_gpu.local_load %AS : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %AS = triton_gpu.local_alloc %A : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %AD = triton_gpu.local_load %AS : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> %12 = tt.dot %AD, %BD, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 
0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> %13 = triton_gpu.convert_layout %12 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> tt.store %arg0, %13 : tensor<32x32x!tt.ptr, #blocked> @@ -75,10 +75,10 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war // check that we don't sink convert_layout if it has multi users // CHECK-LABEL: convert_cannot_sink -// CHECK: triton_gpu.local_load %{{.*}} : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -// CHECK: triton_gpu.local_load %{{.*}} : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> +// CHECK: triton_gpu.local_load %{{.*}} : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> +// CHECK: triton_gpu.local_load %{{.*}} : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> // CHECK: tt.dot -// CHECK: triton_gpu.local_load %{{.*}} : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> +// CHECK: triton_gpu.local_load %{{.*}} : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> // CHECK: tt.dot #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2]}> @@ -87,15 +87,15 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war tt.func public @convert_cannot_sink(%arg0: tensor<32x32x!tt.ptr, #blocked>) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %B = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %BS = triton_gpu.local_alloc %B : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %BD = triton_gpu.local_load %BS : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %BS = triton_gpu.local_alloc %B : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %BD = triton_gpu.local_load %BS : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %A0 = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %AS0 = triton_gpu.local_alloc %A0 : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %AD0 = triton_gpu.local_load %AS0 : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %AS0 = triton_gpu.local_alloc %A0 : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %AD0 = triton_gpu.local_load %AS0 : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> %12 = tt.dot %AD0, %BD, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> %A1 = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %AS1 = triton_gpu.local_alloc %A1 : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %AD1 = triton_gpu.local_load %AS1 : !tt.memdesc<32x32xf32, 
#shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %AS1 = triton_gpu.local_alloc %A1 : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %AD1 = triton_gpu.local_load %AS1 : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> %13 = tt.dot %AD1, %BD, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> tt.return } diff --git a/test/TritonGPU/tritongpu_ops.mlir b/test/TritonGPU/tritongpu_ops.mlir index d5c6a52e8e..3fc0585b12 100644 --- a/test/TritonGPU/tritongpu_ops.mlir +++ b/test/TritonGPU/tritongpu_ops.mlir @@ -4,8 +4,8 @@ module attributes {"triton_gpu.target" = "cuda:0", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK-LABEL: memdesc - // CHECK-SAME: !tt.memdesc<1x64x16xf16, #{{.+}}> - tt.func @memdesc(%d : !tt.memdesc<1x64x16xf16, #shared0>) { + // CHECK-SAME: !triton_gpu.memdesc<1x64x16xf16, #{{.+}}> + tt.func @memdesc(%d : !triton_gpu.memdesc<1x64x16xf16, #shared0>) { tt.return } } diff --git a/test/TritonNvidiaGPU/membar.mlir b/test/TritonNvidiaGPU/membar.mlir index 6d9c166508..924216222a 100644 --- a/test/TritonNvidiaGPU/membar.mlir +++ b/test/TritonNvidiaGPU/membar.mlir @@ -9,8 +9,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: init_barrier tt.func @init_barrier() { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_nvidia_gpu.init_barrier %alloc, 1 : !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_nvidia_gpu.init_barrier %alloc, 1 : !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> tt.return } } @@ -28,9 +28,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: inval_barrier tt.func @inval_barrier() { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_nvidia_gpu.init_barrier %alloc, 1 : !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_nvidia_gpu.inval_barrier %alloc : !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_nvidia_gpu.init_barrier %alloc, 1 : !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_nvidia_gpu.inval_barrier %alloc : !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> tt.return } } @@ -48,8 +48,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: barrier_expect tt.func @barrier_expect(%pred : i1) { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_nvidia_gpu.init_barrier %alloc, 1 : !tt.memdesc<1xi64, 
#shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_nvidia_gpu.init_barrier %alloc, 1 : !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> triton_nvidia_gpu.barrier_expect %alloc, 16384, %pred : <1xi64, #shared0, #triton_gpu.shared_memory, mutable> tt.return } @@ -68,8 +68,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: wait_barrier tt.func @wait_barrier(%phase : i32) { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_nvidia_gpu.init_barrier %alloc, 1 : !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_nvidia_gpu.init_barrier %alloc, 1 : !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> triton_nvidia_gpu.wait_barrier %alloc, %phase : <1xi64, #shared0, #triton_gpu.shared_memory, mutable> tt.return } @@ -89,8 +89,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: gpu.barrier // CHECK-NEXT: init_barrier %cst = arith.constant dense<0> : tensor<128x64xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<128x64xi64, #blocked0>) -> !tt.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %alloc : !tt.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<128x64xi64, #blocked0>) -> !triton_gpu.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %alloc : !triton_gpu.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> %l = tt.experimental_descriptor_load %arg0[%arg1, %arg1] : !tt.tensordesc> -> tensor<128x64xf16, #blocked0> tt.return %l : tensor<128x64xf16, #blocked0> } @@ -108,8 +108,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: triton_gpu.local_alloc tt.func public @tma_store(%arg0: !tt.tensordesc>, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: tensor<128x256xf32, #blocked0>) { %cst = arith.constant dense<0> : tensor<128x64xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<128x64xi64, #blocked0>) -> !tt.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %alloc : !tt.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<128x64xi64, #blocked0>) -> !triton_gpu.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %alloc : !triton_gpu.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> tt.experimental_descriptor_store %arg0[%arg1, %arg1], %arg2 : !tt.tensordesc>, tensor<128x256xf32, #blocked0> tt.return } diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp index d3ffaed2e8..208483beb8 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -9,6 +9,7 @@ using ::mlir::triton::gpu::AMDMfmaEncodingAttr; using 
::mlir::triton::gpu::AMDWmmaEncodingAttr; using ::mlir::triton::gpu::DotOperandEncodingAttr; using ::mlir::triton::gpu::getTotalElemsPerThread; +using ::mlir::triton::gpu::MemDescType; using ::mlir::triton::gpu::SharedEncodingAttr; namespace SharedToDotOperandMFMA { diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp index 9043090802..c79df66c48 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp @@ -198,7 +198,7 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter, const SharedMemoryObject &smemObj, const LLVMTypeConverter *typeConverter, Value thread) { assert((opIdx == 0 || opIdx == 1) && "unexpected operand idx"); - auto aTensorTy = cast(tensor.getType()); + auto aTensorTy = cast(tensor.getType()); ArrayRef shape = aTensorTy.getShape(); auto rank = shape.size(); int kDimIdx = opIdx == 0 ? rank - 1 : rank - 2; diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp index 1ca9e49745..7f037b89b8 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp @@ -152,7 +152,7 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter, assert(wmmaLayout.getMNKDimPerInstr()[nonKDimIdx] == 16); auto warpsPerCTA = wmmaLayout.getWarpsPerCTA(); - auto aTensorTy = cast(tensor.getType()); + auto aTensorTy = cast(tensor.getType()); ArrayRef shape = aTensorTy.getShape(); auto sharedLayout = cast(aTensorTy.getEncoding()); auto order = sharedLayout.getOrder(); diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 1a4dd8227c..2088fd8073 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -258,7 +258,7 @@ void StreamPipeliner::createStreamCopy(tt::LoadOp loadOp, Value alloc, Value mask = loadOp.getMask(); Value other = loadOp.getOther(); - tt::MemDescType allocTy = cast(alloc.getType()); + ttg::MemDescType allocTy = cast(alloc.getType()); SmallVector copyOffsets(allocTy.getRank(), zero); Operation *copy = builder.clone(*loadOp); @@ -271,7 +271,7 @@ void StreamPipeliner::createStreamCopy(tt::LoadOp loadOp, Value alloc, loadOffsets[0] = extractIdx; auto sharedMemorySpace = triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); - auto subviewTy = tt::MemDescType::get( + auto subviewTy = ttg::MemDescType::get( allocTy.getShape().drop_front(), allocTy.getElementType(), allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); auto viewLoad = @@ -330,7 +330,7 @@ getSharedEncIfAllUsersAreDotEnc(Value val) { if (user->getNumResults() != 1) return std::nullopt; if (auto memDesc = - dyn_cast(user->getResult(0).getType())) { + dyn_cast(user->getResult(0).getType())) { // First time we find a shared encoding in the chain, save it and try to // use it if it is compatible with the other users. 
tempAttr = cast(memDesc.getEncoding()); @@ -340,10 +340,11 @@ getSharedEncIfAllUsersAreDotEnc(Value val) { if (!isa(user)) return std::nullopt; auto dotOpEnc = dyn_cast( - cast(user->getResult(0).getType()).getEncoding()); + cast(user->getResult(0).getType()) + .getEncoding()); if (!dotOpEnc) return std::nullopt; - auto srcTy = cast(val.getType()); + auto srcTy = cast(val.getType()); auto CTALayout = ttg::getCTALayout(srcTy.getEncoding()); auto order = ttg::getOrder(srcTy.getEncoding()); unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth(); @@ -669,9 +670,9 @@ Value StreamPipeliner::createAlloc(Operation *loadOp, auto ty = cast(loadOp->getResultTypes()[0]); SmallVector bufferShape(ty.getShape().begin(), ty.getShape().end()); bufferShape.insert(bufferShape.begin(), numBuffers); - Type memdescType = tt::MemDescType::get(bufferShape, ty.getElementType(), - sharedEnc, sharedMemorySpace, - /*mutableMemory=*/true); + Type memdescType = ttg::MemDescType::get(bufferShape, ty.getElementType(), + sharedEnc, sharedMemorySpace, + /*mutableMemory=*/true); auto alloc = builder.create(loadOp->getLoc(), memdescType, Value()); sharedMemAllocs.push_back(alloc); diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2OrV3.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2OrV3.cpp index 8c91eb2ebf..b9aac96cbf 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2OrV3.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2OrV3.cpp @@ -14,6 +14,7 @@ using ::mlir::triton::gpu::getOrder; using ::mlir::triton::gpu::getShapePerCTA; using ::mlir::triton::gpu::getSizePerThread; using ::mlir::triton::gpu::getTotalElemsPerThread; +using ::mlir::triton::gpu::MemDescType; using ::mlir::triton::gpu::SharedEncodingAttr; // Data loader for mma.16816 instruction. 
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp index 40cb55bbc0..29a76f6033 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp @@ -50,7 +50,7 @@ class DecomposeLocalLoadToDotOperand blockEncoding); Value load = rewriter.create(op.getLoc(), tmpType, op.getSrc()); - auto newSharedDescTy = triton::MemDescType::get( + auto newSharedDescTy = MemDescType::get( type.getShape(), type.getElementType(), triton::gpu::SharedEncodingAttr::get( op.getContext(), dstDotOp, type.getShape(), diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/WGMMA.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/WGMMA.cpp index 9b1667db70..85f7da2cb5 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/WGMMA.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/WGMMA.cpp @@ -30,6 +30,7 @@ using namespace mlir::triton; using ::mlir::LLVM::getSharedMemoryObjectFromStruct; using ::mlir::triton::gpu::getShapePerCTA; using ::mlir::triton::gpu::getShapePerCTATile; +using ::mlir::triton::gpu::MemDescType; using ::mlir::triton::gpu::NvidiaMmaEncodingAttr; using ::mlir::triton::gpu::SharedEncodingAttr; @@ -47,7 +48,7 @@ triton::nvgpu::WGMMAEltType getMmaRetType(Value d) { } triton::nvgpu::WGMMAEltType getMmaOperandType(Value a, bool allowTF32) { - auto aTy = cast(a.getType()).getElementType(); + auto aTy = cast(a.getType()).getElementType(); if (aTy.isF16()) { return triton::nvgpu::WGMMAEltType::f16; } else if (aTy.isBF16()) { @@ -197,7 +198,7 @@ DotOpMmaV3SmemLoader loadA(const LLVMTypeConverter *typeConverter, ConversionPatternRewriter &rewriter, Location loc, const NvidiaMmaEncodingAttr &mmaEncoding, Value tensor, Value smemObjBase, Value thread) { - auto aTy = cast(tensor.getType()); + auto aTy = cast(tensor.getType()); auto aSharedLayout = dyn_cast(aTy.getEncoding()); assert(aSharedLayout && "only support load dot operand from shared."); auto instrShape = mmaEncoding.getInstrShape(); @@ -378,8 +379,8 @@ LogicalResult convertDot(const LLVMTypeConverter *typeConverter, Value loadedC, bool allowTF32, bool needsPartialAccumulator, uint32_t maxNumImpreciseAcc, bool sync, Value thread) { - auto aTensorTy = cast(a.getType()); - auto bTensorTy = cast(b.getType()); + auto aTensorTy = cast(a.getType()); + auto bTensorTy = cast(b.getType()); auto dTensorTy = cast(d.getType()); auto aSharedLayout = dyn_cast(aTensorTy.getEncoding()); auto bSharedLayout = cast(bTensorTy.getEncoding()); From c8a31a025df9726c20c6da9c67484084d32a705b Mon Sep 17 00:00:00 2001 From: Thomas Raoux Date: Thu, 21 Nov 2024 14:55:49 -0800 Subject: [PATCH 02/12] [AMD] Prevent wrong reordering of scf operations (#5203) The pass was reordering scf.if operations without checking the extra dependencies coming from the region. For now just prevent this case although this part of the code might still be fragile. 
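
For reference, a condensed sketch of the guard this change adds to the backward-slice filter (names mirror the diff below, but the concrete op classes in the `isa<>` check are abbreviated and illustrative — see the actual change for the real list):

```
// While gathering the backward slice of an op we would like to move, note
// whether any producer is a region-holding control-flow op such as scf.if.
// The slice does not include values that are only used inside such regions,
// so reordering across that producer could break region-carried dependencies.
for (Operation *op : candidateOps) {   // candidateOps is a placeholder name
  bool dontReorder = false;
  BackwardSliceOptions options;
  options.filter = [&](Operation *defOp) {
    if (isa<scf::IfOp>(defOp)) {       // illustrative; the real check may
      dontReorder = true;              // cover more region-holding ops
      return false;
    }
    return true;                       // other producers handled as before
  };
  SetVector<Operation *> backwardSet;
  mlir::getBackwardSlice(op, &backwardSet, options);
  if (dontReorder)
    continue;                          // conservatively keep program order
  // ... existing scheduling of the op ...
}
```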
--- .../amd/amd-reorder-instructions.mlir | 37 +++++++++++++++++++ .../ReorderInstructions.cpp | 11 ++++++ 2 files changed, 48 insertions(+) diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir index 51353d31c3..708d75a232 100644 --- a/test/TritonGPU/amd/amd-reorder-instructions.mlir +++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir @@ -499,3 +499,40 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war tt.return } } + + +// ----- + +#mfma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [32, 32], isTransposed = true}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, triton_gpu.target = "hip:gfx90a", "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: dont_hoist_scf_ops + // Make sure we don't hoist scf ops above its dependencies. + tt.func public @dont_hoist_scf_ops(%init: tensor<256x128xf32, #mfma>, + %base: tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>>, + %p1: tensor<128x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>>, %i1: i1) -> (tensor<256x128xf32, #mfma>) { + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c4_i32 = arith.constant 4 : i32 + %cst = arith.constant 1.44269502 : f32 + %c128_i32 = arith.constant 128 : i32 + // CHECK: scf.for + %54 = scf.for %arg21 = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%arg = %init) -> (tensor<256x128xf32, #mfma>) : i32 { + // CHECK: arith.addi + %f = arith.addi %arg21, %c128_i32 : i32 + // CHECK: scf.if + // CHECK: tt.load + %p0 = scf.if %i1 -> tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>>{ + %t = tt.splat %f : i32 -> tensor<256x128xi32> + %padd = tt.addptr %base, %t : tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>>, tensor<256x128xi32> + scf.yield %padd : tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> + } else { + scf.yield %base : tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> + } + %l = tt.load %p0 : tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> + %r = tt.load %p1 : tensor<128x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> + %acc = tt.dot %l, %r, %arg : tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> * tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> -> tensor<256x128xf32, #mfma> + scf.yield %acc : tensor<256x128xf32, #mfma> + } + tt.return %54 : tensor<256x128xf32, #mfma> + } +} diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp index f55ab78554..0837f16dcf 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp @@ -227,6 +227,7 @@ static void scheduleGlobalLoadLocalStore(triton::FuncOp funcOp) { // Gather use-def chain in block. 
Block *block = op->getBlock(); bool leadsToLoad = false; + bool dontReorder = false; SetVector backwardSet; BackwardSliceOptions options; @@ -236,6 +237,13 @@ static void scheduleGlobalLoadLocalStore(triton::FuncOp funcOp) { Block *defBlock = defOp->getBlock(); if (!block->findAncestorOpInBlock(*defOp)) return false; + // Don't hoist control flow as we don't track backtraces of ops within + // their regions. + if (isa(defOp)) { + dontReorder = true; + return false; + } + // Check for a `load` dependent path. leadsToLoad |= isa(defOp); // Only move ops residing in the same block. @@ -244,6 +252,9 @@ static void scheduleGlobalLoadLocalStore(triton::FuncOp funcOp) { mlir::getBackwardSlice(op, &backwardSet, options); backwardSet.insert(op); + // If we found ops in the slice we don't want to hoist. + if (dontReorder) + continue; // Don't move a local_store if its source is a load from // the same iteration. if (isa(op) && leadsToLoad) From 3164a4e2d5a9cb63527bd6226c6fe2405188eb66 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 21 Nov 2024 17:33:07 -0600 Subject: [PATCH 03/12] [AMD] Cover default case in MfmaGroup (#5218) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If you build using the `CMakeLists.txt` and not `setup.py` and you build in `Release` then you get ``` /__w/triton/triton/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp: In function ‘std::pair mlir::TypesFromMfmaId(MLIRContext*, MfmaTypeId)’: Warning: /__w/triton/triton/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp:240:1: warning: control reaches end of non-void function [-Wreturn-type] ``` --- third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp index d3b2b70f85..9fce18e21f 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp @@ -235,8 +235,9 @@ std::pair TypesFromMfmaId(mlir::MLIRContext *ctx, return {f8e5m2fnuz, f8e4m3fnuz}; case MfmaTypeId::Bf8Bf8TyId: return {f8e5m2fnuz, f8e5m2fnuz}; + default: + llvm_unreachable("unsupported MfmaTypeId!"); } - assert(false && "unsupported MfmaTypeId"); } FailureOr MfmaInsn::selectMfma(unsigned mDim, unsigned nDim, From 82b8f0f5983fb7fd5b40b52089f8177a0ea6561a Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Thu, 21 Nov 2024 16:53:48 -0700 Subject: [PATCH 04/12] Allow Layouts to propogate to local_load (#5219) While working on some higher dimension tensor kernels, I noticed poor performance due to the fact that layouts wouldn't propagate to local loads. Since we do allow layout folding with local store and local alloc, this seems like a bit of an oversight. The change gives a 40% speed improvement on certain kernels for NVidia GPUs. This also removes asserts in lowering for higher dimensional kernels. As far as I can tell, those restrictions aren't required in practice. # New contributor declaration - [x] I am not making a trivial change, such as fixing a typo in a comment. - [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how). - [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`. - [x] I have added tests. 
- [x] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices) --- lib/Dialect/TritonGPU/Transforms/Utility.cpp | 3 ++- test/TritonGPU/combine.mlir | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp index b8f3abfcac..5914eb2905 100644 --- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp @@ -563,7 +563,8 @@ bool canFoldIntoConversion(Operation *op, Attribute targetEncoding) { } return isa(op); + triton::gpu::LocalAllocOp, triton::gpu::LocalLoadOp, + triton::gpu::LocalStoreOp>(op); } scf::ForOp replaceForOpWithNewSignature( diff --git a/test/TritonGPU/combine.mlir b/test/TritonGPU/combine.mlir index 5e1cad52af..129eb8c101 100644 --- a/test/TritonGPU/combine.mlir +++ b/test/TritonGPU/combine.mlir @@ -2685,3 +2685,21 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war tt.return } } + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [2, 1, 16, 1, 1], warpsPerCTA = [1, 1, 2, 2, 1], order = [4, 0, 1, 2, 3]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [1, 1, 32, 1, 1], warpsPerCTA = [1, 1, 1, 1, 4], order = [4, 3, 2, 1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [2, 1, 16, 1, 1], warpsPerCTA = [1, 2, 2, 1, 1], order = [4, 0, 3, 2, 1]}> +#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [4, 0, 1, 2, 3], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:100", "triton_gpu.threads-per-warp" = 32 : i32} { + // CHECK-LABEL: lift_convert_to_local_load + // CHECK-NOT: convert_layout + // CHECK: tt.return + tt.func public @lift_convert_to_local_load(%arg0 : !triton_gpu.memdesc<2x1x32x4x4xi8, #shared, #triton_gpu.shared_memory, mutable>) -> tensor<2x4x32x1x4xi8, #blocked2> { + %1 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<2x1x32x4x4xi8, #shared, #triton_gpu.shared_memory, mutable> -> tensor<2x1x32x4x4xi8, #blocked> + %2 = tt.trans %1 {order = array} : tensor<2x1x32x4x4xi8, #blocked> -> tensor<2x4x32x1x4xi8, #blocked1> + %3 = triton_gpu.convert_layout %2 : tensor<2x4x32x1x4xi8, #blocked1> -> tensor<2x4x32x1x4xi8, #blocked2> + tt.return %3 : tensor<2x4x32x1x4xi8, #blocked2> + } +} From 4ae95e70cd81eb62f89ec530605440b85e799dee Mon Sep 17 00:00:00 2001 From: Thomas Raoux Date: Thu, 21 Nov 2024 19:42:10 -0800 Subject: [PATCH 05/12] [BACKEND] Fix transpose optimization missed during refactor (#5226) --- .../Transforms/OptimizeDotOperands.cpp | 4 ++-- test/TritonGPU/dot-operands.mlir | 20 +++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp b/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp index b6b376101a..c776944a24 100644 --- a/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp +++ b/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp @@ -326,13 +326,13 @@ class FuseTransHopper : public OpRewritePattern { return failure(); // Match outerCvt(trans(innerCvt(x))). 
- auto trans = allocOp.getSrc().getDefiningOp(); + auto trans = allocOp.getSrc().getDefiningOp(); if (!trans || trans.getOrder() != ArrayRef({1, 0})) return failure(); MemDescType allocType = allocOp.getType(); auto allocEncoding = cast(allocType.getEncoding()); - MemDescType srcTy = trans.getSrc().getType(); + RankedTensorType srcTy = trans.getSrc().getType(); // MMAv3 with transpose only supports f16 and bf16. Fall back to MMAv3 // without transpose for other data types.) diff --git a/test/TritonGPU/dot-operands.mlir b/test/TritonGPU/dot-operands.mlir index 911cf4fb40..990a0b4f7a 100644 --- a/test/TritonGPU/dot-operands.mlir +++ b/test/TritonGPU/dot-operands.mlir @@ -256,3 +256,23 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : tt.return %r : tensor<128x64xf32, #mma> } } + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> +module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +// CHECK-LABEL: mma_reorder_transpose +// CHECK: triton_gpu.local_alloc +// CHECK: triton_gpu.memdesc_trans +// CHECK: triton_nvidia_gpu.warp_group_dot + tt.func @mma_reorder_transpose(%t: tensor<64x128xf16, #blocked1>, %dotb: !triton_gpu.memdesc<64x64xf16, #shared>, %dotc: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ + %a = tt.trans %t {order = array} : tensor<64x128xf16, #blocked1> -> tensor<128x64xf16, #blocked> + %dota = triton_gpu.local_alloc %a: (tensor<128x64xf16, #blocked>) -> !triton_gpu.memdesc<128x64xf16, #shared1> + %r = triton_nvidia_gpu.warp_group_dot %dota, %dotb, %dotc : !triton_gpu.memdesc<128x64xf16, #shared1> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + tt.return %r : tensor<128x64xf32, #mma> + } +} From af0649d2fcbbe8e486a5a2d397d798a476b9c22a Mon Sep 17 00:00:00 2001 From: ilia-cher <30845429+ilia-cher@users.noreply.github.com> Date: Fri, 22 Nov 2024 01:11:56 -0600 Subject: [PATCH 06/12] [AMD] Use warp shuffle for fp8 MFMA to dot operand layout conversion (#5139) Adding a shortcut case for fp8 MFMA to dot operand layout conversion that avoids using shared memory, to speed up FP8 attention kernels. --- include/triton/Analysis/Utility.h | 5 + lib/Analysis/Utility.cpp | 25 ++- .../TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp | 6 + test/Conversion/amd/mfma-shortcut.mlir | 190 +++++++++++++++++- .../ConvertLayoutOpToLLVM.cpp | 154 ++++++++++++++ 5 files changed, 378 insertions(+), 2 deletions(-) diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h index df6029db0d..ae517912fb 100644 --- a/include/triton/Analysis/Utility.h +++ b/include/triton/Analysis/Utility.h @@ -218,6 +218,11 @@ bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy); bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy, RankedTensorType dstTy); +// Check if MFMA layout can be converted to the dot operand +// layout using warp shuffle. 
+bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy, + RankedTensorType dstTy); + // TODO: Move utility functions that belong to ConvertLayoutOp to class // ConvertLayoutOpHelper in the future bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout); diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 6166e10199..5fd87e4c01 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -10,6 +10,7 @@ #include "mlir/IR/Dialect.h" #include "mlir/IR/Matchers.h" #include "mlir/Support/LLVM.h" +#include "triton/Conversion/MLIRTypes.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/Triton/IR/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" @@ -632,6 +633,25 @@ bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy, return ans; } +bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy, + RankedTensorType dstTy) { + auto mfmaLayout = dyn_cast(srcTy.getEncoding()); + auto dotOperandLayout = dyn_cast(dstTy.getEncoding()); + if (!mfmaLayout || !dotOperandLayout) + return false; + + // Currently supporting 32x32 and 16x16 FP8 MFMA -> dot operand case + return dotOperandLayout.getParent() == mfmaLayout && + dotOperandLayout.getOpIdx() == 0 && mfmaLayout.getIsTransposed() && + dotOperandLayout.getKWidth() == 8 && + getContigPerThread(mfmaLayout)[1] == 4 && + ((mfmaLayout.getMDim() == 16 && mfmaLayout.getNDim() == 16) || + (mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32)) && + triton::type::isFloat8(srcTy.getElementType()) && + triton::type::isFloat8(dstTy.getElementType()) && + mfmaLayout.getWarpsPerCTA()[1] == 1; +} + // We get the smallest submap of srcTy^{-1} * dstTy that is not the identity // under kBlock, kWarp or kLane (in that order). The idea here is that if we // have a transformation that's the identity on kBlock, we don't need to use @@ -730,7 +750,10 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) { // supported yet in Triton's backend. 
return !cvtReordersRegisters(srcTy, dstTy) && !isBlockedToDotShortcut(srcTy, dstTy) && - !matchMmaV3AndDotOperandLayout(srcTy, dstTy); + !matchMmaV3AndDotOperandLayout(srcTy, dstTy) && + // to be removed when generalized warp shuffle conversions + // are ready: + !matchMFMAAndDotOperandShuffleCase(srcTy, dstTy); } bool atomicNeedsSharedMemory(Value value) { diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp index aab97c7dd2..f0026c1993 100644 --- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -409,6 +409,12 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion return failure(); } + // The following check can be removed when generalized warp shuffle + // conversions are ready: + if (matchMFMAAndDotOperandShuffleCase(srcTy, dstTy)) { + return failure(); + } + assert(cvtNeedsSharedMemory(srcTy, dstTy)); SmallVector inVals = diff --git a/test/Conversion/amd/mfma-shortcut.mlir b/test/Conversion/amd/mfma-shortcut.mlir index a2c8f48718..bcbc7eff59 100644 --- a/test/Conversion/amd/mfma-shortcut.mlir +++ b/test/Conversion/amd/mfma-shortcut.mlir @@ -1,4 +1,4 @@ -// RUN: triton-opt %s --decompose-unsupported-amd-conversions --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch="gfx90a" -split-input-file | FileCheck %s +// RUN: triton-opt %s --decompose-unsupported-amd-conversions --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch="gfx942" -split-input-file | FileCheck %s #mfma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}> #dotop = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=4}> @@ -27,3 +27,191 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return } } + +// ----- + +#mfma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}> +#dotop0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=8}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: mfma_dot_cvt_f8_mfma32 + tt.func public @mfma_dot_cvt_f8_mfma32(%arg0: tensor<128x32xf8E4M3FNUZ, #mfma>) { + // CHECK-NOT: store + // CHECK-NOT: load + + // CHECK: [[val3:%.*]] = llvm.extractvalue %arg0[3] + // CHECK: [[val7:%.*]] = llvm.extractvalue %arg0[7] + + // CHECK-DAG: [[c32:%.*]] = llvm.mlir.constant(32 : i32) + // CHECK-DAG: [[c64:%.*]] = llvm.mlir.constant(64 : i32) + + // CHECK: [[threadId:%.*]] = rocdl.workitem.id.x + // CHECK: [[laneId:%.*]] = llvm.urem [[threadId]], [[c64]] + // CHECK: [[mask0:%.*]] = llvm.icmp "slt" [[laneId]], [[c32]] + + // CHECK: [[shflLaneId:%.*]] = llvm.add [[laneId]], [[c32]] + // CHECK: [[addr32:%.*]] = llvm.urem [[shflLaneId]], [[c64]] + + // CHECK: [[vec0:%.*]] = llvm.insertelement [[val3]], {{.*}} : vector<4xi8> + // CHECK: [[vec1:%.*]] = llvm.insertelement [[val7]], {{.*}} : vector<4xi8> + + // CHECK: [[bvec0:%.*]] = llvm.bitcast [[vec0]] + // CHECK: [[c2:%.*]] = llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr32]], [[c2]] + // CHECK: [[bShflVec0:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec0]] + // CHECK: [[shflVec0:%.*]] = llvm.bitcast [[bShflVec0]] + + // CHECK: [[bvec1:%.*]] = llvm.bitcast [[vec1]] + // CHECK: [[c2:%.*]] = llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr32]], [[c2]] + // CHECK: 
[[bShflVec1:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec1]] + // CHECK: [[shflVec1:%.*]] = llvm.bitcast [[bShflVec1]] + + // Input (8 values): (vec0, vec1) + // Output (8 values shuffled, '>> n' - take the value from (lane + n) % 64): + // resVec0 resVec1 + // lanes 0-31: (vec0 , vec0 >> 32) (mask0=1) + // lanes 32-63: (vec1 >> 32, vec1 ) (mask0=0) + + // CHECK: [[resVec0:%.*]] = llvm.select [[mask0]], [[vec0]], [[shflVec1]] + // CHECK: [[resVec1:%.*]] = llvm.select [[mask0]], [[shflVec0]], [[vec1]] + + // CHECK: [[c3:%.*]] = llvm.mlir.constant(3 : i32) + // CHECK: [[resVal3:%.*]] = llvm.extractelement [[resVec0]][[[c3]] : i32] : vector<4xi8> + // CHECK: [[c3:%.*]] = llvm.mlir.constant(3 : i32) : i32 + // CHECK: [[resVal7:%.*]] = llvm.extractelement [[resVec1]][[[c3]] : i32] : vector<4xi8> + + // CHECK: llvm.insertvalue [[resVal3]], {{.*}}[3] + // CHECK: llvm.insertvalue [[resVal7]], {{.*}}[7] + + // CHECK: llvm.return + %0 = triton_gpu.convert_layout %arg0 : tensor<128x32xf8E4M3FNUZ, #mfma> -> tensor<128x32xf8E4M3FNUZ, #dotop0> + tt.return + } +} + +// ----- + +#mfma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}> +#dotop0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=8}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: mfma_dot_cvt_bf8_mfma32 + tt.func public @mfma_dot_cvt_bf8_mfma32(%arg0: tensor<128x32xf8E5M2, #mfma>) { + // CHECK-NOT: store + // CHECK-NOT: load + // CHECK: rocdl.ds_bpermute + // CHECK: llvm.return + %0 = triton_gpu.convert_layout %arg0 : tensor<128x32xf8E5M2, #mfma> -> tensor<128x32xf8E5M2, #dotop0> + tt.return + } +} + +// ----- + +#mfma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}> +#dotop0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=8}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: mfma_dot_cvt_f8_mfma16 + tt.func public @mfma_dot_cvt_f8_mfma16(%arg0: tensor<128x32xf8E4M3FNUZ, #mfma>) { + // CHECK-NOT: store + // CHECK-NOT: load + + // CHECK: [[val3:%.*]] = llvm.extractvalue %arg0[3] + // CHECK: [[val7:%.*]] = llvm.extractvalue %arg0[7] + + // CHECK-DAG: [[c16:%.*]] = llvm.mlir.constant(16 : i32) + // CHECK-DAG: [[c32:%.*]] = llvm.mlir.constant(32 : i32) + // CHECK-DAG: [[c48:%.*]] = llvm.mlir.constant(48 : i32) + // CHECK-DAG: [[c64:%.*]] = llvm.mlir.constant(64 : i32) + + // CHECK: [[threadId:%.*]] = rocdl.workitem.id.x + // CHECK: [[laneId:%.*]] = llvm.urem [[threadId]], [[c64]] + // CHECK: [[mask0:%.*]] = llvm.icmp "slt" [[laneId]], [[c32]] + + // CHECK: [[laneIdRem:%.*]] = llvm.urem [[laneId]], [[c32]] + // CHECK: [[mask1:%.*]] = llvm.icmp "slt" [[laneIdRem]], [[c16]] + + // CHECK: [[shflLaneId:%.*]] = llvm.add [[laneId]], [[c16]] + // CHECK: [[addr16:%.*]] = llvm.urem [[shflLaneId]], [[c64]] + + // CHECK: [[shflLaneId:%.*]] = llvm.add [[laneId]], [[c32]] + // CHECK: [[addr32:%.*]] = llvm.urem [[shflLaneId]], [[c64]] + + // CHECK: [[shflLaneId:%.*]] = llvm.add [[laneId]], [[c48]] + // CHECK: [[addr48:%.*]] = llvm.urem [[shflLaneId]], [[c64]] + + // CHECK: [[vec0:%.*]] = llvm.insertelement [[val3]], {{.*}} : vector<4xi8> + // CHECK: [[vec1:%.*]] = llvm.insertelement [[val7]], {{.*}} : vector<4xi8> + + // CHECK: [[bvec0:%.*]] = llvm.bitcast [[vec0]] + // CHECK: [[c2:%.*]] = 
llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr16]], [[c2]] + // CHECK: [[bShflVec0_16:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec0]] + // CHECK: [[shflVec0_16:%.*]] = llvm.bitcast [[bShflVec0_16]] + + // CHECK: [[bvec0:%.*]] = llvm.bitcast [[vec0]] + // CHECK: [[c2:%.*]] = llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr32]], [[c2]] + // CHECK: [[bShflVec0_32:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec0]] + // CHECK: [[shflVec0_32:%.*]] = llvm.bitcast [[bShflVec0_32]] + + // CHECK: [[bvec1:%.*]] = llvm.bitcast [[vec1]] + // CHECK: [[c2:%.*]] = llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr32]], [[c2]] + // CHECK: [[bShflVec1_32:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec1]] + // CHECK: [[shflVec1_32:%.*]] = llvm.bitcast [[bShflVec1_32]] + + // CHECK: [[bvec1:%.*]] = llvm.bitcast [[vec1]] + // CHECK: [[c2:%.*]] = llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr48]], [[c2]] + // CHECK: [[bShflVec1_48:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec1]] + // CHECK: [[shflVec1_48:%.*]] = llvm.bitcast [[bShflVec1_48]] + + // Input (8 values): (vec0, vec1) + // Output (8 values shuffled, '>> n' - take the value from (lane + n) % 64): + // resVec0 resVec1 + // lanes 0-15: (vec0 , vec0 >> 16) (mask0=1, mask1=1) + // lanes 16-31: (vec0 >> 16, vec0 >> 32) (mask0=1, mask1=0) + // lanes 32-47: (vec1 >> 32, vec1 >> 48) (mask0=0, mask1=1) + // lanes 48-63: (vec1 >> 48, vec1 ) (mask0=0, mask1=0) + + // CHECK-DAG: [[mask0_true:%.*]] = llvm.select [[mask1]], [[vec0]], [[shflVec0_16]] : i1, vector<4xi8> + // CHECK-DAG: [[mask0_false:%.*]] = llvm.select [[mask1]], [[shflVec1_32]], [[shflVec1_48]] : i1, vector<4xi8> + // CHECK: [[resVec0:%.*]] = llvm.select [[mask0]], [[mask0_true]], [[mask0_false]] : i1, vector<4xi8> + + // CHECK-DAG: [[mask0_true:%.*]] = llvm.select [[mask1]], [[shflVec0_16]], [[shflVec0_32]] : i1, vector<4xi8> + // CHECK-DAG: [[mask0_false:%.*]] = llvm.select [[mask1]], [[shflVec1_48]], [[vec1]] : i1, vector<4xi8> + // CHECK: [[resVec1:%.*]] = llvm.select [[mask0]], [[mask0_true]], [[mask0_false]] : i1, vector<4xi8> + + // CHECK: [[c3:%.*]] = llvm.mlir.constant(3 : i32) + // CHECK: [[resVal3:%.*]] = llvm.extractelement [[resVec0]][[[c3]] : i32] : vector<4xi8> + // CHECK: [[c3:%.*]] = llvm.mlir.constant(3 : i32) : i32 + // CHECK: [[resVal7:%.*]] = llvm.extractelement [[resVec1]][[[c3]] : i32] : vector<4xi8> + + // CHECK: llvm.insertvalue [[resVal3]], {{.*}}[3] + // CHECK: llvm.insertvalue [[resVal7]], {{.*}}[7] + + // CHECK: llvm.return + %0 = triton_gpu.convert_layout %arg0 : tensor<128x32xf8E4M3FNUZ, #mfma> -> tensor<128x32xf8E4M3FNUZ, #dotop0> + tt.return + } +} + +// ----- + +#mfma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}> +#dotop0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=8}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: mfma_dot_cvt_bf8_mfma16 + tt.func public @mfma_dot_cvt_bf8_mfma16(%arg0: tensor<128x32xf8E5M2, #mfma>) { + // CHECK-NOT: store + // CHECK-NOT: load + // CHECK: rocdl.ds_bpermute + // CHECK: llvm.return + %0 = triton_gpu.convert_layout %arg0 : tensor<128x32xf8E5M2, #mfma> -> tensor<128x32xf8E5M2, #dotop0> + tt.return + } +} diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp index 
208483beb8..3b61fb8cc4 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -116,6 +116,158 @@ struct LocalLoadOpConversion } }; +struct ConvertLayoutOpMFMAToDotOpConversion + : public ConvertOpToLLVMPattern { +public: + explicit ConvertLayoutOpMFMAToDotOpConversion( + LLVMTypeConverter &typeConverter, const TargetInfoBase &targetInfo, + PatternBenefit benefit) + : ConvertOpToLLVMPattern(typeConverter, + benefit), + targetInfo(targetInfo) {} + + LogicalResult + matchAndRewrite(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto srcType = cast(op.getSrc().getType()); + auto dstType = cast(op.getType()); + + if (!matchMFMAAndDotOperandShuffleCase(srcType, dstType)) + return failure(); + + auto loc = op.getLoc(); + + SmallVector inVals = + unpackLLElements(loc, adaptor.getSrc(), rewriter); + if (inVals.empty() || inVals.size() % 8 != 0) + return failure(); + + auto mfmaLayout = dyn_cast(srcType.getEncoding()); + assert((mfmaLayout.getMDim() == 16 || mfmaLayout.getMDim() == 32) && + "Expected MFMA size 16 or 32"); + assert(triton::gpu::getWarpSize(mfmaLayout) == 64 && + "Expected warp size 64 for MFMA"); + + auto elemTy = int_ty(8); + auto vecTy = vec_ty(elemTy, 4); + + Value c16 = i32_val(16); + Value c32 = i32_val(32); + Value c48 = i32_val(48); + Value c64 = i32_val(64); + + Value threadId = tid_val(); + Value laneId = urem(threadId, c64); + + Value mask0 = icmp_slt(laneId, c32); + Value mask1 = icmp_slt(urem(laneId, c32), c16); + + Value addrShift16 = urem(add(laneId, c16), c64); + Value addrShift32 = urem(add(laneId, c32), c64); + Value addrShift48 = urem(add(laneId, c48), c64); + + SmallVector outVals; + for (size_t startIdx = 0; startIdx < inVals.size(); startIdx += 8) { + Value vec0 = undef(vecTy); + for (size_t vIdx = 0; vIdx < 4; ++vIdx) { + vec0 = + insert_element(vecTy, vec0, inVals[startIdx + vIdx], i32_val(vIdx)); + } + Value vec1 = undef(vecTy); + for (size_t vIdx = 0; vIdx < 4; ++vIdx) { + vec1 = insert_element(vecTy, vec1, inVals[startIdx + vIdx + 4], + i32_val(vIdx)); + } + + Value resVec0, resVec1; + if (mfmaLayout.getMDim() == 32) { + /* + Using wave shuffle to convert layouts (32x32x16 case): + 1) Input MMA layout (32x32, fp8, 16 values): + _____________________________________________________________ + |(t0 v0 v1 v2 v3) (t32 v0 v1 v2 v3) ... (t32 v12 v13 v14 v15)| + | ... ... | + |(t31 v0 v1 v2 v3) (t63 v0 v1 v2 v3) ... (t63 v12 v13 v14 v15)| + |_____________________________________________________________| + + 2) Output Dot operand layout (two 32x16 tiles, fp8, 8 values each): + ____________________________________________________________ ___ + |(t0 v0 v1 v2 v3 v4 v5 v6 v7) (t32 v0 v1 v2 v3 v4 v5 v6 v7) || + | ... ... ||... 
+ |(t31 v0 v1 v2 v3 v4 v5 v6 v7) (t63 v0 v1 v2 v3 v4 v5 v6 v7) || + |____________________________________________________________||___ + */ + + Value shflVec0 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec0, int_ty(32)), addrShift32), + vecTy); + Value shflVec1 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec1, int_ty(32)), addrShift32), + vecTy); + + resVec0 = select(mask0, vec0, shflVec1); + resVec1 = select(mask0, shflVec0, vec1); + } else if (mfmaLayout.getMDim() == 16) { + /* + 16x16x32 case: + 1) Input MMA layout (two 16x16, fp8, 4 values each): + _________________________________________________________ ___________ + |(t0 v0 v1 v2 v3) (t16 v0 v1 v2 v3) ... (t48 v0 v1 v2 v3)||(t0 v4 ... + | ... ... || ... + |(t15 v0 v1 v2 v3) (t31 v0 v1 v2 v3) ... (t63 v0 v1 v2 v3)||(t15 v4 ... + |_________________________________________________________||___________ + + 2) Output Dot operand layout (16x32 tile, fp8, 8 values): + ________________________________________________________________ + |(t0 v0 v1 v2 v3 v4 v5 v6 v7) ... (t48 v0 v1 v2 v3 v4 v5 v6 v7) | + | ... ... | + |(t15 v0 v1 v2 v3 v4 v5 v6 v7) ... (t63 v0 v1 v2 v3 v4 v5 v6 v7) | + |________________________________________________________________| + */ + + Value shflVec0_16 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec0, int_ty(32)), addrShift16), + vecTy); + Value shflVec0_32 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec0, int_ty(32)), addrShift32), + vecTy); + Value shflVec1_32 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec1, int_ty(32)), addrShift32), + vecTy); + Value shflVec1_48 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec1, int_ty(32)), addrShift48), + vecTy); + + resVec0 = select(mask0, select(mask1, vec0, shflVec0_16), + select(mask1, shflVec1_32, shflVec1_48)); + resVec1 = select(mask0, select(mask1, shflVec0_16, shflVec0_32), + select(mask1, shflVec1_48, vec1)); + } + + for (size_t vIdx = 0; vIdx < 4; ++vIdx) { + outVals.push_back(extract_element(elemTy, resVec0, i32_val(vIdx))); + } + for (size_t vIdx = 0; vIdx < 4; ++vIdx) { + outVals.push_back(extract_element(elemTy, resVec1, i32_val(vIdx))); + } + } + + Value result = packLLElements(loc, getTypeConverter(), outVals, rewriter, + op.getType()); + rewriter.replaceOp(op, result); + return success(); + } + +protected: + const TargetInfoBase &targetInfo; +}; + } // namespace namespace mlir::triton::AMD { @@ -124,5 +276,7 @@ void populateConvertLayoutOpToLLVMPatterns( RewritePatternSet &patterns, int numWarps, ModuleAxisInfoAnalysis &axisInfoAnalysis, PatternBenefit benefit) { patterns.add(typeConverter, benefit); + patterns.add(typeConverter, targetInfo, + benefit); } } // namespace mlir::triton::AMD From 433037206d8870f0b82a3cd669097001084a29ed Mon Sep 17 00:00:00 2001 From: Mario Lezcano Casado <3291265+lezcano@users.noreply.github.com> Date: Fri, 22 Nov 2024 11:44:34 +0000 Subject: [PATCH 07/12] [LAYOUTS] [BE] Simplify Ampere/Hopper paths introduced in #5189 (#5200) We simplify the implementation of `getElemsPerThread` and strengthen the preconditions of `getRepForOperand`. More generally, we should try to minimise the calls to `isAmpere` and `isHopper` throughout the codebase. I'll do a pass fixing many of these once we land LLs for `ldmatrix` and Hopper. 
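
For a concrete reading of the strengthened precondition (the assert text is taken from the change below; the surrounding helper is an illustration only):

```
#include <cassert>

// 32 / bitwidth is the number of elements that fit in one 32-bit register, so
// bf16/f16 operands (bitwidth = 16) need kWidth >= 2 and fp8 operands
// (bitwidth = 8) need kWidth >= 4; callers are now expected to guarantee this.
static void checkRepForOperandPrecondition(int bitwidth, int kWidth) {
  assert(kWidth >= 32 / bitwidth &&
         "kWidth must be >= 32 / bitwidth for this function to be well-defined");
}
```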
--- lib/Dialect/TritonGPU/IR/Dialect.cpp | 79 ++++++++++++++-------------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index 3273154fc7..2b1657652e 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -1038,23 +1038,18 @@ DotOperandEncodingAttr::getElemsPerThread(ArrayRef shape, elemsPerThread[rank - 1] = (idx == 0) ? rep[2] * kWidth : rep[2]; return elemsPerThread; } else if (auto mma = mlir::dyn_cast(parent)) { - if (mma.isAmpere() || mma.isHopper()) { - auto bitwidth = getPointeeType(eltTy).getIntOrFloatBitWidth(); - auto rep = mma.getRepForOperand(shape, bitwidth, kWidth, idx); - auto sizePerThread = getSizePerThread(); - auto elemsPerKRep = mma.isHopper() ? (kWidth * 2) : (32 / bitwidth * 2); - if (rank == 3) - elemsPerThread[0] = rep[0]; - elemsPerThread[rank - 2] = - (idx == 0) - ? rep[1] * sizePerThread[rank - 2] - : std::max(rep[1] * elemsPerKRep, sizePerThread[rank - 2]); - elemsPerThread[rank - 1] = - (idx == 0) - ? std::max(rep[2] * elemsPerKRep, sizePerThread[rank - 1]) - : rep[2] * sizePerThread[rank - 1]; - return elemsPerThread; + assert(getCTALayout(*this) == + CTALayoutAttr::getDefault(getContext(), rank) && + "NYI"); + auto sizePerThread = getSizePerThread(); + auto threadsPerWarp = getThreadsPerWarp(); + auto warpsPerCTA = getWarpsPerCTA(); + SmallVector regs; + for (auto [n, nsize, nThread, nWarp] : + llvm::zip(shape, sizePerThread, threadsPerWarp, warpsPerCTA)) { + regs.push_back(std::max(nsize, n / (nThread * nWarp))); } + return regs; } llvm_unreachable("getElemsPerThread is not supported for dot operand"); @@ -2341,35 +2336,41 @@ NvidiaMmaEncodingAttr::getRepOrderForOperand(int opIdx) const { SmallVector NvidiaMmaEncodingAttr::getRepForOperand(ArrayRef shape, int bitwidth, int kWidth, int opIdx) const { + assert( + kWidth >= 32 / bitwidth && + "kWidth must be >= 32 / bitwidth for this function to be well-defined"); auto rank = shape.size(); + // Broadcast long K auto warpsPerCTA = getWarpsPerCTA(); + auto kDim = opIdx == 0 ? rank - 1 : rank - 2; + warpsPerCTA[kDim] = 1; - // {batch, m, n, k} - // Hopper path never uses the n value, since this method is only invoked - // for in-RF (dotOpEnc) operands, but WGMMA only supports in A to be in RF - // TODO: rep per operand is not accurate for Hopper. It is currently done that - // way to allow us to get the correct total number of elements. this will be - // fixed when moving to linear layout. - SmallVector shapePerWarp = { - 1, 16, 8, isHopper() ? 4 * 2 * kWidth : 4 * 64 / bitwidth}; - int numRepBatch = - rank == 3 - ? 
std::max(1, shape[0] / (shapePerWarp[0] * warpsPerCTA[0])) - : 1; - + SmallVector tileSize; + if (rank == 3) { + tileSize.push_back(1); + } if (opIdx == 0) { - return {numRepBatch, - std::max(1, /*repM=*/shape[rank - 2] / - (shapePerWarp[1] * warpsPerCTA[rank - 2])), - std::max(1, /*repK=*/shape[rank - 1] / shapePerWarp[3])}; + // m x k + tileSize.push_back(16); + tileSize.push_back(4 * 64 / bitwidth); } else { - assert(opIdx == 1); - return { - numRepBatch, - std::max(1, /*repK=*/shape[rank - 2] / shapePerWarp[3]), - std::max(1, /*repN=*/shape[rank - 1] / - (shapePerWarp[2] * warpsPerCTA[rank - 1]))}; + // k x n + // Hopper path never uses the n value, since this method is only invoked + // for in-RF (dotOpEnc) operands, but WGMMA only supports in A to be in RF + // so it's fine if the n is incorrect here + tileSize.push_back(4 * 64 / bitwidth); + tileSize.push_back(8); + } + + SmallVector numRep; + // Lezcano: This is odd. Why do we always return a vector of size 3? + if (rank != 3) { + numRep.push_back(1); + } + for (auto [s, size, warp] : llvm::zip(shape, tileSize, warpsPerCTA)) { + numRep.push_back(std::max(1, s / (size * warp))); } + return numRep; } SmallVector From e5588383b9efc5bea8b0e8e42a8a9720c3c724c8 Mon Sep 17 00:00:00 2001 From: Keren Zhou Date: Fri, 22 Nov 2024 07:26:31 -0800 Subject: [PATCH 08/12] [BACKEND] Use LL to simplify redundant elements check and fix related issues (#5225) --- python/test/unit/language/test_core.py | 21 +-- .../LoadStoreOpToLLVM.cpp | 153 ++++++++---------- 2 files changed, 70 insertions(+), 104 deletions(-) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 23c598ee16..514ac171a3 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -5436,21 +5436,11 @@ def test_convertmma2mma(M, N, mma_pair, dtype, device, tmp_path: pathlib.Path): pytest.skip("Skip testing MMAv3 on devices with CC < 9") num_warps = np.cumprod(src_layout.warps_per_cta)[-1] - # TODO(Keren): Remove the intermediate layout once we have resolved the redundantDataMask issue for WGMMA - warps_per_cta = src_layout.warps_per_cta - interm = BlockedLayout([1, 4], [4, THREADS_PER_WARP // 4], [warps_per_cta[0], warps_per_cta[1]], [0, 1], [1, 1], - [1, 1], [0, 1]) def do_test(src_layout, dst_layout): layouts = f""" #src = {src_layout} #dst = {dst_layout} - #interm = {interm} - """ - - conversion = f""" - %12 = triton_gpu.convert_layout %9 : tensor<{M}x{N}xi32, #src> -> tensor<{M}x{N}xi32, #dst> - %13 = triton_gpu.convert_layout %11 : tensor<{M}x{N}xf16, #src> -> tensor<{M}x{N}xf16, #dst> """ ir = layouts + f""" @@ -5460,6 +5450,7 @@ def do_test(src_layout, dst_layout): %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> %1 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>> %2 = tt.splat %arg0 : !tt.ptr -> tensor<{M}x{N}x!tt.ptr, #src> + %3 = tt.splat %arg1 : !tt.ptr -> tensor<{M}x{N}x!tt.ptr, #dst> %4 = tt.expand_dims %0 {{axis = 1 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> -> tensor<{M}x1xi32, #src> %5 = arith.muli %4, %cst : tensor<{M}x1xi32, #src> %6 = tt.expand_dims %1 {{axis = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>> -> tensor<1x{N}xi32, #src> @@ -5468,12 +5459,10 @@ def do_test(src_layout, dst_layout): %9 = arith.addi %8, %7 : tensor<{M}x{N}xi32, #src> %10 = tt.addptr %2, %9 : 
tensor<{M}x{N}x!tt.ptr, #src>, tensor<{M}x{N}xi32, #src> %11 = tt.load %10 : tensor<{M}x{N}x!tt.ptr, #src> - %3 = tt.splat %arg1 : !tt.ptr -> tensor<{M}x{N}x!tt.ptr, #interm> - """ + conversion + f""" - %15 = triton_gpu.convert_layout %12 : tensor<{M}x{N}xi32, #dst> -> tensor<{M}x{N}xi32, #interm> - %16 = triton_gpu.convert_layout %13 : tensor<{M}x{N}xf16, #dst> -> tensor<{M}x{N}xf16, #interm> - %17 = tt.addptr %3, %15 : tensor<{M}x{N}x!tt.ptr, #interm>, tensor<{M}x{N}xi32, #interm> - tt.store %17, %16 : tensor<{M}x{N}x!tt.ptr, #interm> + %12 = triton_gpu.convert_layout %9 : tensor<{M}x{N}xi32, #src> -> tensor<{M}x{N}xi32, #dst> + %13 = triton_gpu.convert_layout %11 : tensor<{M}x{N}xf16, #src> -> tensor<{M}x{N}xf16, #dst> + %14 = tt.addptr %3, %12 : tensor<{M}x{N}x!tt.ptr, #dst>, tensor<{M}x{N}xi32, #dst> + tt.store %14, %13 : tensor<{M}x{N}x!tt.ptr, #dst> tt.return }} }} diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp index cc52507121..d2cef405eb 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -8,6 +8,7 @@ #include "Utility.h" #include "triton/Conversion/TritonGPUToLLVM/Utility.h" #include "triton/Dialect/Triton/IR/Dialect.h" +#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" using namespace mlir; @@ -24,87 +25,57 @@ using ::mlir::triton::gpu::SharedEncodingAttr; namespace { // Return the mask for the unique data accessed by given tensor type. -// Used to mask out the redundant data accessed by threads. -Value redundantDataMask(Type valueTy, ConversionPatternRewriter &rewriter, - Location loc, const NVIDIA::TargetInfo &targetInfo) { +// NOTE: Redundant memory load is allowed in triton, but redundant memory store +// is not allowed. 
+// mask = true: thread can write +// mask = false: thread should not write +Value getRedundantDataMask(ModuleOp moduleOp, Type valueTy, + ConversionPatternRewriter &rewriter, Location loc, + int regIdx, const NVIDIA::TargetInfo &targetInfo) { + auto ctx = moduleOp.getContext(); auto tensorTy = dyn_cast(valueTy); - Value mask = int_val(1, 1); + auto numCTAs = triton::gpu::TritonGPUDialect::getNumCTAs(moduleOp); auto tid = tid_val(); - auto clusterCTAId = targetInfo.getClusterCTAId(rewriter, loc); + auto mask = true_val(); + auto kReg = str_attr("register"); + auto kLane = str_attr("lane"); + auto kWarp = str_attr("warp"); + auto kBlock = str_attr("block"); if (tensorTy) { - auto layout = tensorTy.getEncoding(); auto shape = tensorTy.getShape(); - unsigned rank = shape.size(); - auto sizePerThread = triton::gpu::getSizePerThread(layout); - auto threadsPerWarp = triton::gpu::getThreadsPerWarp(layout); - auto warpsPerCTA = triton::gpu::getWarpsPerCTA(layout); - auto threadOrder = triton::gpu::getThreadOrder(layout); - SmallVector warpOrder(rank); - if (auto enc = dyn_cast(layout)) { - warpOrder = - triton::gpu::getMatrixOrder(rank, /*rowMajor=*/enc.getOpIdx() == 1); + auto layout = tensorTy.getEncoding(); + auto ll = triton::gpu::toLinearLayout(shape, layout); + assert(ll.has_value() && "Failed to convert layout to linear layout"); + auto freeVariableMasks = ll->getFreeVariableMasks(); + auto regMasks = freeVariableMasks[kReg]; + if (regMasks & regIdx) { + // Step 1: check register redundancy + mask = false_val(); } else { - warpOrder = triton::gpu::getWarpOrder(layout); - } - auto shapePerCTATile = triton::gpu::getShapePerCTATile(layout); - Value warpSize = i32_val(32); - Value laneId = urem(tid, warpSize); - Value warpId = udiv(tid, warpSize); - // TODO: [DOT LL] - // The delinearize function is not entirely correct for certain layouts, - // such as wgmma. The correct approach is to convert a legacy layout to its - // corresponding linear layout and use the linear layout's - // getFreeVariableMasks to identify redundant elements. - SmallVector multiDimWarpId = - delinearize(rewriter, loc, warpId, warpsPerCTA, warpOrder); - SmallVector multiDimThreadId = - delinearize(rewriter, loc, laneId, threadsPerWarp, threadOrder); - for (unsigned dim = 0; dim < rank; ++dim) { - // if there is no data replication across threads on this dimension - if (shape[dim] >= shapePerCTATile[dim]) - continue; - // Otherwise, we need to mask threads that will replicate data on this - // dimension. 
Calculate the thread index on this dimension for the CTA - Value threadDim = - add(mul(multiDimWarpId[dim], i32_val(threadsPerWarp[dim])), - multiDimThreadId[dim]); - mask = and_(mask, icmp_slt(mul(threadDim, i32_val(sizePerThread[dim])), - i32_val(shape[dim]))); - } - // Do not write duplicated data when multicast is enabled - if (triton::gpu::getNumCTAs(layout) > 1) { - auto _0 = i32_val(0); - auto CTAsPerCGA = triton::gpu::getCTAsPerCGA(layout); - auto CTASplitNum = triton::gpu::getCTASplitNum(layout); - auto CTAOrder = triton::gpu::getCTAOrder(layout); - - auto multiDimClusterCTAId = - delinearize(rewriter, loc, clusterCTAId, CTAsPerCGA, CTAOrder); - - for (unsigned dim = 0; dim < rank; ++dim) { - // Skip when multicast is not enabled in this dimension - if (CTAsPerCGA[dim] == CTASplitNum[dim]) - continue; - // This wrapping rule must be consistent with emitCTAOffsetForLayout - unsigned splitNum = std::min(shape[dim], CTASplitNum[dim]); - Value repId = udiv(multiDimClusterCTAId[dim], i32_val(splitNum)); - // Consider the example where CTAsPerCGA = [4] and CTASplitNum = [2]: - // CTA0 and CTA2 holds data of block0, - // CTA1 and CTA3 holds data of block1. - // Only CTA0 and CTA1 are expected to write while CTA2 and CTA3 should - // be masked. We add the following mask: - // multiDimClusterCTAId[dim] / splitNum == 0 - // Actually in all existing cases of multicast, splitNum is always 1. - // The mask is equivalent to: - // multiDimClusterCTAId[dim] == 0 - mask = and_(mask, icmp_eq(repId, _0)); + Value warpSize = + i32_val(triton::gpu::TritonGPUDialect::getThreadsPerWarp(moduleOp)); + Value laneId = urem(tid, warpSize); + Value warpId = udiv(tid, warpSize); + // Step 2: check lane and warp redundancy + auto laneMasks = freeVariableMasks[kLane]; + auto warpMasks = freeVariableMasks[kWarp]; + mask = and_(mask, icmp_eq(and_(i32_val(laneMasks), laneId), i32_val(0))); + mask = and_(mask, icmp_eq(and_(i32_val(warpMasks), warpId), i32_val(0))); + if (numCTAs > 1) { + // Step 3: check block redundancy + auto ctaId = targetInfo.getClusterCTAId(rewriter, loc); + auto ctaMasks = freeVariableMasks[kBlock]; + mask = and_(mask, icmp_eq(and_(i32_val(ctaMasks), ctaId), i32_val(0))); } } } else { - // If the tensor is not ranked, then it is a scalar and only thread 0 of - // CTA0 can write - mask = and_(mask, icmp_eq(clusterCTAId, i32_val(0))); mask = and_(mask, icmp_eq(tid, i32_val(0))); + if (numCTAs > 1) { + auto ctaId = targetInfo.getClusterCTAId(rewriter, loc); + // If the tensor is not ranked, then it is a scalar and only thread 0 of + // CTA0 within the cluster can write + mask = and_(mask, icmp_eq(ctaId, i32_val(0))); + } } return mask; } @@ -264,7 +235,7 @@ struct LoadOpConversion : public ConvertOpToLLVMPattern, PTXBuilder ptxBuilder; - Value pred = mask ? maskElems[vecStart] : int_val(1, 1); + Value pred = mask ? maskElems[vecStart] : true_val(); const std::string readConstraint = (width == 64) ? "l" : ((width == 32) ? 
"r" : "c"); @@ -437,7 +408,7 @@ struct StoreOpConversion : public ConvertOpToLLVMPattern, << mask << "\n"; } - Value mask = redundantDataMask(valueTy, rewriter, loc, targetInfo); + auto moduleOp = op->getParentOfType(); const size_t dtsize = std::max(1, valueElemTy.getIntOrFloatBitWidth() / 8); const size_t valueElemNBits = dtsize * 8; @@ -485,6 +456,8 @@ struct StoreOpConversion : public ConvertOpToLLVMPattern, PTXBuilder ptxBuilder; auto *asmArgList = ptxBuilder.newListOperand(asmArgs); + Value mask = getRedundantDataMask(moduleOp, valueTy, rewriter, loc, + vecStart, targetInfo); Value maskVal = llMask ? and_(mask, maskElems[vecStart]) : mask; auto *asmAddr = @@ -577,7 +550,6 @@ struct AtomicCASOpConversion << " origin vec = " << vecOrig << " elemsPerThread = " << elemsPerThread << "\n"; - Value mask = redundantDataMask(valueTy, rewriter, loc, targetInfo); auto vecTy = vec_ty(valueElemTy, vec); SmallVector resultVals(elemsPerThread); @@ -607,6 +579,8 @@ struct AtomicCASOpConversion os << op.getSem(); auto scope = stringifyMemSyncScope(op.getScope()).str(); atom.global().o(semStr).o(scope).o("cas").o(sTy); + Value mask = + getRedundantDataMask(moduleOp, valueTy, rewriter, loc, i, targetInfo); atom(dstOpr, ptrOpr, cmpOpr, valOpr).predicate(mask); if (tensorTy) { @@ -736,12 +710,12 @@ struct AtomicRMWOpConversion << " packed = " << packed << " origin vec = " << vecOrig << " numElems = " << numElems; - Value mask = redundantDataMask(valueTy, rewriter, loc, targetInfo); - auto packedTy = vec_ty(valueElemTy, packed); SmallVector resultVals(elemsPerThread); for (size_t i = 0; i < elemsPerThread; i += vec * packed) { Value rmwPtr = ptrElements[i]; + Value mask = + getRedundantDataMask(moduleOp, valueTy, rewriter, loc, i, targetInfo); Value rmwMask = llMask ? and_(mask, maskElements[i]) : mask; std::string sTy; PTXBuilder ptxBuilderAtomicRMW; @@ -976,6 +950,7 @@ struct AsyncCopyGlobalToLocalOpConversion << vecBytes << " bytes"; } + auto moduleOp = op->getParentOfType(); for (int i = 0; i < shmemAddrs.size(); i++) { // It's possible that vecTy is larger than 128 bits, in which case we have // to use multiple cp.async instructions. @@ -1003,24 +978,26 @@ struct AsyncCopyGlobalToLocalOpConversion // if there's any mask. cp.async will automatically fill the // remaining slots with 0 if cp-size > src-size. // XXX(Keren): Always assume other = 0 for now. + // When 'other != 0' is supported, we will need to fold the + // op.getMask() and redundantDataMask() into the same predicate, the + // way it is done for LoadOp. auto selectOp = select(maskElems[elemIdx], i32_val(wordBytes), i32_val(0)); srcSize = ptxBuilder.newOperand(selectOp, "r"); } - // When 'other != 0' is supported, we will need to fold the op.getMask() - // and redundantDataMask() into the same predicate, the way it is done - // for LoadOp. - Value maskVal = redundantDataMask(srcTy, rewriter, loc, targetInfo); - - // TODO: Masking does not work for CTA multicast with cp.async. This is - // a quick and dirty workaround to avoid the issue. bool skipMaskForMultiCTA = triton::gpu::getNumCTAs(srcLayout) > 1; - if (!skipMaskForMultiCTA) { - copyAsyncOp(dstOperand, srcOperand, copySize, srcSize) - .predicate(maskVal); - } else { + if (skipMaskForMultiCTA) { + // TODO: Masking does not work for CTA multicast with cp.async. + // XXX(@peterbell10): In the multi-CTA mode, the redundant data might + // be on different CTAs which don't share the same smem address space, + // so we might need to load the same data multiple times. 
copyAsyncOp(dstOperand, srcOperand, copySize, srcSize); + } else { + Value mask = getRedundantDataMask(moduleOp, srcTy, rewriter, loc, + elemIdx, targetInfo); + copyAsyncOp(dstOperand, srcOperand, copySize, srcSize) + .predicate(mask); } ptxBuilder.launch(rewriter, loc, void_ty(getContext())); } From bede39f56ff3f623f818849fde6f99ceb706eb83 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Fri, 22 Nov 2024 10:34:14 -0500 Subject: [PATCH 09/12] Make TMA tests compatible with older CUDA toolchains (#5221) TMA fences require CUDA toolchain 12.3 or greater, but current gating does not check the CUDA toolchain version. This causes `test_experimental_tma.py` to fail when run with older CUDA toolchains. ## Before With cuda-12.0: ``` 55 failed, 9 passed in 18.11s ``` With cuda-12.4: ``` 64 passed in 11.99s ``` ## After With cuda-12.0: ``` 9 passed, 55 skipped in 4.26s ``` With cuda-12.4: ``` 64 passed in 11.96s ``` --- .../test/unit/hopper/test_experimental_tma.py | 10 +++++++--- python/triton/_internal_testing.py | 18 +++++++++++++++--- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/python/test/unit/hopper/test_experimental_tma.py b/python/test/unit/hopper/test_experimental_tma.py index 7062093aef..23065953d6 100644 --- a/python/test/unit/hopper/test_experimental_tma.py +++ b/python/test/unit/hopper/test_experimental_tma.py @@ -4,7 +4,7 @@ import triton import triton.language as tl from triton.tools.experimental_descriptor import (create_1d_tma_descriptor, create_2d_tma_descriptor) -from triton._internal_testing import dtypes_with_bfloat16, numpy_random, to_triton, requires_tma +from triton._internal_testing import dtypes_with_bfloat16, numpy_random, to_triton, requires_tma, supports_tma, tma_skip_msg from typing import Optional @@ -29,9 +29,11 @@ def unwrap_tensor(t: torch.Tensor | triton.runtime.jit.TensorWrapper): tma_dtypes = sorted(set(dtypes_with_bfloat16) - {"int64", "uint64", "float64"}) -@requires_tma @pytest.mark.parametrize("byval_tma", [True, False]) def test_experimetal_descriptor_load(byval_tma): + if not supports_tma(byval_tma): + pytest.skip(tma_skip_msg(byval_tma)) + device = "cuda" SIZE = 128 @@ -82,11 +84,13 @@ def matmul_kernel_tma(a_desc_ptr, b_desc_ptr, c_desc_ptr, # tl._experimental_descriptor_store(c_desc_ptr, accumulator, [offs_am, offs_bn]) -@requires_tma @pytest.mark.parametrize("num_stages", [1, 4]) @pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(32, 32, 32), (128, 64, 64), (128, 128, 64), (128, 256, 64)]) @pytest.mark.parametrize("byval_tma", [True, False]) def test_experimental_tma_matmul(num_stages, BLOCK_M, BLOCK_N, BLOCK_K, byval_tma): + if not supports_tma(byval_tma): + pytest.skip(tma_skip_msg(byval_tma)) + device = "cuda" M, N, K = 8192, 8192, 1024 torch.manual_seed(42) diff --git a/python/triton/_internal_testing.py b/python/triton/_internal_testing.py index fa5df4f865..5ba0be1e34 100644 --- a/python/triton/_internal_testing.py +++ b/python/triton/_internal_testing.py @@ -4,6 +4,7 @@ import torch import triton import triton.language as tl +from triton.backends.nvidia.compiler import _path_to_binary import pytest from numpy.random import RandomState @@ -140,8 +141,19 @@ def to_numpy(x): raise ValueError(f"Not a triton-compatible tensor: {x}") -def supports_tma(): - return is_cuda() and torch.cuda.get_device_capability()[0] >= 9 +def supports_tma(byval_only=False): + _, cuda_version = _path_to_binary("ptxas") + min_cuda_version = (12, 0) if byval_only else (12, 3) + cuda_version_tuple = tuple(map(int, cuda_version.split("."))) + 
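    # Assumption: _path_to_binary reports the ptxas version as a "major.minor" string. +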
assert len(cuda_version_tuple) == 2, cuda_version_tuple + return is_cuda() and torch.cuda.get_device_capability()[0] >= 9 and cuda_version_tuple >= min_cuda_version + + +def tma_skip_msg(byval_only=False): + if byval_only: + return "Requires __grid_constant__ TMA support (NVIDIA Hopper or higher, CUDA 12.0 or higher)" + else: + return "Requires advanced TMA support (NVIDIA Hopper or higher, CUDA 12.3 or higher)" -requires_tma = pytest.mark.skipif(not supports_tma(), reason="Requires TMA support (NVIDIA Hopper or higher)") +requires_tma = pytest.mark.skipif(not supports_tma(), reason=tma_skip_msg()) From 03c63129d314bc2d112a802115e39fce3f17d2ec Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Fri, 22 Nov 2024 09:42:54 -0600 Subject: [PATCH 10/12] [CMake] Add C as project language (#5217) If you build with `-DTRITON_BUILD_UT=OFF` on Mac, you will get something like ``` -- Looking for histedit.h CMake Error at /opt/homebrew/Cellar/cmake/3.30.5/share/cmake/Modules/CheckIncludeFile.cmake:90 (try_compile): Unknown extension ".c" for file -- Looking for histedit.h - not found /Users/runner/work/triton/triton/triton-build/CMakeFiles/CMakeScratch/TryCompile-QA06d6/CheckIncludeFile.c try_compile() works only for enabled languages. Currently these are: CXX See project() command to enable other languages. Call Stack (most recent call first): llvm-bd9145c8-macos-arm64/lib/cmake/llvm/FindLibEdit.cmake:28 (check_include_file) llvm-bd9145c8-macos-arm64/lib/cmake/llvm/LLVMConfig.cmake:177 (find_package) llvm-bd9145c8-macos-arm64/lib/cmake/mlir/MLIRConfig.cmake:10 (find_package) ``` because `C` isn't an enabled project language. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5aa40499e..a73f3ad716 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_INCLUDE_CURRENT_DIR ON) -project(triton CXX) +project(triton CXX C) include(CTest) if(NOT WIN32) From 422e5d349592e06cdc50d2c5f34f65371138a31c Mon Sep 17 00:00:00 2001 From: Alexander Efimov Date: Fri, 22 Nov 2024 20:56:28 +0300 Subject: [PATCH 11/12] [AMD] Fix slow compilation due to inlining print calls (#5153) This PR disables inlining of print-related functions, which speeds up compilation of test_scan_layouts dramatically. --------- Co-authored-by: Lei Zhang --- python/src/llvm.cc | 2 -- python/test/unit/language/test_core.py | 2 -- third_party/amd/backend/compiler.py | 3 +++ third_party/amd/python/triton_amd.cc | 18 ++++++++++++++++++ 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/python/src/llvm.cc b/python/src/llvm.cc index f9b98a2540..182f79d783 100644 --- a/python/src/llvm.cc +++ b/python/src/llvm.cc @@ -139,8 +139,6 @@ std::string translateLLVMIRToASM(llvm::Module &module, { llvm::raw_string_ostream stream(result); llvm::buffer_ostream pstream(stream); - for (llvm::Function &f : module.functions()) - f.addFnAttr(llvm::Attribute::AlwaysInline); llvm::legacy::PassManager pass; // emit auto fileType = isObject ? llvm::CodeGenFileType::ObjectFile diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 514ac171a3..e62373d6fb 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -2563,8 +2563,6 @@ def kernel(X, Y, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, NUM_PID_N: tl.
@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("add_overflow_check", [False, True]) def test_scan_layouts(M, N, src_layout, axis, add_overflow_check, device, tmp_path: pathlib.Path): - if add_overflow_check is True and is_hip(): - pytest.skip("overflow check disabled on HIP while fixing issues") overflow_check = """ %17 = arith.extsi %arg2 : i32 to i64 diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index c8c43a0512..c222be2cd6 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -342,6 +342,9 @@ def make_llir(src, metadata, options): metadata["shared"] = src.get_int_attr("triton_gpu.shared") amd.cleanup_bitcode_metadata(llvm_mod) + # Disable inlining of print related functions, + # because inlining of these function could slow down compilation significantly + amd.disable_print_inline(llvm_mod) return str(llvm_mod) @staticmethod diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc index a9bd3e9b7f..3c33509910 100644 --- a/third_party/amd/python/triton_amd.cc +++ b/third_party/amd/python/triton_amd.cc @@ -161,6 +161,24 @@ void init_triton_amd(py::module &&m) { module->eraseNamedMetadata(openclVersion); }); + m.def("disable_print_inline", [](llvm::Module *module) { + // List of functions name prefixes we want to forbid inline. + std::array prefixes = {"__ockl_fprintf", "__ockl_printf"}; + + for (llvm::Function &f : module->functions()) { + if (!f.hasName()) + continue; + llvm::StringRef name = f.getName(); + + auto isNamePrefixed = [&name](const char *prefix) { + return name.starts_with(prefix); + }; + + if (llvm::any_of(prefixes, isNamePrefixed)) + f.addFnAttr(llvm::Attribute::NoInline); + } + }); + m.def( "assemble_amdgcn", [](const std::string &assembly, const std::string &arch, From 16ce143b54eacf465c5a90a6aabdc9c3a723cb99 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 22 Nov 2024 11:16:23 -0800 Subject: [PATCH 12/12] [AMD] Re-enable overflow test in test_reduce_layouts (#5233) https://github.com/triton-lang/triton/pull/5153 fixed the issue; but we missed enabling one of the disabled case. --- python/test/unit/language/test_core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index e62373d6fb..b2e0fa59a8 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -2666,8 +2666,6 @@ def test_reduce_layouts(M, N, src_layout, axis, epilogue_kind, dtype_str, add_ov pytest.skip("Skipping because tensor shape is smaller than M(f)maLayout instr_shape") if is_hip() and isinstance(src_layout, MfmaLayout) and ((M, N) == (128, 128)): pytest.skip("Skipping test because it runs out of shared memory") - if add_overflow_check is True and is_hip(): - pytest.skip("overflow check disabled on HIP while fixing issues") if reduce_op == "sum" and dtype_str == "float16" and M * N > 1024: pytest.skip("Skipping sum reduction on float16 due to accuracy issues")