Commit 7d4f1ce

Merge commit '1cf06c5e1982eba8f17062e1c6c3d3fa458597b2'
2 parents c17a0fb + 1cf06c5 commit 7d4f1ce

44 files changed, +1057 -701 lines

README.md

Lines changed: 3 additions & 0 deletions
@@ -176,6 +176,9 @@ For detailed instructions on how to debug Triton's frontend, please refer to this
   kernels. Use `MLIR_ENABLE_DUMP=kernelName` to dump for a specific kernel only.
   - Triton cache can interfere with the dump. In cases where `MLIR_ENABLE_DUMP=1` does not work, try cleaning your triton cache: `rm -r ~/.triton/cache/*`
 - `LLVM_IR_ENABLE_DUMP=1` dumps the IR before every pass run over the LLVM IR.
+- `TRITON_REPRODUCER_PATH=<reproducer_path>` will generate an MLIR reproducer file
+  at `<reproducer_path>` before each MLIR compiler stage. If any of the stages fail,
+  `<reproducer_path>` will be a local MLIR reproducer captured right before the failing pass.
 - `TRITON_INTERPRET=1` uses the Triton interpreter instead of running on the
   GPU. You can insert Python breakpoints in your kernel code!
 - `TRITON_ENABLE_LLVM_DEBUG=1` passes `-debug` to LLVM, printing a lot of

cmake/AddTritonUnitTest.cmake

Lines changed: 1 addition & 1 deletion
@@ -35,5 +35,5 @@ function(add_triton_ut)
   # Without the TEST_DISCOVERY_TIMEOUT, the tests randomly time out on my mac
   # laptop. I think the issue may be that the very first time you run a program
   # it's a bit slow.
-  gtest_discover_tests(${__NAME} PROPERTIES TEST_DISCOVERY_TIMEOUT 60)
+  gtest_discover_tests(${__NAME} DISCOVERY_TIMEOUT 60)
 endfunction()

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 40 additions & 24 deletions
@@ -956,9 +956,10 @@ def TT_MakeTensorPtrOp : TT_Op<"make_tensor_ptr",
 //
 // Make Tensor Descriptor Op
 //
-def TT_MakeTensorDescOp : TT_Op<"make_tensor_descriptor",
-    [Pure,
-     SameVariadicOperandSize]> {
+def TT_MakeTensorDescOp : TT_Op<"make_tensor_descriptor", [
+  Pure,
+  SameVariadicOperandSize,
+]> {
   let summary = "Make a tensor descriptor type with meta information of the parent tensor and block size";

   let description = [{
@@ -969,23 +970,38 @@ def TT_MakeTensorDescOp : TT_Op<"make_tensor_descriptor",
   let arguments = (ins
     TT_Ptr:$base,
     Variadic<I32>:$shape,
-    Variadic<I64>:$strides,
-    DenseI32ArrayAttr:$tensorShape
+    Variadic<I64>:$strides
   );

-  // TODO(peterbell10): define a custom IR type to represent descriptors
-  let results = (outs TT_Ptr:$result);
+  let results = (outs TT_TensorDescType:$result);

   let assemblyFormat = "$base `,` `[` $shape `]` `,` `[` $strides `]` attr-dict `:` type($base) `,` type($result)";

   let builders = [
-    OpBuilder<(ins
-      "Value":$base,
-      "ValueRange":$shape,
-      "ValueRange":$strides,
-      "ArrayRef<int32_t>":$tensorShape
-    )>
+    OpBuilder<(ins "Value":$base, "ValueRange":$shape, "ValueRange":$strides, "ArrayRef<int32_t>":$blockShape)>
   ];
+
+  let extraClassDeclaration = [{
+    ArrayRef<int64_t> getTensorShape() {
+      return getType().getBlockType().getShape();
+    }
+  }];
+}
+
+def ReinterpretTensorDescOp : TT_Op<"reinterpret_tensor_descriptor", [Pure]> {
+  let summary = "Reinterpret a pointer as a tensor descriptor";
+
+  let description = [{
+    This Op exists to help the transition from untyped raw TMA objects to typed Tensor descriptor objects.
+    Ideally, we can remove this once the APIs are fully fleshed out.
+  }];
+
+  let arguments = (ins TT_Ptr:$rawDesc);
+  let results = (outs TT_TensorDescType:$result);
+
+  let assemblyFormat = [{
+    $rawDesc attr-dict `:` qualified(type($rawDesc)) `to` qualified(type($result))
+  }];
 }

 // The following ops, including `call`, `func`, and `return` are copied and modified from
@@ -1195,20 +1211,19 @@ def ReturnOp : TT_Op<"return", [Pure, HasParent<"FuncOp">, /*MemRefsNormalizable


-def TT_ExperimentalDescriptorLoadOp : TT_Op<"experimental_descriptor_load", [
-    MemoryEffects<[MemRead<GlobalMemory>]>]> {
+def TT_ExperimentalDescriptorLoadOp : TT_Op<"experimental_descriptor_load", [MemoryEffects<[MemRead<GlobalMemory>]>]> {
   let summary = "Load from descriptor";
   let description = [{
     This operation will be lowered to Nvidia TMA load operation on targets supporting it.
-    `desc_ptr` is a pointer to the TMA descriptor allocated in global memory.
+    `desc` is a tensor descriptor object.
     The destination tensor type and shape must match the descriptor otherwise the result is undefined.

     This is an escape hatch and is only there for testing/experimenting.
     This op will be removed in the future.
   }];
   let arguments = (
     ins
-    TT_PtrType:$desc_ptr,
+    TT_TensorDescType:$desc,
     Variadic<I32>:$indices,
     DefaultValuedAttr<TT_CacheModifierAttr, "::mlir::triton::CacheModifier::NONE">:$cache,
     DefaultValuedAttr<TT_EvictionPolicyAttr, "::mlir::triton::EvictionPolicy::NORMAL">:$evict
@@ -1217,36 +1232,37 @@ def TT_ExperimentalDescriptorLoadOp : TT_Op<"experimental_descriptor_load", [
   let results = (outs TT_Tensor:$result);

   let assemblyFormat = [{
-    $desc_ptr `[` $indices `]`
+    $desc `[` $indices `]`
     oilist(
       `cacheModifier` `=` $cache |
       `evictionPolicy` `=` $evict
     )
-    attr-dict `:` qualified(type($desc_ptr)) `->` type($result)
+    attr-dict `:` qualified(type($desc)) `->` type($result)
   }];
 }

 def TT_ExperimentalDescriptorStoreOp : TT_Op<"experimental_descriptor_store", [
-    MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>]> {
+    MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>,
+]> {
   let summary = "store value based on descriptor";
   let description = [{
     This operation will be lowered to Nvidia TMA store operation on targets supporting it.
-    `desc_ptr` is a pointer to the TMA descriptor allocated in global memory.
+    `desc` is a tensor descriptor object.
     The shape and types of `src` must match the descriptor otherwise the result is undefined.

     This is an escape hatch and is only there for testing/experimenting.
     This op will be removed in the future.
   }];
   let arguments = (
     ins
-    TT_PtrType:$desc_ptr,
+    TT_TensorDescType:$desc,
     TT_Tensor:$src,
     Variadic<I32>:$indices
   );

   let assemblyFormat = [{
-    $desc_ptr `[` $indices `]` `,` $src
-    attr-dict `:` qualified(type($desc_ptr)) `,` type($src)
+    $desc `[` $indices `]` `,` $src
+    attr-dict `:` qualified(type($desc)) `,` type($src)
   }];
 }
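For orientation, a hand-written sketch of the textual IR these updated definitions accept; the value names, shapes, and element types are illustrative, not taken from this commit:

// Create a typed descriptor; the result is now !tt.tensordesc rather than !tt.ptr<i8>.
%desc = tt.make_tensor_descriptor %base, [%dim0, %dim1], [%s0, %s1] : <f16>, <tensor<128x64xf16>>

// Transitional escape hatch: view a raw TMA pointer as a typed descriptor.
%desc2 = tt.reinterpret_tensor_descriptor %raw : !tt.ptr<i8> to !tt.tensordesc<tensor<128x64xf16>>

// The experimental load/store ops now take the descriptor value directly.
%tile = tt.experimental_descriptor_load %desc[%x, %y] : !tt.tensordesc<tensor<128x64xf16>> -> tensor<128x64xf16>
tt.experimental_descriptor_store %desc[%x, %y], %tile : !tt.tensordesc<tensor<128x64xf16>>, tensor<128x64xf16>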

include/triton/Dialect/Triton/IR/TritonTypes.td

Lines changed: 11 additions & 0 deletions
@@ -140,5 +140,16 @@ def TT_MemDescType : TritonTypeDef<"MemDesc", "memdesc", [ShapedTypeInterface]>
   let hasCustomAssemblyFormat = 1;
 }

+// Result type of ExperimentalMakeTensorDescriptor
+def TT_TensorDescType : TritonTypeDef<"TensorDesc", "tensordesc", []> {
+  let summary = "Tensor descriptor type (`::mlir::triton::TensorDescType`) in Triton IR type system";
+
+  let description = [{
+    A portable abstraction for nvidia-TMA descriptors.
+  }];
+
+  let parameters = (ins "RankedTensorType":$blockType);
+  let assemblyFormat = "`<` $blockType `>`";
+}

 #endif
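As a quick illustration (block shape hypothetical), the new type prints with its block type as the sole parameter:

!tt.tensordesc<tensor<64x64xf32>>  // descriptor for 64x64 blocks of f32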

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 1 addition & 1 deletion
@@ -295,7 +295,7 @@ def TTG_GlobalScratchAllocOp : TTG_Op<"global_scratch_alloc", [MemoryEffects<[Me
       $_builder.getI32IntegerAttr(nbytes), $_builder.getI32IntegerAttr(alignment)); }]>
   ];

-  let assemblyFormat = [{attr-dict `:` type($result)}];
+  let assemblyFormat = [{attr-dict `:` qualified(type($result))}];
 }

 #endif

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 20 additions & 0 deletions
@@ -185,6 +185,26 @@ def TTNG_WaitBarrierOp : TTNG_Op<"wait_barrier", [DeclareOpInterfaceMethods<Memo
   let assemblyFormat = "$alloc `,` $phase attr-dict `:` type($alloc)";
 }

+def TTNG_TensorDescToTMAPtrOp : TTNG_Op<"tensor_desc_to_tma_ptr", [Pure]> {
+  let summary = "Convert tensor descriptor to pointer to tma descriptor";
+
+  let arguments = (ins TT_TensorDescType:$desc);
+  let results = (outs TT_Ptr:$ptr);
+
+  let assemblyFormat = [{
+    $desc attr-dict `:` qualified(type($desc)) `to` qualified(type($ptr))
+  }];
+
+  let builders = [
+    OpBuilder<(ins "Value":$desc), [{
+      auto ptrTy = triton::PointerType::get($_builder.getI8Type(), 1);
+      build($_builder, $_state, ptrTy, desc);
+    }]>
+  ];
+
+  let hasCanonicalizeMethod = 1;
+}
+

 def TTNG_AsyncTMACopyGlobalToLocalOp : TTNG_Op<"async_tma_copy_global_to_local", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
   let summary = "copy data based on descriptor from global memory to local memory asynchronously";
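A sketch of the new op in textual IR (block shape illustrative). Per the builder above, the default result is an i8 pointer in address space 1, i.e. a raw pointer to the TMA descriptor in global memory:

%tma_ptr = ttng.tensor_desc_to_tma_ptr %desc : !tt.tensordesc<tensor<128x64xf16>> to !tt.ptr<i8>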

lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,10 @@ TritonGPUToLLVMTypeConverter::TritonGPUToLLVMTypeConverter(
   addConversion([&](MemDescType type) -> std::optional<Type> {
     return convertMemDescType(type, targetInfo);
   });
+  addConversion([](TensorDescType type) -> std::optional<Type> {
+    auto ctx = type.getContext();
+    return LLVM::LLVMPointerType::get(ctx, 1);
+  });
   addConversion([&](triton::gpu::AsyncTokenType type) -> std::optional<Type> {
     return convertAsyncToken(type);
   });
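The net effect: a !tt.tensordesc value is carried through LLVM lowering as an opaque pointer in address space 1 (global memory). Sketched mapping, with an illustrative block type:

// before conversion: %desc : !tt.tensordesc<tensor<128x64xf16>>
// after conversion:  %desc : !llvm.ptr<1>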

lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp

Lines changed: 4 additions & 9 deletions
@@ -59,15 +59,10 @@ class ArithConstantPattern : public OpConversionPattern<arith::ConstantOp> {
     Type retType = getTypeConverter()->convertType(op.getType());
     auto retShapedType = cast<ShapedType>(retType);
     auto value = dyn_cast<DenseElementsAttr>(adaptor.getValue());
-    if (dyn_cast<RankedTensorType>(retShapedType)) {
-      assert(value);
-      if (value.getElementType().isInteger(1) && value.isSplat())
-        // Workaround until https://reviews.llvm.org/D133743 is included.
-        value =
-            DenseElementsAttr::get(retShapedType, value.getSplatValue<bool>());
-      else
-        // This is a hack. We just want to add encoding
-        value = value.reshape(retShapedType);
+    if (isa<RankedTensorType>(retShapedType)) {
+      assert(value && "expected a dense elements attribute");
+      // This is a hack. We just want to add encoding.
+      value = value.reshape(retShapedType);
     }
     addNamedAttrs(rewriter.replaceOpWithNewOp<arith::ConstantOp>(
                       op, retShapedType, value),

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 11 additions & 5 deletions
@@ -8,6 +8,7 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
+#include "llvm/Support/ErrorHandling.h"

 namespace mlir {
 namespace triton {
@@ -863,12 +864,17 @@ OpFoldResult AdvanceOp::fold(FoldAdaptor adaptor) {
 //-- MakeTensorDescOp --
 void MakeTensorDescOp::build(OpBuilder &builder, OperationState &state,
                              Value base, ValueRange shape, ValueRange strides,
-                             ArrayRef<int32_t> tensorShape) {
-  auto resultTy = getPointerType(builder.getI8Type());
-  assert(resultTy.getContext());
+                             ArrayRef<int32_t> blockShape) {
+  auto ptrTy = dyn_cast<triton::PointerType>(base.getType());
+  if (!ptrTy) {
+    llvm::report_fatal_error("Expected pointer type");
+  }
+  auto elemTy = ptrTy.getPointeeType();

-  return build(builder, state, resultTy, base, shape, strides,
-               builder.getDenseI32ArrayAttr(tensorShape));
+  SmallVector<int64_t> blockShape64(blockShape);
+  auto blockTy = RankedTensorType::get(blockShape64, elemTy);
+  auto descTy = TensorDescType::get(builder.getContext(), blockTy);
+  return build(builder, state, descTy, base, shape, strides);
 }

 // The following ops, including `call`, `func`, and `return` are copied and

lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp

Lines changed: 6 additions & 4 deletions
@@ -34,13 +34,13 @@ namespace tt = mlir::triton;
 namespace ttg = mlir::triton::gpu;
 namespace ttng = mlir::triton::nvidia_gpu;

-// TODO: We can extra some helpers into common utilities once we add more
+// TODO: We can extract some helpers into common utilities once we add more
 // schedules.

 namespace {

 struct LoadInfo {
-  // Layout of the data in the shared memory.
+  // Layout of the data in shared memory.
   ttg::SharedEncodingAttr sharedEncoding = nullptr;
   // Blocked encoding is used for loads not used by the dot.
   ttg::BlockedEncodingAttr blockedEncoding = nullptr;
@@ -239,9 +239,11 @@ createTMAAsyncCopy(scf::ForOp &forOp, tt::ExperimentalDescriptorLoadOp loadOp,

   Value pred = builder.createWithStage<arith::ConstantIntOp>(loc, stage,
                                                              clusterId, 1, 1);
+  Value tmaPtr =
+      builder.createWithStage<triton::nvidia_gpu::TensorDescToTMAPtrOp>(
+          loc, stage, clusterId, loadOp.getDesc());
   Operation *copy = builder.createWithStage<ttng::AsyncTMACopyGlobalToLocalOp>(
-      loc, stage, clusterId, loadOp.getDescPtr(), loadOp.getIndices(), barrier,
-      view, pred);
+      loc, stage, clusterId, tmaPtr, loadOp.getIndices(), barrier, view, pred);

   bool isMMV3Load = loadToInfo[loadOp].loadIsMMAV3;