Commit 1c11bc2

Merge OpenAI Triton commit 6b70e71 (#4836)
This PR changes the Triton base from 8dd7ccb to 6b70e71 (Jul 25). Pass rate: 98.62%.
2 parents: f8ce49d + d9edb64

151 files changed: +3999, -2027 lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -116,6 +116,10 @@ if(NOT MLIR_DIR)
   set(MLIR_DIR ${LLVM_LIBRARY_DIR}/cmake/mlir)
 endif()
 
+if(NOT LLD_DIR)
+  set(LLD_DIR ${LLVM_LIBRARY_DIR}/cmake/lld)
+endif()
+
 # MLIR
 find_package(MLIR REQUIRED CONFIG PATHS ${MLIR_DIR})
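
Presumably this mirrors the MLIR_DIR default directly above it so that a later find_package call can locate LLD's CMake package; the find_package(LLD ...) call itself is not shown in this hunk. A minimal sketch of overriding the default at configure time, with an illustrative install prefix:

cmake -DLLD_DIR=$HOME/llvm-install/lib/cmake/lld ..
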
Makefile

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ PYTHON ?= python
 BUILD_DIR := $(shell cd python; $(PYTHON) -c 'from build_helpers import get_cmake_dir; print(get_cmake_dir())')
 TRITON_OPT := $(BUILD_DIR)/bin/triton-opt
 PYTEST := $(PYTHON) -m pytest
-LLVM_BUILD_PATH ?= ".llvm-project/build"
+LLVM_BUILD_PATH ?= $(realpath .llvm-project/build)
 NUM_PROCS ?= 8
 
 # Incremental builds
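
For context, GNU Make's $(realpath ...) expands to an absolute, symlink-resolved path (and to the empty string if the path does not exist), whereas the old quoted form passed the literal string through, quotes included. A minimal sketch of inspecting the result; the target name is hypothetical, and the recipe line must start with a tab:

# Hypothetical debug target: prints an absolute path such as
# /home/user/triton/.llvm-project/build rather than ".llvm-project/build".
print-llvm-path:
	@echo $(LLVM_BUILD_PATH)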

docs/programming-guide/chapter-3/debugging.rst

Lines changed: 1 addition & 1 deletion

@@ -77,6 +77,6 @@ Using Third-party Tools
 For debugging on NVIDIA GPUs, `compute-sanitizer <https://docs.nvidia.com/cuda/compute-sanitizer/index.html>`_ is an effective tool for checking data races and memory access issues.
 To use it, prepend :code:`compute-sanitizer` to your command to run the Triton program.
 
-For debugging on AMD GPUs, you may want to try the LLVM `AddressSanitizer <https://rocm.docs.amd.com/en/latest/conceptual/using-gpu-sanitizer.html>`_ for ROCm.
+For debugging on AMD GPUs, you may want to try the LLVM `AddressSanitizer <https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/using-gpu-sanitizer.html>`_ for ROCm.
 
 For detailed visualization of memory access in Triton programs, consider using the `triton-viz <https://github.com/Deep-Learning-Profiling-Tools/triton-viz>`_ tool, which is agnostic to the underlying GPUs.
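
As the context lines above describe, compute-sanitizer is simply prepended to the usual launch command; a minimal sketch, where the script name is illustrative and memcheck is compute-sanitizer's default tool:

compute-sanitizer --tool memcheck python my_triton_kernel.py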

include/triton/Analysis/Allocation.h

Lines changed: 5 additions & 0 deletions

@@ -63,6 +63,11 @@ getScratchCvtInOutVecLengths(RankedTensorType srcTy, RankedTensorType dstTy);
 ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
                                      RankedTensorType dstTy);
 
+unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
+                                       RankedTensorType dstTy);
+
+unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
+                                     RankedTensorType dstTy);
 } // namespace triton
 
 /// Modified from llvm-15.0: llvm/ADT/AddressRanges.h
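
A hedged sketch of how a caller might use the two new helpers to size conversion scratch memory; the choice between the swizzled and padded strategies (useSwizzled) is made elsewhere and is an assumption here:

// Hypothetical sizing snippet: pick the element count for the chosen
// strategy, then convert to bytes using the source element type.
unsigned elems = useSwizzled
                     ? triton::getNumScratchElemsSwizzledCvt(srcTy, dstTy)
                     : triton::getNumScratchElemsPaddedCvt(srcTy, dstTy);
unsigned scratchBytes = elems * srcTy.getElementTypeBitWidth() / 8;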

include/triton/Analysis/Utility.h

Lines changed: 0 additions & 4 deletions

@@ -254,10 +254,6 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);
 
 bool atomicNeedsSharedMemory(Value result);
 
-// Return true if the src and dst layout match.
-bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
-                                   RankedTensorType dstTy);
-
 // Check if MFMA layout can be converted to the dot operand
 // layout using warp shuffle.
 bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 15 additions & 2 deletions

@@ -356,12 +356,12 @@ class SharedMemoryObject {
                          RewriterBase &rewriter) const;
 
   // Returns a mask representing all the bits of the memdesc offsets that
-  // may be modified by an affine offset coming from a memdesc_subview.
+  // may be modified by an affine offset coming from a memdesc_subslice.
   // The offsets are considered to be in the type of the memdesc.
   // For padded layouts, we return the offsets without padding.
   static uint64_t getMaskSpanOffsets(triton::gpu::MemDescType srcTy);
 
-  // Returns whether the shared memory access had a memdesc_subview
+  // Returns whether the shared memory access had a memdesc_subslice
   // that is rank-preserving (soon to be called memdesc_slice)
   static bool isAffineSharedMemoryAccess(triton::gpu::MemDescType srcTy) {
     return getMaskSpanOffsets(srcTy) != 0;

@@ -644,6 +644,19 @@ Value transferWithinBlockPadding(triton::gpu::ConvertLayoutOp op, Value src,
                                  const TargetInfoBase &targetInfo,
                                  const LLVMTypeConverter *typeConverter,
                                  RewriterBase &rewriter);
+
+SmallVector<Value> inlineRegionImpl(RewriterBase &rewriter, Region &region,
+                                    ArrayRef<Value> args,
+                                    mlir::TypeID terminatorTypeId,
+                                    Location loc);
+
+template <typename TerminatorOp>
+SmallVector<Value> inlineRegion(RewriterBase &rewriter, Region &region,
+                                ArrayRef<Value> args, Location loc) {
+  return inlineRegionImpl(rewriter, region, args,
+                          mlir::TypeID::get<TerminatorOp>(), loc);
+}
+
 } // namespace mlir
 
 #endif
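
A hedged sketch of calling the new inlineRegion helper from a lowering pattern; the op, its getScalarOp() region accessor, and the scalarArgs value list are illustrative assumptions, not call sites shown in this commit:

// Inline the op's scalar region, substituting scalarArgs for its block
// arguments and collecting the operands of its
// tt.map_elementwise.return terminator.
SmallVector<Value> results =
    mlir::inlineRegion<triton::MapElementwiseReturnOp>(
        rewriter, op.getScalarOp(), scalarArgs, op.getLoc());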

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 20 additions & 0 deletions

@@ -797,6 +797,26 @@ def TT_ScanReturnOp: TT_Op<"scan.return",
   let assemblyFormat = "$result attr-dict `:` type($result)";
 }
 
+//
+// Map Elementwise op
+//
+def TT_MapElementwiseOp: TT_Op<"map_elementwise", [SameOperandsAndResultEncoding,
+                                                   SameOperandsAndResultShape,
+                                                   RecursiveMemoryEffects]> {
+  let summary = "Map a scalar subregion over a tensor";
+  let arguments = (ins Variadic<TT_Tensor>:$srcs, I32Attr:$pack);
+  let results = (outs Variadic<TT_Tensor>:$result);
+  let regions = (region AnyRegion:$scalarOp);
+  let hasVerifier = 1;
+  let hasRegionVerifier = 1;
+}
+
+def TT_MapElementwiseReturnOp: TT_Op<"map_elementwise.return",
+                                     [HasParent<"MapElementwiseOp">, Pure, Terminator, ReturnLike]> {
+  let summary = "terminator for map elementwise operator";
+  let arguments = (ins Variadic<AnyType>:$result);
+  let assemblyFormat = "attr-dict ($result^ `:` type($result))?";
+}
 
 //
 // External Elementwise op
include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 38 additions & 19 deletions

@@ -200,38 +200,57 @@ def TTG_LocalDeallocOp : TTG_Op<"local_dealloc"> {
   // Use qualified() otherwise "!ttg.memdesc<X>" is printed as "<X>".
   let assemblyFormat = [{$src attr-dict `:` qualified(type($src))}];
 }
-
-def TTG_MemDescSubviewOp : TTG_Op<"memdesc_subview", [Pure, MemDescViewTrait]> {
+def TTG_MemDescIndexOp : TTG_Op<"memdesc_index", [Pure, MemDescViewTrait]> {
   let summary = "take a subview of the descriptor.";
 
   let description = [{
-    This operation returns a new descriptor representing a subview of the buffer.
+    This operation returns a new descriptor pointing to the `i`-th element of the
+    input descriptor along the 0-th dimension.
+
     It doesn't affect the underlying memory.
 
     For example, suppose that
    - the input shape is 2x4x16xf16,
    - the output shape is 4x16xf16, and
-    - offsets = [1, 0, 0].
+    - index = 1.
+    Then the output descriptor is equivalent to input[1], where input is the logical tensor.
 
-    Then in Python syntax, the subview covers input[1].
+    When the input is of rank 1 (i.e, shape=[k]), the output will have shape=[1].
+  }];
 
-    Just one dimension may be split (at most one non-zero offset).
+  let arguments = (ins TTG_MemDescType:$src, I32:$index);
 
-    When the input shape and the output shape have different rank:
-    Or the output shape is a tensor of 1D tensor of 1 element:
-    - The rank of the output must be 1D smaller than the input.
-    - We assume the input is split along the 0th dimension.
-    - The offset along the 0th dimension may be a runtime value.
-    When the input and the output have the same rank:
-    - The offset must be a compile-time constant
-    - Larger or equal to the tile of the tensor (or zero)
-    - That does not split the input along the swizzling pattern (if any)
-  }];
-  let arguments = (
-    ins TTG_MemDescType:$src, Variadic<I32>:$offsets);
+  let results = (outs TTG_MemDescType:$result);
+
+  let assemblyFormat = [{$src `,` $index attr-dict `:` qualified(type($src)) `->` qualified(type($result))}];
+
+  let hasVerifier = 1;
+}
 
+def TTG_MemDescSubsliceOp : TTG_Op<"memdesc_subslice", [Pure, MemDescViewTrait]> {
+  let summary = "take a subview of the descriptor.";
+
+  let description = [{
+    This operation returns a new descriptor representing a subview of the logical tensor.
+    It doesn't affect the underlying memory.
+
+    For example, suppose that
+    - the input shape is 32x16xf16,
+    - the output shape is 8x16xf16, and
+    - offsets = [2, 1].
+    Then in Python syntax, the subview covers input[2:8+2, 1:16+1] where input is
+    the logical tensor.
+
+    The offsets must be larger or equal to the tile of the tensor (or zero).
+  }];
+  let arguments = (ins TTG_MemDescType:$src, DenseI32ArrayAttr:$offsets);
   // Use qualified() otherwise "!ttg.memdesc<X>" is printed as "<X>".
-  let assemblyFormat = [{$src `[` $offsets `]` attr-dict `:` qualified(type($src)) `->` qualified(type($result))}];
+  // Render offsets inline as %src[0, 0] via a custom directive, but keep
+  // the overall parse/print generated from this assemblyFormat.
+  let assemblyFormat = [{
+    $src `[` custom<Offsets>($offsets) `]` attr-dict `:` qualified(type($src))
+    `->` qualified(type($result))
+  }];
 
   let results = (outs TTG_MemDescType:$result);
include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 5 additions & 0 deletions

@@ -60,6 +60,11 @@ def TritonGPUHoistTMEMAlloc : Pass<"tritongpu-hoist-tmem-alloc", "mlir::ModuleOp
                "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
                "mlir::scf::SCFDialect",
                "mlir::arith::ArithDialect"];
+  let options = [
+    Option<"hoistOutOfIf", "hoist-out-of-if",
+           "bool", /*default*/"false",
+           "Hoist TMEM allocations out of if statements">
+  ];
 }
 
 def TritonGPUTestPipelineLowerLoop : Pass<"tritongpu-test-pipeline-lower-loop", "mlir::ModuleOp"> {
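
With MLIR's standard pass-option syntax, the new knob would presumably be driven from triton-opt like this; the input file name is illustrative:

triton-opt kernel.mlir --tritongpu-hoist-tmem-alloc="hoist-out-of-if=true"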

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 11 additions & 2 deletions

@@ -68,6 +68,15 @@ bool isOuterLoop(scf::ForOp forOp);
 /// Function to mask operations during scheduling.
 Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred);
 
+/// Wrap the operation into a MaskOp using the provided predicate, enabling high
+/// level predication abstraction during pipelining.
+Operation *wrapInMaskOp(RewriterBase &rewriter, Operation *op, Value pred);
+
+// Utilize high level predication abstraction to perform optimizations before
+// lowering to predicated operations
+void resolveMaskOp(ModuleOp moduleOp,
+                   DenseSet<triton::gpu::MaskOp> &peeledMaskOps);
+
 // Return true if the given ForOp has the attribute
 // `tt.disallow_acc_multi_buffer` set to true.
 bool getDisallowAccMultiBuffer(scf::ForOp forOp);

@@ -133,11 +142,11 @@ gpu::SharedEncodingTrait getSharedEncoding(Operation *loadOp);
 // specified.
 int getNumStagesOrDefault(scf::ForOp forOp, int defaultNumStages);
 
-// Given a result of MemDescSubview, or Alloca, create a MemDescSubview with a
+// Given a result of MemDescIndex, or Alloca, create a MemDescIndex with a
 // single buffer slice (leading dimension equal to 1), at the given index.
 TypedValue<triton::gpu::MemDescType>
 createSingleBufferView(OpBuilder &builder, Value alloc, Value idx);
-// Given a result of MemDescSubview, or Alloca, create a MemDescSubview with a
+// Given a result of MemDescIndex, or Alloca, create a MemDescIndex with a
 // single buffer slice (leading dimension equal to 1), at the given index.
 TypedValue<triton::gpu::MemDescType>
 createSingleBufferView(OpBuilder &builder, Value alloc, int idx);
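
A hedged sketch of the flow the two new declarations suggest: wrap operations in the mask abstraction while scheduling, then resolve the masks before lowering. The driver loop, the pred value, and the set of peeled masks are assumptions:

// Hypothetical pipelining driver fragment.
DenseSet<triton::gpu::MaskOp> peeledMaskOps;
for (Operation *op : opsToPredicate)
  wrapInMaskOp(rewriter, op, pred);      // keep predication abstract for now
// ... scheduling may peel iterations and record their masks here ...
resolveMaskOp(moduleOp, peeledMaskOps);  // optimize, then lower to predicated ops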
