intel
diff --git a/‎.github/workflows/integration-tests-amd.yml
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/integration-tests-amd.yml
Lines changed: 1 addition & 1 deletion
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎Makefile
Lines changed: 2 additions & 2 deletions b/‎Makefile
Lines changed: 2 additions & 2 deletions
diff --git a/‎bin/RegisterTritonDialects.h
Lines changed: 1 addition & 1 deletion b/‎bin/RegisterTritonDialects.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
Lines changed: 17 additions & 5 deletions b/‎include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
Lines changed: 17 additions & 5 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Passes.td
Lines changed: 10 additions & 17 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/Passes.td
Lines changed: 10 additions & 17 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Utility.h
Lines changed: 4 additions & 0 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/Utility.h
Lines changed: 4 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h
Lines changed: 5 additions & 24 deletions b/‎include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h
Lines changed: 5 additions & 24 deletions
diff --git a/‎include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.td
Lines changed: 2 additions & 12 deletions b/‎include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.td
Lines changed: 2 additions & 12 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp
Lines changed: 27 additions & 36 deletions b/‎lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp
Lines changed: 27 additions & 36 deletions
@@ -109,7 +109,7 @@ jobs:
             echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
-          pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
+          pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice_concat_op.py
           TRITON_ALWAYS_COMPILE=1 pytest --capture=tee-sys -rfs third_party/amd/python/test/test_scalarize_packed_fops.py
           cd python/test/unit
           pytest --capture=tee-sys -rfs -n 12 language runtime \
 
@@ -10,6 +10,7 @@ llvm-project-*/
 dist/
 triton*.egg-info/
 *.whl
+python/triton_kernels/triton*.egg-info/
 
 python/triton/_C/*.pyd
 python/triton/_C/*.so
 
@@ -36,8 +36,8 @@ test-unit: all
 	$(PYTEST) -s -n 8 python/test/unit/test_debug.py --forked
 	$(PYTEST) -s -n 8 python/triton_kernels/tests/
 	TRITON_DISABLE_LINE_INFO=0 $(PYTEST) -s python/test/unit/language/test_line_info.py
-	# Run cuda/test_flashattention.py separately to avoid out of gpu memory
-	$(PYTEST) -s python/test/unit/cuda/test_flashattention.py
+	# Run attention separately to avoid out of gpu memory
+	$(PYTEST) -vs python/tutorials/06-fused-attention.py
 	TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=python/triton/instrumentation/libGPUInstrumentationTestLib.so \
 		$(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py
 
 
@@ -61,7 +61,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerAllPasses();
   mlir::triton::registerTritonPasses();
   mlir::triton::gpu::registerTritonGPUPasses();
-  mlir::registerTritonNvidiaGPUPasses();
+  mlir::triton::nvidia_gpu::registerTritonNvidiaGPUPasses();
   mlir::test::intel::registerTestAxisInfoPass();
   mlir::test::registerTestAliasPass();
   mlir::test::registerTestAlignmentPass();
 
@@ -208,14 +208,26 @@ def TTG_MemDescSubviewOp : TTG_Op<"memdesc_subview", [Pure, MemDescViewTrait]> {
 
   let description = [{
     This operation returns a new descriptor representing a subview of the buffer.
-    It doesn't affect the underlying memory. The subview can be rank-reduced.
+    It doesn't affect the underlying memory.
 
     For example, suppose that
      - the input shape is 2x4x16xf16,
-     - the output shape is 4x4xf16, and
-     - offsets = [1, 0, 4].
-
-    Then in Python syntax, the subview covers input[1][0:4][4:8].
+     - the output shape is 4x16xf16, and
+     - offsets = [1, 0, 0].
+
+    Then in Python syntax, the subview covers input[1].
+
+    Just one dimension may be split (at most one non-zero offset).
+
+    When the input shape and the output shape have different ranks,
+    or the output shape is a 1D tensor of 1 element:
+      - The rank of the output must be one less than the rank of the input.
+      - We assume the input is split along the 0th dimension.
+      - The offset along the 0th dimension may be a runtime value.
+    When the input and the output have the same rank:
+      - The offset must be a compile-time constant.
+      - The offset must be zero, or larger than or equal to the tile of the tensor.
+      - The offset must not split the input along the swizzling pattern (if any).
   }];
   let arguments = (
     ins TTG_MemDescType:$src, Variadic<I32>:$offsets);
 
@@ -165,6 +165,16 @@ def TritonGPUOptimizePartitionWarps : Pass<"tritongpu-optimize-partition-warps",
   }];
 }
 
+def TritonGPUPartitionScheduling : Pass<"tritongpu-partition-scheduling", "mlir::ModuleOp"> {
+  let summary = "warp specialization partitioning pass";
+
+  let description = [{
+    The `tritongpu-partition-scheduling` pass analyzes the loads, MMAs, and other
+    operations in a loop that is meant to be warp specialized and determines
+    which partitions to assign to each operation.
+  }];
+}
+
 def TritonGPULoadMMASpecialization : Pass<"tritongpu-load-mma-specialization", "mlir::ModuleOp"> {
   let summary = "load MMA specialization";
 
@@ -219,23 +229,6 @@ def TritonGPUPrefetch : Pass<"tritongpu-prefetch", "mlir::ModuleOp"> {
                            "mlir::arith::ArithDialect"];
 }
 
-def TritonGPUWGMMAPrefetch : Pass<"tritongpu-wgmma-prefetch", "mlir::ModuleOp"> {
-   let summary = "prefetch for wgmma mixed precision";
-
-   let description = [{
-       This pass attempts to prefetch from shared memory for mixed-precision
-       wgmma when operand A is in the shared memory and needs to be loaded
-       to the local registers.
-   }];
-
-   let dependentDialects = [ "mlir::triton::gpu::TritonGPUDialect",
-                             "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
-                             "mlir::scf::SCFDialect",
-                             "mlir::arith::ArithDialect"];
-}
-
-
-
 def TritonGPUAccelerateMatmul : Pass<"tritongpu-accelerate-matmul", "mlir::ModuleOp"> {
   let summary = "accelerate matmul";
 
 
@@ -54,6 +54,10 @@ getNumElementsPerThread(Operation *op, SmallVector<unsigned> order,
 // Returns whether the op is a "view op", i.e. doesn't move any data
 bool isView(Operation *op);
 
+// Returns whether the op is a "noop op", i.e. has one input and one output
+// and lowers to llvm as the identity function (returns the input)
+bool isNoop(Operation *op);
+
 /* Dump Triton IR in graphviz dot format.
  *
  * You can override `onValue` and `onOperation` in a subclass to mark
 
@@ -38,38 +38,19 @@ struct ClusterInfo {
   int clusterDimZ;
 };
 
-} // namespace nvidia_gpu
-} // namespace triton
-} // namespace mlir
-
-namespace mlir {
-
 std::unique_ptr<Pass> createTritonNvidiaGPUPlanCTAPass(
     mlir::triton::nvidia_gpu::ClusterInfo *clusterInfo = nullptr);
 
-std::unique_ptr<Pass>
-createTritonNvidiaGPUFenceInsertionPass(int computeCapability = 90);
-
-std::unique_ptr<Pass> createTritonNvidiaGPUTMALoweringPass();
-
-std::unique_ptr<Pass> createTensorMemoryAllocationPass();
-
-std::unique_ptr<Pass> createTritonNvidiaGPUMMALoweringPass();
-
-std::unique_ptr<Pass> createTritonNvidiaGPUPromoteLHSToTMemPass();
-
-std::unique_ptr<Pass> createTritonNvidiaGPURemoveTMEMTokensPass();
-
-std::unique_ptr<Pass> createTritonNvidiaGPUOptimizeDescriptorEncodingPass();
-
-std::unique_ptr<Pass> createTritonNvidiaGPUOptimizeTMemLayoutsPass();
-
-std::unique_ptr<Pass> createTritonNvidiaGPUInterleaveTMemPass();
+#define GEN_PASS_DECL
+#include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h.inc"
 
 /// Generate the code for registering passes.
 #define GEN_PASS_REGISTRATION
 #define GEN_PASS_DECL_TRITONNVIDIAGPULEGALIZETMALAYOUTS
 #include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h.inc"
 
+} // namespace nvidia_gpu
+} // namespace triton
 } // namespace mlir
+
 #endif // TRITON_DIALECT_TRITONNVIDIAGPU_TRANSFORMS_PASSES_H_
@@ -32,7 +32,7 @@ def TritonGPUPlanCTAPass : Pass<"triton-nvidia-gpu-plan-cta", "mlir::ModuleOp">
     and StoreLikeOps operations.
   }];
 
-  let constructor = "mlir::createTritonNvidiaGPUPlanCTAPass()";
+  let constructor = "mlir::triton::nvidia_gpu::createTritonNvidiaGPUPlanCTAPass()";
 
   let dependentDialects = [
     "mlir::triton::gpu::TritonGPUDialect",
@@ -48,8 +48,6 @@ def TritonGPUFenceInsertion : Pass<"triton-nvidia-gpu-fence-insertion", "mlir::M
     properly ordered across generic and async operations.
   }];
 
-  let constructor = "mlir::createTritonNvidiaGPUFenceInsertionPass()";
-
   let dependentDialects = [
     "mlir::triton::gpu::TritonGPUDialect",
     "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"
@@ -69,22 +67,18 @@ def TritonNvidiaGPUTMALoweringPass : Pass<"triton-nvidia-tma-lowering", "mlir::M
     Lower Triton experimental descriptor load to TMA load/store operations in TritonNvidiaGPUDialect.
   }];
 
-  let constructor = "mlir::createTritonNvidiaGPUTMALoweringPass()";
-
   let dependentDialects = [
     "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"
   ];
 }
 
-def TritionTensorMemoryAllocationPass : Pass<"triton-tensor-memory-allocation", "mlir::ModuleOp"> {
+def TritonTensorMemoryAllocationPass : Pass<"triton-tensor-memory-allocation", "mlir::ModuleOp"> {
   let summary = "Assign tensor memory allocation";
 
   let description = [{
     Decide on tensor memory allocation and assign attributes to each allocation.
   }];
 
-  let constructor = "mlir::createTensorMemoryAllocationPass()";
-
   let dependentDialects = [
     "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"
   ];
@@ -97,8 +91,6 @@ def TritonNvidiaGPUMMALoweringPass : Pass<"triton-nvidia-mma-lowering", "mlir::M
     Lower MMA ops to prepare for conversion to LLVM.
   }];
 
-  let constructor = "mlir::createTritonNvidiaGPUMMALoweringPass()";
-
   let dependentDialects = [
     "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"
   ];
@@ -111,8 +103,6 @@ def TritonNvidiaGPUPromoteLHSToTMemPass : Pass<"tritongpu-promote-lhs-to-tmem",
     Promote LHS operand of MMAv5 op to Tensor Memory.
   }];
 
-  let constructor = "mlir::createTritonNvidiaGPUPromoteLHSToTMemPass()";
-
   let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
                            "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
                            "mlir::triton::TritonDialect"];
 
@@ -3,6 +3,7 @@
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Types.h"
+#include "triton/Tools/LayoutUtils.h"
 
 using namespace mlir;
 using namespace mlir::triton;
@@ -421,6 +422,7 @@ struct MemDescSubviewOpConversion
   matchAndRewrite(triton::gpu::MemDescSubviewOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
+    auto *ctx = op->getContext();
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     auto srcTy = op.getSrc().getType();
     auto destTy = op.getResult().getType();
@@ -433,53 +435,42 @@ struct MemDescSubviewOpConversion
                                                    llvmElemTy, rewriter);
     auto smemStrides = smemObj.getStrides(srcTy, loc, rewriter);
     SmallVector<Value> opOffsetVals = op.getOffsets();
+    // We assume we always create a subview of the last dimensions
     SmallVector<Value> opSmemStrides(smemStrides.end() - opOffsetVals.size(),
                                      smemStrides.end());
+    // Compute total offset
     SmallVector<Value> offsetVals;
     auto destRank = op.getResult().getType().getRank();
     auto rankReduced = srcTy.getRank() - destRank;
     for (int i = rankReduced; i < opOffsetVals.size(); i++) {
       offsetVals.push_back(b.add(opOffsetVals[i], smemObj.getOffsets()[i]));
     }
+
     Value offset;
-    auto allocShape = srcTy.getAllocShape();
-    auto nvmmaEnc = dyn_cast<NVMMASharedEncodingAttr>(enc);
-    bool isSimpleSubview =
-        (!nvmmaEnc || allocShape.take_back(destRank) == destTy.getShape() ||
-         nvmmaEnc.getSwizzlingByteWidth() == 0);
-    if (!isSimpleSubview) {
-      assert(destRank >= 2 &&
-             "Shape size should be >= 2 when using NVMMAShared encoding");
-      auto swizzleStride = b.i32_val((nvmmaEnc.getSwizzlingByteWidth() * 8) /
-                                     llvmElemTy.getIntOrFloatBitWidth());
-      offset = b.i32_val(0);
-      for (auto i = 0; i < opOffsetVals.size() - 2; ++i) {
-        offset = b.add(offset, b.mul(opOffsetVals[i], opSmemStrides[i]));
-      }
-      // newOffset = offset - (stridedOff * swizzledStride + contigOff /
-      // swizzledStride * tileSize + contigOff % swizzledStride)
-      // + stridedInc * swizzledStride + contigInc / swizzledStride *
-      // tileSize + contigInc % swizzledStride
-      auto stridedDim = destRank - 1 - layoutOrder[0];
-      auto contigDim = destRank - 1 - layoutOrder[1];
-      auto stridedOff = smemObj.getOffsets()[stridedDim];
-      auto contigOff = smemObj.getOffsets()[contigDim];
-      auto stridedInc = offsetVals[stridedDim];
-      auto contigInc = offsetVals[contigDim];
-      int allocStridedDim = allocShape.size() - 1 - layoutOrder[0];
-      auto tileSize =
-          b.mul(b.i32_val(allocShape[allocStridedDim]), swizzleStride);
-      offset = b.sub(offset, b.mul(stridedOff, swizzleStride));
-      offset = b.sub(offset, b.mul(b.udiv(contigOff, swizzleStride), tileSize));
-      offset = b.sub(offset, b.urem(contigOff, swizzleStride));
-      offset = b.add(offset, b.mul(stridedInc, swizzleStride));
-      offset = b.add(offset, b.mul(b.udiv(contigInc, swizzleStride), tileSize));
-      offset = b.add(offset, b.urem(contigInc, swizzleStride));
-    } else {
-      // Compute the offset based on the original strides of the shared memory
-      // object
+    if (rankReduced || (destTy.getRank() == 1 && destTy.getDimSize(0) == 1)) {
+      // We are splitting the pipelining dimension which may not be a power of 2
+      // so we can't use LinearLayouts
       offset = dot(rewriter, loc, opOffsetVals, opSmemStrides);
+    } else {
+      auto dimNames = standardOutDimNames(ctx, opOffsetVals.size());
+      SmallVector<std::pair<StringAttr, Value>> logicalOffsets;
+      // This assumes the subviews are additive, in the sense that we can
+      // compute the offset of one and add it to the offset of the previous
+      // one we computed. We check for this in the verifier.
+      for (int i = 0; i < rankReduced; i++) {
+        logicalOffsets.push_back({dimNames[i], b.i32_val(0)});
+      }
+      for (int i = rankReduced; i < opOffsetVals.size(); i++) {
+        logicalOffsets.push_back({dimNames[i], offsetVals[i - rankReduced]});
+      }
+      // The order gives us the honest-to-goodness layout rank
+      auto srcAllocShape =
+          srcTy.getAllocShape().take_back(getOrder(srcTy).size());
+      auto llInv = toLinearLayout(srcAllocShape, srcTy.getEncoding()).invert();
+      offset =
+          applyLinearLayout(loc, rewriter, llInv, logicalOffsets)[0].second;
     }
+
     auto base = smemObj.getBase();
     auto elemPtrTy = base.getType();
     smemObj = SharedMemoryObject(b.gep(elemPtrTy, llvmElemTy, base, offset),