Skip to content

Commit 9a2d7a5

Browse files
Merge commit 'cea35daf3578767f45db9904f8437ed96d2dfaa8'
2 parents d5b7656 + cea35da commit 9a2d7a5

File tree

22 files changed: +2292 −91 lines

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
8585
// TritonAMDGPUTransforms passes
8686
mlir::registerTritonAMDGPUAccelerateMatmul();
8787
mlir::registerTritonAMDGPUOptimizeEpilogue();
88+
mlir::registerTritonAMDGPUBypassLDSForDotOperand();
8889
mlir::registerTritonAMDGPUReorderInstructions();
8990
mlir::registerTritonAMDGPUBlockPingpong();
9091
mlir::registerTritonAMDGPUStreamPipeline();

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,22 @@ def TritonGPUTestPipelineScheduleLoop : Pass<"tritongpu-test-pipeline-schedule-l
5555
"mlir::arith::ArithDialect"];
5656
}
5757

58+
def TritonGPUFuseNestedLoops : Pass<"tritongpu-fuse-nested-loops", "mlir::ModuleOp"> {
59+
let summary = "fuse nested loops for pipelining";
60+
61+
let description = [{
62+
The `tritongpu-fuse-nested-loops` pass will analyze loop nests in the module
63+
that need to be pipelined and fuse them into a single loop. This composes
64+
with the pipeliner to pipeline loop nests.
65+
}];
66+
67+
let dependentDialects = [
68+
"mlir::triton::gpu::TritonGPUDialect",
69+
"mlir::arith::ArithDialect",
70+
"mlir::ub::UBDialect",
71+
];
72+
}
73+
5874
def TritonGPUF32DotTC : Pass<"tritongpu-F32DotTC", "mlir::ModuleOp"> {
5975
let summary = "3xTF32 trick";
6076

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,8 @@ enum class MMALoadType {
205205
};
206206
MMALoadType getMMALoadType(Operation *loadOp);
207207

208+
// Convert \param op operands and results to layout \param encoding.
209+
void convertOpEncoding(Attribute encoding, Operation *op);
208210
} // namespace mlir
209211

210212
#endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
3131
"TRITON_ENABLE_LLVM_DEBUG",
3232
"TRITON_HIP_STREAM_PREFETCH",
3333
"TRITON_HIP_USE_BLOCK_PINGPONG",
34+
"TRITON_HIP_BYPASS_LDS_FOR_DOT",
3435
"TRITON_LLVM_DEBUG_ONLY",
3536
"TRITON_ENABLE_ASAN",
3637
"TRITON_OVERRIDE_ARCH",

lib/Conversion/TritonGPUToLLVM/CMakeLists.txt

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,25 @@
11
add_triton_library(TritonGPUToLLVM
22
ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp
33
DotOpToLLVM/FMA.cpp
4-
GlobalScratchMemoryAllocation.cpp
5-
TypeConverter.cpp
6-
Utility.cpp
7-
ElementwiseOpToLLVM.cpp
8-
MemoryOpToLLVM.cpp
4+
AllocateSharedMemory.cpp
95
AssertOpToLLVM.cpp
10-
ViewOpToLLVM.cpp
11-
MakeRangeOpToLLVM.cpp
6+
ControlFlowOpToLLVM.cpp
7+
ConvertLayoutOpToLLVM.cpp
8+
DecomposeUnsupportedConversions.cpp
9+
ElementwiseOpToLLVM.cpp
10+
FuncOpToLLVM.cpp
11+
GatherOpToLLVM.cpp
12+
GlobalScratchMemoryAllocation.cpp
1213
HistogramOpToLLVM.cpp
13-
AllocateSharedMemory.cpp
14+
MakeRangeOpToLLVM.cpp
15+
MemoryOpToLLVM.cpp
16+
PrintOpToLLVM.cpp
1417
ReduceOpToLLVM.cpp
1518
ScanOpToLLVM.cpp
16-
GatherOpToLLVM.cpp
17-
ConvertLayoutOpToLLVM.cpp
18-
ControlFlowOpToLLVM.cpp
19-
FuncOpToLLVM.cpp
2019
SPMDOpToLLVM.cpp
21-
DecomposeUnsupportedConversions.cpp
22-
PrintOpToLLVM.cpp
20+
TypeConverter.cpp
21+
Utility.cpp
22+
ViewOpToLLVM.cpp
2323

2424
DEPENDS
2525
TritonGPUConversionPassIncGen

lib/Dialect/TritonGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ add_triton_library(TritonGPUTransforms
22
AccelerateMatmul.cpp
33
Coalesce.cpp
44
F32DotTC.cpp
5+
FuseNestedLoops.cpp
56
CombineTensorSelectAndIf.cpp
67
LoopScheduling.cpp
78
ReduceDataDuplication.cpp

lib/Dialect/TritonGPU/Transforms/Coalesce.cpp

Lines changed: 1 addition & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -104,55 +104,6 @@ struct CoalescePass : public impl::TritonGPUCoalesceBase<CoalescePass> {
104104
threadsPerWarp, CTALayout);
105105
}
106106

107-
static Type getNewType(Type type, Attribute encoding) {
108-
RankedTensorType tensorType = cast<RankedTensorType>(type);
109-
return RankedTensorType::get(tensorType.getShape(),
110-
tensorType.getElementType(), encoding);
111-
}
112-
113-
void coalesceOp(Attribute encoding, Operation *op) {
114-
OpBuilder builder(op);
115-
// Convert operands
116-
// For load/store with tensor pointers, we don't have to change the
117-
// operands' type, we do this by changing the outputs' type of
118-
// `make_tensor_ptr`
119-
SmallVector<Value, 4> newArgs;
120-
for (auto operand : op->getOperands()) {
121-
auto tensorType = dyn_cast<RankedTensorType>(operand.getType());
122-
if (tensorType &&
123-
!isa<triton::gpu::SharedEncodingAttr>(tensorType.getEncoding())) {
124-
Type newType = getNewType(tensorType, encoding);
125-
newArgs.push_back(builder.create<triton::gpu::ConvertLayoutOp>(
126-
op->getLoc(), newType, operand));
127-
} else {
128-
newArgs.push_back(operand);
129-
}
130-
}
131-
132-
// Convert output types
133-
SmallVector<Type, 4> newTypes;
134-
for (auto t : op->getResultTypes()) {
135-
bool isAsync = isa<triton::gpu::AsyncCopyGlobalToLocalOp>(op);
136-
newTypes.push_back(isAsync ? t : getNewType(t, encoding));
137-
}
138-
139-
// Construct new op with the new encoding
140-
Operation *newOp =
141-
builder.create(op->getLoc(), op->getName().getIdentifier(), newArgs,
142-
newTypes, op->getAttrs());
143-
144-
// Cast the results back to the original layout
145-
for (size_t i = 0; i < op->getNumResults(); i++) {
146-
Value newResult = newOp->getResult(i);
147-
if (newTypes[i] != op->getResultTypes()[i]) {
148-
newResult = builder.create<triton::gpu::ConvertLayoutOp>(
149-
op->getLoc(), op->getResult(i).getType(), newResult);
150-
}
151-
op->getResult(i).replaceAllUsesWith(newResult);
152-
}
153-
op->erase();
154-
}
155-
156107
void runOnOperation() override {
157108
// Run axis info analysis
158109
ModuleOp moduleOp = getOperation();
@@ -187,7 +138,7 @@ struct CoalescePass : public impl::TritonGPUCoalesceBase<CoalescePass> {
187138
// 4. Convert the output of this new memory op back to L1
188139
// 5. Replace all the uses of the original memory op by the new one
189140
for (auto &kv : layoutMap) {
190-
coalesceOp(kv.second, kv.first);
141+
convertOpEncoding(kv.second, kv.first);
191142
}
192143
}
193144
};

0 commit comments

Comments (0)