intel
diff --git a/‎.github/workflows/llvm-build.yml‎
Lines changed: 6 additions & 5 deletions b/‎.github/workflows/llvm-build.yml‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎include/triton/Dialect/Triton/IR/Dialect.h‎
Lines changed: 10 additions & 0 deletions b/‎include/triton/Dialect/Triton/IR/Dialect.h‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Passes.td‎
Lines changed: 32 additions & 0 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/Passes.td‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h‎
Lines changed: 3 additions & 0 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Schedule.h‎
Lines changed: 13 additions & 2 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/Schedule.h‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎lib/Dialect/Triton/IR/Traits.cpp‎
Lines changed: 6 additions & 49 deletions b/‎lib/Dialect/Triton/IR/Traits.cpp‎
Lines changed: 6 additions & 49 deletions
diff --git a/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 63 additions & 0 deletions b/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/CMakeLists.txt‎
Lines changed: 3 additions & 0 deletions b/‎lib/Dialect/TritonGPU/Transforms/CMakeLists.txt‎
Lines changed: 3 additions & 0 deletions
@@ -30,10 +30,9 @@ jobs:
         - {runner: 'Ubuntu 20.04 ARM64', runs_on: 'ubuntu-20.04', target-os: 'ubuntu', arch: 'arm64'}
         - {runner: 'CentOS 7', runs_on: ['self-hosted', 'CPU'], target-os: 'centos', arch: 'x64'}
         - {runner: 'AlmaLinux 8', runs_on: ['self-hosted', 'CPU'], target-os: 'almalinux', arch: 'x64'}
-        - {runner: 'MacOS X64', runs_on: 'macos-12', target-os: 'macos', arch: 'x64'}
-        - {runner: 'MacOS ARM64', runs_on: 'macos-12', target-os: 'macos', arch: 'arm64'}
-        # TODO(#2805): add back once the workflow works and runs in comparable time to the other ones
-        # - {runner: 'Windows Latest', runs_on: 'windows-latest', target-os: 'windows', arch: 'x64'}
+        - {runner: 'MacOS X64', runs_on: 'macos-13', target-os: 'macos', arch: 'x64'}
+        - {runner: 'MacOS ARM64', runs_on: 'macos-13', target-os: 'macos', arch: 'arm64'}
+        - {runner: 'Windows Latest', runs_on: 'windows-latest', target-os: 'windows', arch: 'x64'}
 
     steps:
 
@@ -127,7 +126,8 @@ jobs:
         -DLLVM_BUILD_TOOLS=ON
         -DLLVM_ENABLE_ASSERTIONS=ON
         -DMLIR_ENABLE_BINDINGS_PYTHON=ON
-        -DLLVM_ENABLE_PROJECTS="clang;mlir"
+        -DLLVM_ENABLE_PROJECTS="mlir;llvm"
+        -DLLVM_ENABLE_DIA_SDK=OFF
         -DLLVM_INSTALL_UTILS=ON
         -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
         -DLLVM_ENABLE_TERMINFO=OFF
@@ -300,6 +300,7 @@ jobs:
 
     - name: Upload LLVM Artifacts to Azure
       if: ${{ (github.repository == 'triton-lang/triton') }}
+      shell: bash -el {0}
       run: |
         az storage blob upload --account-name oaitriton --auth-mode login --container-name public --file "${{ env.llvm_install_dir }}.tar.gz" --name "llvm-builds/${{ env.llvm_install_dir }}.tar.gz" --overwrite
 
 
@@ -78,6 +78,16 @@ class DialectInferLayoutInterface
                                    Attribute operandEncodingB) const = 0;
 };
 
+/// Dialect interface for verifying a layout attribute against the ranked
+/// tensor type it is attached to. `OpTrait::impl::verifyTensorLayouts` looks
+/// this interface up on the dialect that owns the layout attribute and
+/// forwards the check to it.
+class DialectVerifyTensorLayoutInterface
+    : public DialectInterface::Base<DialectVerifyTensorLayoutInterface> {
+public:
+  DialectVerifyTensorLayoutInterface(Dialect *dialect) : Base(dialect) {}
+
+  /// Verifies `layout` as attached to tensor type `type` inside `module`.
+  /// `emitError` creates the diagnostic that a violation is reported on.
+  /// Returns the success or failure of the verification.
+  virtual LogicalResult
+  verifyTensorLayout(Attribute layout, RankedTensorType type, ModuleOp module,
+                     function_ref<InFlightDiagnostic()> emitError) const = 0;
+};
+
 } // namespace triton
 } // namespace mlir
 
 
@@ -23,6 +23,38 @@ def TritonGPUPipeline : Pass<"tritongpu-pipeline", "mlir::ModuleOp"> {
   ];
 }
 
+// Test-only pass, run on a module, that exercises latency assignment ahead of
+// pipelining. Its single option `num-stages` (default 3) is the number of
+// pipeline stages.
+def TritonGPUTestPipelineAssignLatencies : Pass<"tritongpu-test-pipeline-assign-latencies", "mlir::ModuleOp"> {
+  let summary = "test assigning latencies to interesting ops ahead of pipelining";
+
+  let description = [{
+    This is a test pass that tests `assignLatencies` method of `TritonGPULoopScheduling`.
+  }];
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
+                           "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
+                           "mlir::scf::SCFDialect",
+                           "mlir::arith::ArithDialect"];
+
+  let options = [
+    Option<"numStages", "num-stages",
+           "int32_t", /*default*/"3",
+           "number of pipeline stages">
+  ];
+}
+
+// Test-only pass, run on a module, that exercises loop scheduling for software
+// pipelining. It declares no options.
+def TritonGPUTestPipelineScheduleLoop : Pass<"tritongpu-test-pipeline-schedule-loop", "mlir::ModuleOp"> {
+  let summary = "test scheduling a loop for software pipelining";
+
+  let description = [{
+    This is a test pass that tests `scheduleLoop` method of `TritonGPULoopScheduling`.
+  }];
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
+                           "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
+                           "mlir::scf::SCFDialect",
+                           "mlir::arith::ArithDialect"];
+}
+
 def TritonGPUF32DotTC : Pass<"tritongpu-F32DotTC", "mlir::ModuleOp"> {
   let summary = "3xTF32 trick";
 
 
@@ -11,6 +11,9 @@ static const char *kNumStagesAttrName = "tt.num_stages";
 static const char *kLoopStageAttrName = "loop.stage";
 static const char *kLoopClusterAttrName = "loop.cluster";
 
+// NOTE(review): presumably reports whether `forOp` has a loop-carried
+// dependency with distance greater than one — confirm against the definition.
+bool loopHasDistGreaterThanOne(scf::ForOp forOp);
+// NOTE(review): presumably reports whether `forOp` contains a nested loop —
+// confirm against the definition.
+bool isOuterLoop(scf::ForOp forOp);
+
 /// Function to mask operations during scheduling.
 Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred);
 
 
@@ -11,6 +11,18 @@
 namespace mlir {
 namespace triton {
 
+namespace gpu {
+
+/// Discover operations that should become async and assign latencies to them
+/// based on the numStages value provided by the user.
+///
+/// \param moduleOp  module whose operations are examined.
+/// \param numStages number of pipeline stages requested by the user.
+/// \returns a map from each selected operation to its assigned latency.
+DenseMap<Operation *, int> assignLatencies(ModuleOp moduleOp, int numStages);
+
+/// Schedule the loop based on the latencies assigned to the operations.
+///
+/// \param forOp     loop to schedule.
+/// \param opLatency latency assigned to each operation, keyed by operation.
+void scheduleLoop(scf::ForOp forOp,
+                  const DenseMap<Operation *, int> &opLatency);
+
+} // namespace gpu
+
 /// This fills out the pipelining options including schedule and annotations
 /// for wait ops. This also does pre-processing by converting some of the
 /// loads into async loads so that the IR is ready to be pipelined.
@@ -108,8 +120,7 @@ class CoarseSchedule {
 
 // Add dependencies of anchor ops to the coarse schedule. Schedule them to
 // the same stage and ordering cluster as the anchor op.
-void scheduleDependencies(scf::ForOp forOp, CoarseSchedule &schedule,
-                          int numStages);
+void scheduleDependencies(scf::ForOp forOp, CoarseSchedule &schedule);
 
 } // namespace triton
 } // namespace mlir
 
@@ -5,11 +5,9 @@
 #include "mlir/IR/TypeUtilities.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
-#include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace mlir;
-namespace ttg = mlir::triton::gpu;
 
 static LogicalResult verifySameEncoding(Type typeA, Type typeB,
                                         bool allowTensorPointerType) {
@@ -118,53 +116,12 @@ LogicalResult OpTrait::impl::verifyTensorLayouts(Operation *op) {
     if (!layout)
       return success();
 
-    if (isa<ttg::SharedEncodingAttr>(layout))
-      return makeErr() << "Shared layout is not allowed on tensor type.";
-    // TODO(jlebar): Currently this only checks blocked layouts, but other
-    // layouts also have invariants!
-
-    // TODO(jlebar): Handle the case when the encoding is nested within tt.ptr.
-    if (auto blocked = dyn_cast<ttg::BlockedEncodingAttr>(layout)) {
-      // A different verifier should have checked that the layout itself is
-      // valid, including that threads-per-warp has the same rank as
-      // warps-per-block etc.
-      auto layoutRank = blocked.getThreadsPerWarp().size();
-      if (layoutRank != rankedTy.getRank()) {
-        return makeErr() << layout << ".\nLayout has rank " << layoutRank
-                         << ", but the tensor it's attached to has rank "
-                         << rankedTy.getRank() << ".";
-      }
-
-      int moduleThreadsPerWarp =
-          ttg::TritonGPUDialect::getThreadsPerWarp(module);
-      int64_t layoutThreadsPerWarp = product(blocked.getThreadsPerWarp());
-      if (layoutThreadsPerWarp != moduleThreadsPerWarp) {
-        return makeErr() << layout << ".\nLayout has a total of "
-                         << layoutThreadsPerWarp
-                         << " threads per warp, but the module specifies "
-                         << moduleThreadsPerWarp << " threads per warp.";
-      }
-
-      int moduleWarpsPerCTA = ttg::TritonGPUDialect::getNumWarps(module);
-      int64_t layoutWarpsPerCTA = product(blocked.getWarpsPerCTA());
-      if (layoutWarpsPerCTA != moduleWarpsPerCTA) {
-        return makeErr() << layout << ".\nLayout has a total of "
-                         << layoutWarpsPerCTA
-                         << " warps per CTA, but the module specifies "
-                         << moduleWarpsPerCTA << " warps per CTA.";
-      }
-
-      if (blocked.getCTALayout().getCTAsPerCGA().size() > 0) {
-        int moduleCTAsPerCGA = ttg::TritonGPUDialect::getNumCTAs(module);
-        int64_t layoutCTAsPerCGA =
-            product(blocked.getCTALayout().getCTAsPerCGA());
-        if (layoutCTAsPerCGA != moduleCTAsPerCGA) {
-          return makeErr() << layout << ".\nLayout has a total of "
-                           << layoutCTAsPerCGA
-                           << " CTAs per CGA, but the module specifies "
-                           << moduleCTAsPerCGA << " CTAs per CGA.";
-        }
-      }
+    Dialect &dialect = layout.getDialect();
+    auto verifyLayoutInterface =
+        dyn_cast<mlir::triton::DialectVerifyTensorLayoutInterface>(&dialect);
+    if (verifyLayoutInterface) {
+      return verifyLayoutInterface->verifyTensorLayout(layout, rankedTy, module,
+                                                       makeErr);
     }
 
     return success();
 
@@ -3058,6 +3058,68 @@ struct TritonGPUInferLayoutInterface
   }
 };
 
+/// TritonGPU implementation of the tensor-layout verification interface.
+///
+/// Rejects shared layouts on tensor types. For blocked layouts it checks that
+/// the layout agrees with the tensor's rank and with the module's
+/// threads-per-warp and warps-per-CTA values, and, when the layout's
+/// CTAs-per-CGA list is non-empty, with the module's CTA count. Every other
+/// layout is accepted without further checks.
+struct TritonGPUVerifyTensorLayoutInterface
+    : public triton::DialectVerifyTensorLayoutInterface {
+  using DialectVerifyTensorLayoutInterface::DialectVerifyTensorLayoutInterface;
+
+  /// Verifies `layout` as attached to `rankedTy` inside `module`.
+  ///
+  /// \param layout   layout attribute to verify.
+  /// \param rankedTy tensor type the layout is attached to.
+  /// \param module   module supplying the expected threads-per-warp,
+  ///                 warps-per-CTA and CTAs-per-CGA values.
+  /// \param makeErr  creates the diagnostic a violation is reported on.
+  /// \returns failure (with a diagnostic) on the first violated invariant,
+  ///          success otherwise.
+  LogicalResult verifyTensorLayout(
+      Attribute layout, RankedTensorType rankedTy, ModuleOp module,
+      function_ref<InFlightDiagnostic()> makeErr) const override {
+    if (isa<triton::gpu::SharedEncodingAttr>(layout))
+      return makeErr() << "Shared layout is not allowed on tensor type.";
+    // TODO(jlebar): Currently this only checks blocked layouts, but other
+    // layouts also have invariants!
+
+    // TODO(jlebar): Handle the case when the encoding is nested within tt.ptr.
+    auto blocked = dyn_cast<triton::gpu::BlockedEncodingAttr>(layout);
+    if (!blocked)
+      return success();
+
+    // A different verifier should have checked that the layout itself is
+    // valid, including that threads-per-warp has the same rank as
+    // warps-per-block etc.
+    // Hold the rank as a signed value so the comparison with the tensor rank
+    // does not mix signed and unsigned operands.
+    const int64_t layoutRank =
+        static_cast<int64_t>(blocked.getThreadsPerWarp().size());
+    if (layoutRank != rankedTy.getRank()) {
+      return makeErr() << layout << ".\nLayout has rank " << layoutRank
+                       << ", but the tensor it's attached to has rank "
+                       << rankedTy.getRank() << ".";
+    }
+
+    int moduleThreadsPerWarp =
+        triton::gpu::TritonGPUDialect::getThreadsPerWarp(module);
+    int64_t layoutThreadsPerWarp = product(blocked.getThreadsPerWarp());
+    if (layoutThreadsPerWarp != moduleThreadsPerWarp) {
+      return makeErr() << layout << ".\nLayout has a total of "
+                       << layoutThreadsPerWarp
+                       << " threads per warp, but the module specifies "
+                       << moduleThreadsPerWarp << " threads per warp.";
+    }
+
+    int moduleWarpsPerCTA = triton::gpu::TritonGPUDialect::getNumWarps(module);
+    int64_t layoutWarpsPerCTA = product(blocked.getWarpsPerCTA());
+    if (layoutWarpsPerCTA != moduleWarpsPerCTA) {
+      return makeErr() << layout << ".\nLayout has a total of "
+                       << layoutWarpsPerCTA
+                       << " warps per CTA, but the module specifies "
+                       << moduleWarpsPerCTA << " warps per CTA.";
+    }
+
+    // The CTA count is only compared when the layout lists CTAs per CGA.
+    if (blocked.getCTALayout().getCTAsPerCGA().size() > 0) {
+      int moduleCTAsPerCGA = triton::gpu::TritonGPUDialect::getNumCTAs(module);
+      int64_t layoutCTAsPerCGA =
+          product(blocked.getCTALayout().getCTAsPerCGA());
+      if (layoutCTAsPerCGA != moduleCTAsPerCGA) {
+        return makeErr() << layout << ".\nLayout has a total of "
+                         << layoutCTAsPerCGA
+                         << " CTAs per CGA, but the module specifies "
+                         << moduleCTAsPerCGA << " CTAs per CGA.";
+      }
+    }
+
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // Canonicalizer
 //===----------------------------------------------------------------------===//
@@ -3798,6 +3860,7 @@ void TritonGPUDialect::initialize() {
       >();
   addInterfaces<TritonGPUOpAsmInterface>();
   addInterfaces<TritonGPUInferLayoutInterface>();
+  addInterfaces<TritonGPUVerifyTensorLayoutInterface>();
 
   RankedTensorType::attachInterface<TensorModel>(*getContext());
   MemDescType::attachInterface<MemDescModel>(*getContext());
 
@@ -8,9 +8,12 @@ add_triton_library(TritonGPUTransforms
   OptimizeAccumulatorInit.cpp
   OptimizeDotOperands.cpp
   OptimizeThreadLocality.cpp
+  Pipeliner/AssignLatencies.cpp
   Pipeliner/MatmulLoopPipeline.cpp
   Pipeliner/OuterLoopPipeline.cpp
   Pipeliner/PipelineExpander.cpp
+  Pipeliner/TestPipelineAssignLatencies.cpp
+  Pipeliner/TestPipelineScheduleLoop.cpp
   Pipeliner/SoftwarePipeliner.cpp
   Pipeliner/TMAStoresPipeline.cpp
   Pipeliner/PipeliningUtility.cpp