Skip to content

Commit a86e5a0

Browse files
Merge commit 'a19f32454271ff9565ab957834bdf1e5d4ddce57'
2 parents 64b232e + a19f324 commit a86e5a0

File tree

5 files changed

+40
-12
lines changed

5 files changed

+40
-12
lines changed

bin/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,6 @@ target_link_libraries(triton-reduce PRIVATE
5555
mlir_check_all_link_libraries(triton-reduce)
5656

5757
add_llvm_executable(triton-lsp triton-lsp.cpp PARTIAL_SOURCES_INTENDED)
58-
mlir_check_all_link_libraries(triton-lsp)
5958

6059
llvm_update_compile_flags(triton-lsp)
6160
target_link_libraries(triton-lsp PRIVATE

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -710,6 +710,7 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
710710
//
711711
def TT_ReduceOp: TT_Op<"reduce",
712712
[Pure,
713+
SameOperandsShape,
713714
SameOperandsEncoding,
714715
SingleBlock,
715716
DeclareOpInterfaceMethods<InferTypeOpInterface>]> {

lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp

Lines changed: 8 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -538,17 +538,14 @@ scheduleLoads(scf::ForOp forOp, tt::CoarseSchedule &schedule,
538538
if (loadOpToIndLevelAndUse.empty())
539539
return {};
540540

541-
for (auto iter = loadOpToIndLevelAndUse.begin();
542-
iter != loadOpToIndLevelAndUse.end();) {
543-
auto iterNext = iter + 1;
544-
if (std::get<1>(*iter) >= numStages - 1)
545-
// We assume loads with different dist are assigned to different stages.
546-
// If numStages is 2, we will have no stage available for indirect loads
547-
// with dist >= 1. In general, when dist is equal to numStages - 1, we
548-
// should not pipeline it.
549-
loadOpToIndLevelAndUse.erase(iter);
550-
iter = iterNext;
551-
}
541+
// We assume loads with different dist are assigned to different stages.
542+
// If numStages is 2, we will have no stage available for indirect loads
543+
// with dist >= 1. In general, when dist is equal to numStages - 1, we
544+
// should not pipeline it.
545+
auto it = llvm::remove_if(loadOpToIndLevelAndUse, [=](auto op) {
546+
return std::get<1>(op) >= numStages - 1;
547+
});
548+
loadOpToIndLevelAndUse.erase(it, loadOpToIndLevelAndUse.end());
552549

553550
// Check which loads are good for pipelining, and assign them
554551
// memory layouts.

lib/Tools/LinearLayout.cpp

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,24 @@
1616
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
1717
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
1818

19+
#if defined(_MSC_VER) && !defined(__clang__)
// MSVC does not provide GCC/Clang's __builtin_ctz* intrinsics; emulate them
// with the BitScanForward family so the count-trailing-zeros call sites below
// compile unchanged.
// Adapted from https://gist.github.com/pps83/3210a2f980fd02bb2ba2e5a1fc4a2ef0
#include <intrin.h>

// Count trailing zero bits of `x`.
// Precondition: x != 0. _BitScanForward leaves `r` unset when x == 0, which
// matches the GCC builtin, whose result is likewise undefined for zero input.
static int __builtin_ctz(unsigned x) {
  unsigned long r;
  _BitScanForward(&r, x);
  return static_cast<int>(r);
}

// 64-bit variant; same x != 0 precondition as above.
// NOTE(review): _BitScanForward64 exists only on 64-bit MSVC targets
// (x64/ARM64) — confirm 32-bit MSVC builds are not supported.
static int __builtin_ctzll(unsigned long long x) {
  unsigned long r;
  _BitScanForward64(&r, x);
  return static_cast<int>(r);
}

#endif
36+
1937
namespace mlir::triton {
2038

2139
namespace {

test/Triton/invalid.mlir

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,19 @@ tt.func public @fn(%v: tensor<4x128xf64>) {
108108

109109
// -----
110110

111+
// Negative test: tt.reduce now carries SameOperandsShape, so operands whose
// shapes differ (32x32x64 vs 16x32x64) must be rejected by the verifier.
tt.func @reduce_different_input_shapes(%arg0: tensor<32x32x64xf32>, %arg1: tensor<16x32x64xf32>) -> (tensor<32x64xf32>, tensor<16x64xf32>) {
  // expected-error @below {{op requires the same shape for all operands}}
  %0:2 = "tt.reduce" (%arg0, %arg1) <{axis = 1 : i32}> ({
  ^bb0(%acc0: f32, %acc1: f32, %cur0: f32, %cur1: f32):
    %1 = arith.addf %acc0, %cur0 : f32
    %2 = arith.addf %acc1, %cur1 : f32
    tt.reduce.return %1, %2 : f32, f32
  }) : (tensor<32x32x64xf32>, tensor<16x32x64xf32>) -> (tensor<32x64xf32>, tensor<16x64xf32>)
  tt.return %0#0, %0#1 : tensor<32x64xf32>, tensor<16x64xf32>
}
121+
122+
// -----
123+
111124
tt.func public @fn(%v: tensor<4x128xf32>) {
112125
// expected-error @+1 {{requires the same shape}}
113126
%a = "tt.scan" (%v) ({

0 commit comments

Comments (0)