
Commit bfc51a4

Merge commit '68aa962e67baa191cec5aac173255abdba80db1a'
2 parents (a405aa2 + 68aa962), commit bfc51a4

File tree: 7 files changed (+79, -101 lines)


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -17,6 +17,9 @@ python/triton/backends/
 !python/triton/backends/compiler.py
 !python/triton/backends/driver.py

+# Language extras
+python/triton/language/extra
+
 # Proton
 python/triton/profiler

lib/Analysis/AxisInfo.cpp

Lines changed: 5 additions & 0 deletions
@@ -278,6 +278,11 @@ class AddSubOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
 private:
   int64_t getContiguity(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
                         int dim) override {
+    // Contiguity assumes an increasing sequence. So for SubIOp contiguous
+    // RHS doesn't produce a contiguous result.
+    if (isa<arith::SubIOp>(op))
+      return gcd(lhs.getContiguity(dim), rhs.getConstancy(dim));
+
     return std::max(gcd(lhs.getConstancy(dim), rhs.getContiguity(dim)),
                     gcd(lhs.getContiguity(dim), rhs.getConstancy(dim)));
   }
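The rule the new branch encodes can be checked with a small arithmetic example. The following is an informal Python sketch of the gcd bookkeeping, not the analysis code itself: for a constant LHS and a contiguous RHS, lhs - rhs is a decreasing sequence, yet the symmetric max/gcd rule used for addition would still report it as contiguous.

from math import gcd

# Toy model of the AxisInfo contiguity rules (illustrative only; the real
# logic lives in AddSubOpAxisInfoVisitor above).
def add_rule(lhs_contig, lhs_const, rhs_contig, rhs_const):
    return max(gcd(lhs_const, rhs_contig), gcd(lhs_contig, rhs_const))

def sub_rule(lhs_contig, lhs_const, rhs_contig, rhs_const):
    # For a - b, only a constant RHS preserves an increasing run of the LHS.
    return gcd(lhs_contig, rhs_const)

# lhs = [512, 512, ..., 512] (contiguity 1, constancy 128)
# rhs = [0, 1, ..., 127]     (contiguity 128, constancy 1)
# lhs - rhs = [512, 511, ...] is decreasing, i.e. not contiguous.
print(add_rule(1, 128, 128, 1))  # 128: the old symmetric rule over-reports
print(sub_rule(1, 128, 128, 1))  # 1: the new SubIOp branch does not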

lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp

Lines changed: 45 additions & 84 deletions
@@ -219,8 +219,9 @@ static void createTMAAsyncCopy(
 // encodings, raise assertion, since incompatible shared encoding has been
 // handled in splitLoadsForIncompatible.
 static std::optional<ttg::SharedEncodingAttr>
-getSharedEncIfAllUsersAreDotEnc(Value val) {
+getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible) {
   ttg::SharedEncodingAttr attr;
+  incompatible = false;
   for (Operation *user : val.getUsers()) {
     ttg::SharedEncodingAttr tempAttr;
     if (user->getNumResults() != 1)
@@ -230,7 +231,8 @@ getSharedEncIfAllUsersAreDotEnc(Value val) {
       // First time we find a shared encoding in the chain, save it and try to
       // use it if it is compatible with the other users.
       tempAttr = cast<ttg::SharedEncodingAttr>(memDesc.getEncoding());
-      if (!getSharedEncIfAllUsersAreDotEnc(user->getResult(0)).has_value())
+      if (!getSharedEncIfAllUsersAreDotEnc(user->getResult(0), incompatible)
+               .has_value())
         return std::nullopt;
     } else {
       if (!isa<ttg::LocalLoadOp, ttg::ConvertLayoutOp>(user))
@@ -248,8 +250,10 @@ getSharedEncIfAllUsersAreDotEnc(Value val) {
           bitWidth, /*needTrans=*/false);
     }
     // Check that the shared encodings needed by the users are compatible.
-    if (attr != nullptr)
-      assert(attr == tempAttr && "incompatible shared encoding");
+    if (attr != nullptr && attr != tempAttr) {
+      incompatible = true;
+      return std::nullopt;
+    }
     attr = tempAttr;
   }
   return attr;
@@ -439,8 +443,44 @@ assignMemoryLayouts(llvm::SmallVector<std::tuple<Operation *, int, Operation *>>
         loadInfo.sharedEncoding =
             getSharedEncoding(op, /*loadIsMMAv3=*/true).value_or(nullptr);
       } else if (auto dot = dyn_cast<tt::DotOp>(use)) {
+        bool incompatible = false;
         loadInfo.sharedEncoding =
-            getSharedEncIfAllUsersAreDotEnc(op->getResult(0)).value_or(nullptr);
+            getSharedEncIfAllUsersAreDotEnc(op->getResult(0), incompatible)
+                .value_or(nullptr);
+        // If we can't agree on a shared encoding skip pipelinig the load.
+        if (incompatible)
+          continue;
+
+        // HACK: Triton LLVM codegen has a bug where local_loads from #shared to
+        // #mma layout can lead to invalid code if the loaded shape is smaller
+        // than the mma tile (e.g. loading a 128x1 tensor for an MMAv2 dot with
+        // tile {16,8} is bad because 1 < 8). To work around this, don't
+        // pipeline such loads.
+        //
+        // The codegen bug is caught by an assertion, so if you think you've
+        // fixed it, feel free to delete this code and see if the assert still
+        // fails. :)
+        if (!loadInfo.sharedEncoding) {
+          if (auto dotEnc = dyn_cast<ttg::NvidiaMmaEncodingAttr>(
+                  dot.getResult().getType().getEncoding())) {
+            auto loadTy = cast<RankedTensorType>(op->getResultTypes()[0]);
+            auto mmaInstrShape = dotEnc.getInstrShape();
+            if (loadTy.getRank() < mmaInstrShape.size())
+              continue;
+            bool ok = true;
+            for (int i = 0; i < mmaInstrShape.size(); i++) {
+              if (loadTy.getShape()[loadTy.getRank() - mmaInstrShape.size() +
+                                    i] < mmaInstrShape[i]) {
+                ok = false;
+                break;
+              }
+            }
+            // If this load might trigger the bug, don't do the fallback logic
+            // below, which might allow the load to be pipelined.
+            if (!ok)
+              continue;
+          }
+        }
       }
     } else if (auto loadOp = dyn_cast<tt::LoadOp>(use)) {
       // The use of this loadOp is another loadOp. If the use is not in the
@@ -476,83 +516,6 @@ assignMemoryLayouts(llvm::SmallVector<std::tuple<Operation *, int, Operation *>>
   return loadToInfo;
 }

-// Split users to groups, each group has the same shared encoding.
-// If not all users are Dot encoding, return empty vector.
-static DenseMap<ttg::SharedEncodingAttr, SmallVector<Operation *>>
-handleIncompatibleSharedEncoding(Operation *loadOp) {
-  DenseMap<ttg::SharedEncodingAttr, SmallVector<Operation *>> loadGroups;
-  // Go through transitive uses of the loadOp in the same block.
-  for (Operation *user : loadOp->getUsers()) {
-    if (user->getBlock() != loadOp->getBlock())
-      continue;
-    if (user->getNumResults() != 1)
-      return loadGroups;
-
-    ttg::SharedEncodingAttr tempAttr;
-    if (auto memDesc =
-            dyn_cast<triton::MemDescType>(user->getResult(0).getType())) {
-      tempAttr = cast<ttg::SharedEncodingAttr>(memDesc.getEncoding());
-      loadGroups[tempAttr].push_back(user);
-    } else {
-      if (!isa<ttg::LocalLoadOp, ttg::ConvertLayoutOp>(user))
-        return loadGroups;
-      auto dotOpEnc = dyn_cast<ttg::DotOperandEncodingAttr>(
-          cast<TensorOrMemDesc>(user->getResult(0).getType()).getEncoding());
-      if (!dotOpEnc)
-        return loadGroups;
-      auto srcTy = cast<TensorOrMemDesc>(loadOp->getResult(0).getType());
-      auto CTALayout = ttg::getCTALayout(srcTy.getEncoding());
-      auto order = ttg::getOrder(srcTy.getEncoding());
-      unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth();
-      tempAttr = ttg::SharedEncodingAttr::get(
-          loadOp->getContext(), dotOpEnc, srcTy.getShape(),
-          ttg::getOrder(srcTy.getEncoding()),
-          ttg::getCTALayout(srcTy.getEncoding()),
-          srcTy.getElementType().getIntOrFloatBitWidth(), /*needTrans=*/false);
-      loadGroups[tempAttr].push_back(user);
-    }
-  }
-  return loadGroups;
-}
-
-// Clone loads so each group of uses with same shared encoding will have a
-// corresponding Load.
-static void splitLoadsForIncompatible(
-    OpBuilder &builder, Operation *loadOp,
-    DenseMap<ttg::SharedEncodingAttr, SmallVector<Operation *>> &lGroups) {
-  // The first group will use the original load, create new loads for other
-  // groups.
-  unsigned idx = 0;
-  builder.setInsertionPointAfter(loadOp);
-  for (auto pair : lGroups) {
-    SmallVector<Operation *> &group = pair.second;
-    if (idx++ == 0)
-      continue;
-    Operation *newLoad = builder.clone(*loadOp);
-    for (auto *user : group) {
-      user->replaceUsesOfWith(loadOp->getResult(0), newLoad->getResult(0));
-    }
-  }
-}
-
-static void splitLoadsWithIncompatibleEncoding(scf::ForOp forOp) {
-  // Get the list of all loads.
-  SmallVector<Operation *> loads;
-  for (Operation &op : forOp.getBody()->without_terminator()) {
-    if (isa<tt::LoadOp, tt::ExperimentalDescriptorLoadOp>(op)) {
-      loads.push_back(&op);
-    }
-  }
-  OpBuilder builder(forOp);
-  for (auto *loadOp : loads) {
-    auto lGroups = handleIncompatibleSharedEncoding(loadOp);
-    LDBG("groups with different encoding: " << lGroups.size() << " "
-                                            << *loadOp);
-    if (lGroups.size() > 1)
-      splitLoadsForIncompatible(builder, loadOp, lGroups);
-  }
-}
-
 static llvm::MapVector<Operation *, LoadInfo>
 scheduleLoads(scf::ForOp forOp, tt::CoarseSchedule &schedule,
               DenseSet<Operation *> &rootUsers, int numStages) {
@@ -1106,8 +1069,6 @@ static void invalidateBarriers(OpBuilder &builder,

 bool mlir::triton::preProcessLoopAndGetSchedule(
     scf::ForOp &forOp, int numStages, mlir::triton::PipeliningOption &options) {
-  splitLoadsWithIncompatibleEncoding(forOp);
-
   // Schedule the loads and root ops (dot ops) in the loop. This will give us
   // a scaffold for the final schedule.
   DenseSet<Operation *> rootUsers;
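Taken together, these changes replace the old strategy of cloning loads per shared encoding (the deleted splitLoads* helpers) with simply skipping pipelining when the users disagree on a shared encoding. For intuition about the new MMA-tile guard, here is a rough Python sketch with a hypothetical helper name, not the C++ above: the trailing dimensions of the loaded tensor must each be at least as large as the MMA instruction shape, otherwise the load is left unpipelined.

def mma_tile_fits(load_shape, instr_shape):
    # Mirrors the guard added in assignMemoryLayouts: the last len(instr_shape)
    # dims of the load must cover the MMA tile. A 128x1 load against an MMAv2
    # instr shape of (16, 8) fails because 1 < 8.
    if len(load_shape) < len(instr_shape):
        return False  # the C++ code also skips pipelining in this case
    trailing = load_shape[len(load_shape) - len(instr_shape):]
    return all(t >= m for t, m in zip(trailing, instr_shape))

assert mma_tile_fits((128, 64), (16, 8))
assert not mma_tile_fits((128, 1), (16, 8))  # the dont_pipeline_128x1 case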

python/setup.py

Lines changed: 3 additions & 3 deletions
@@ -14,7 +14,7 @@
 from io import BytesIO
 from distutils.command.clean import clean
 from pathlib import Path
-from typing import NamedTuple
+from typing import List, NamedTuple

 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
@@ -32,8 +32,8 @@
 @dataclass
 class Backend:
     name: str
-    package_data: list[str]
-    language_package_data: list[str]
+    package_data: List[str]
+    language_package_data: List[str]
     src_dir: str
     backend_dir: str
     language_dir: str
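This presumably keeps setup.py importable on older interpreters (an assumption, the commit does not state the motivation): subscripting the built-in list (PEP 585) works at runtime only on Python 3.9+, while typing.List also works on 3.8. A minimal illustration:

from dataclasses import dataclass
from typing import List

@dataclass
class Backend:
    name: str
    package_data: List[str]    # fine on Python 3.8 and later
    # package_data: list[str]  # TypeError on 3.8 when the annotation is
    #                          # evaluated: 'type' object is not subscriptable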

python/test/regression/test_functional_regressions.py

Lines changed: 15 additions & 0 deletions
@@ -226,6 +226,21 @@ def grid(META):
     torch.testing.assert_close(torch_output, triton_output, rtol=1e-2, atol=1e-2)


+def test_reverse_range(device):
+
+    @triton.jit
+    def kernel(in_ptr, out_ptr):
+        x0 = tl.arange(0, 512)
+        tmp0 = tl.load(in_ptr + (512 - x0))
+        tl.store(out_ptr + x0, tmp0)
+
+    data = torch.randn((516, ), dtype=torch.float32, device=device)
+    res = torch.empty((512, ), dtype=torch.float32, device=device)
+    kernel[(1, )](data, res)
+    ref = torch.flip(data[1:513], [0])
+    assert (res == ref).all()
+
+
 @triton.jit
 def _triton_cummax_helper_fn(arg0_0, arg0_1, arg1_0, arg1_1):
     tmp0 = arg0_0 > arg1_0
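The new test appears to exercise the AxisInfo subtraction fix above: the offsets 512 - x0 decrease, so the load must not be treated as contiguous. A NumPy sketch of what the kernel computes (illustrative only, outside the Triton execution model):

import numpy as np

data = np.random.randn(516).astype(np.float32)
x0 = np.arange(512)
res = data[512 - x0]        # decreasing indices 512, 511, ..., 1
ref = data[1:513][::-1]     # same as torch.flip(data[1:513], [0])
assert np.array_equal(res, ref)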

test/Analysis/test-alignment.mlir

Lines changed: 4 additions & 2 deletions
@@ -97,10 +97,12 @@ tt.func @sub() {
   %1 = arith.constant dense<1> : tensor<128xi32>
   // CHECK-NEXT: contiguity = [128], divisibility = [1], constancy = [1], constant_value = <none>
   %2 = arith.subi %0, %1 : tensor<128xi32>
+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  %3 = arith.subi %1, %0 : tensor<128xi32>
   // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 129
-  %3 = arith.constant dense<129> : tensor<128xi32>
+  %4 = arith.constant dense<129> : tensor<128xi32>
   // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
-  %4 = arith.subi %3, %1 : tensor<128xi32>
+  %5 = arith.subi %4, %1 : tensor<128xi32>
   tt.return
 }

test/TritonGPU/loop-pipeline.mlir

Lines changed: 4 additions & 12 deletions
@@ -844,16 +844,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr<f16>, #blocked> -> tensor<64x16x!tt.ptr<f16>, #blocked>
     %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked>
     %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr<f16>, #blocked>, tensor<64x16xi32, #blocked>
-    // check that the load with incompatiable shared encoding gets cloned and feeds into uses with same encoding
-    // AMD-NOT: alloc
-    // AMD: scf.for
-    // CHECK: local_alloc
-    // CHECK: local_alloc
-    // CHECK: scf.for
-    // CHECK: local_load {{.*}} tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1
-    // CHECK: convert_layout {{.*}} tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0
-    // CHECK: tt.dot
-    // CHECK: tt.trans %arg
+    // check that the load didn't get pipelined.
+    // COMMON-NOT: alloc
+    // COMMON: scf.for
     %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 {
       %18 = tt.load %16 : tensor<64x16x!tt.ptr<f16>, #blocked>
       %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
@@ -1460,8 +1453,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 :
 // -----

 // COMMON-LABEL: @dont_pipeline_128x1
-// AMD-NOT: local_load{{.*}}128x1
-// CHECK: local_load{{.*}}128x1
+// COMMON-NOT: local_load{{.*}}128x1
 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
 #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
