
Commit c4c8bac

[BACKEND] Allow mixing of linear and legacy layouts in split/join (#6028)
Relax the verifier of split/join to allow linear and legacy layouts to be mixed: the operand and result encodings no longer need to be attribute-identical, only structurally equivalent once both are converted to linear layouts.
Parent: e24d693

6 files changed (+32 −25 lines)

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 2 additions & 2 deletions

```diff
@@ -505,7 +505,7 @@ def TT_CatOp : TT_Op<"cat", [NoMemoryEffect,

 def TT_JoinOp : TT_Op<"join", [
     NoMemoryEffect, SameTypeOperands,
-    DeclareOpInterfaceMethods<InferTypeOpInterface>,
+    InferTypeOpWithLayoutEquivalence,
 ]> {
   let summary = "join two tensors along a new, minor dimension";
   let description = [{
@@ -523,7 +523,7 @@ def TT_JoinOp : TT_Op<"join", [

 def TT_SplitOp : TT_Op<"split", [
     NoMemoryEffect,
-    DeclareOpInterfaceMethods<InferTypeOpInterface>,
+    InferTypeOpWithLayoutEquivalence,
     TypesMatchWith<"outLHS and outRHS types match",
                    "outLHS", "outRHS", "$_self">,
 ]> {
```

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 5 additions & 19 deletions

```diff
@@ -1027,17 +1027,9 @@ LogicalResult ReturnOp::verify() {
 // -- JoinOp --
 LogicalResult
 JoinOp::inferReturnTypes(MLIRContext *context, std::optional<Location> location,
-                         ValueRange operands, DictionaryAttr attributes,
-                         OpaqueProperties properties, RegionRange regions,
+                         JoinOp::Adaptor adaptor,
                          SmallVectorImpl<Type> &inferredReturnTypes) {
-  // These should have been checked by tablegen-generated code.
-  assert(operands.size() == 2);
-  assert(operands[0].getType() == operands[1].getType());
-  assert(isa<RankedTensorType>(operands[0].getType()));
-  assert(isa<RankedTensorType>(operands[1].getType()));
-
-  Value lhs = operands[0];
-  auto srcTy = cast<RankedTensorType>(lhs.getType());
+  auto srcTy = cast<RankedTensorType>(adaptor.getLhs().getType());

   SmallVector<int64_t> retShape(srcTy.getShape());
   retShape.push_back(2);
@@ -1058,15 +1050,9 @@ JoinOp::inferReturnTypes(MLIRContext *context, std::optional<Location> location,

 // -- SplitOp --
 LogicalResult SplitOp::inferReturnTypes(
-    MLIRContext *context, std::optional<Location> location, ValueRange operands,
-    DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions,
-    SmallVectorImpl<Type> &inferredReturnTypes) {
-  // These should have been checked by tablegen-generated code.
-  assert(operands.size() == 1);
-  assert(isa<RankedTensorType>(operands[0].getType()));
-
-  Value src = operands[0];
-  auto srcTy = cast<RankedTensorType>(src.getType());
+    MLIRContext *context, std::optional<Location> location,
+    SplitOp::Adaptor adaptor, SmallVectorImpl<Type> &inferredReturnTypes) {
+  auto srcTy = cast<RankedTensorType>(adaptor.getSrc().getType());
   auto srcShape = srcTy.getShape();

   if (srcShape.empty() || srcShape.back() != 2) {
```
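
Both hooks now take the tablegen-generated Adaptor instead of the raw `ValueRange`/`DictionaryAttr`/`OpaqueProperties`/`RegionRange` bundle. The adaptor exposes typed, named operand accessors (`getLhs()`, `getSrc()`), and the operand-count and operand-type invariants are enforced by generated code before the hook runs, which is why the manual asserts could be dropped. As a reading aid, here is the JoinOp hook reassembled from the hunk above; this is a sketch, and everything past the visible lines (the result-encoding inference) is elided, not reproduced:

```cpp
// Sketch reassembled from the diff above -- not verbatim source.
LogicalResult
JoinOp::inferReturnTypes(MLIRContext *context, std::optional<Location> location,
                         JoinOp::Adaptor adaptor,
                         SmallVectorImpl<Type> &inferredReturnTypes) {
  // getLhs() replaces operands[0]; operand count and type equality are
  // already enforced by tablegen-generated code before this runs.
  auto srcTy = cast<RankedTensorType>(adaptor.getLhs().getType());

  // The result keeps the operand shape and gains a minor dimension of size 2.
  SmallVector<int64_t> retShape(srcTy.getShape());
  retShape.push_back(2);

  // ... result-encoding inference and the push into inferredReturnTypes are
  // outside this hunk and unchanged by this commit ...
  return success();
}
```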

lib/Dialect/Triton/IR/Traits.cpp

Lines changed: 2 additions & 1 deletion

```diff
@@ -21,7 +21,8 @@ LogicalResult OpTrait::impl::verifyEquivalentType(Type typeA, Type typeB) {
   auto shapeB = tensorTypeB.getShape();
   if (shapeA != shapeB)
     return failure();
-
+  if (tensorTypeA.getElementType() != tensorTypeB.getElementType())
+    return failure();
   // If there's no encoding or the encodings are the same
   if (encodingA == encodingB)
     return success();
```
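
The new element-type check closes a gap: two ranked tensor types with identical shapes but different element types (say `f32` vs `f16`) previously fell straight through to the encoding comparison and could be reported as equivalent. Since split/join now accept any layout that is merely equivalent to the inferred one, the helper has to reject element-type mismatches explicitly. A condensed sketch of the check order after this commit, using the names from the hunk (the casts producing `tensorTypeA`/`tensorTypeB` sit above the visible context):

```cpp
// Condensed sketch of verifyEquivalentType's check order -- not verbatim.
if (shapeA != shapeB)
  return failure();   // shapes must match
if (tensorTypeA.getElementType() != tensorTypeB.getElementType())
  return failure();   // element types must match (added by this commit)
if (encodingA == encodingB)
  return success();   // identical (or both absent) encodings
// ... otherwise fall through to the structural encoding comparison ...
```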

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 2 additions & 0 deletions

```diff
@@ -2802,6 +2802,8 @@ struct TritonGPUInferLayoutInterface
     if (expected == got) {
       return success();
     }
+    if (!expected || !got)
+      return failure();
     // Check whether the encodings are structurally the same.
     auto expectedLL = triton::gpu::toLinearLayout(shape, expected);
     auto gotLL = triton::gpu::toLinearLayout(shape, got);
```
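
This two-line guard is what keeps the relaxed check well-defined: after the early-equality test, exactly one of `expected`/`got` may still be a null attribute (an encoding-less tensor compared against an encoded one), and `toLinearLayout` should not be handed a null encoding. Once both are known to be present, legacy encodings (blocked, slice, mma, ...) and `#ttg.linear` encodings are lowered to the same `LinearLayout` representation, so a legacy/linear mix can be compared structurally. A condensed sketch of the resulting flow; the final comparison line is my assumption about the unshown tail of the function, not verbatim source:

```cpp
if (expected == got)
  return success();  // attribute-identical (or both null) encodings
if (!expected || !got)
  return failure();  // only one side carries an encoding: not equivalent,
                     // and converting a null encoding would be invalid
// Both present: lower each encoding (legacy or linear) to LinearLayout
// and compare the resulting index mappings structurally.
auto expectedLL = triton::gpu::toLinearLayout(shape, expected);
auto gotLL = triton::gpu::toLinearLayout(shape, got);
return expectedLL == gotLL ? success() : failure();
```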

test/Triton/invalid.mlir

Lines changed: 3 additions & 3 deletions

```diff
@@ -170,11 +170,11 @@ tt.func public @fn(%arg0: tensor<32xf32, #blocked>) {
 // -----

 // Bad order; should be [1,0]
-#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1,2], threadsPerWarp = [32,1], warpsPerCTA = [1,1], order = [0,1]}>
 module attributes {"ttg.target" = "cuda:80", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} {
 tt.func public @fn(%arg0: tensor<32xf32, #blocked>) {
-  // expected-error @+2 {{order}}
+  // expected-error @+2 {{incompatible with return type(s) of operation}}
   // expected-error @+1 {{op failed to infer returned types}}
   %a = tt.join %arg0, %arg0 : tensor<32xf32, #blocked> -> tensor<32x2xf32, #blocked1>
   tt.return
@@ -215,7 +215,7 @@ tt.func public @fn(%arg0: tensor<2xf32>) {

 // -----

-#blocked = #ttg.blocked<{sizePerThread = [1,1,2], threadsPerWarp = [1,32,1], warpsPerCTA = [1,1,1], order = [2,0,1]}>
+#blocked = #ttg.blocked<{sizePerThread = [1,2,2], threadsPerWarp = [1,32,1], warpsPerCTA = [1,1,1], order = [2,0,1]}>
 // Bad order, should be [1,0].
 #blocked1 = #ttg.blocked<{sizePerThread = [1,1], threadsPerWarp = [1,32], warpsPerCTA = [1,1], order = [1,0]}>

```

test/TritonGPU/ops.mlir

Lines changed: 18 additions & 0 deletions

```diff
@@ -189,3 +189,21 @@ tt.func @function_no_scope() {
 }

 }
+
+// -----
+
+// CHECK-DAG: [[$BLOCKED:#.*]] = #ttg.blocked
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+// CHECK-DAG: [[$LINEAR:#.*]] = #ttg.linear
+#linear = #ttg.linear<{register = [[0, 1], [16, 0], [32, 0], [64, 0]], lane = [[0, 0], [0, 0], [0, 0], [1, 0], [2, 0]], warp = [[4, 0], [8, 0]], block = []}>
+
+module attributes {"ttg.num-warps" = 4 : i32} {
+// CHECK-LABEL: @split_join_linear_mix
+tt.func @split_join_linear_mix(%arg: tensor<128x2xf32, #linear>) attributes {"ttg.num-warps" = 8 : i32} {
+  // CHECK-NEXT: tt.split %{{.*}} : tensor<128x2xf32, [[$LINEAR]]> -> tensor<128xf32, #ttg.slice<{dim = 1, parent = [[$BLOCKED]]}>>
+  %lhs, %rhs = tt.split %arg : tensor<128x2xf32, #linear> -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
+  // CHECK-NEXT: tt.join %{{.*}}, %{{.*}} : tensor<128xf32, #ttg.slice<{dim = 1, parent = [[$BLOCKED]]}>> -> tensor<128x2xf32, [[$LINEAR]]>
+  %j = tt.join %lhs, %rhs : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x2xf32, #linear>
+  tt.return
+}
+}
```
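
How to read the `#ttg.linear` encoding in this test (my reading of the basis vectors, not something stated by the commit): each basis gives the tensor coordinate contributed by one bit of the corresponding hardware index, and contributions combine by XOR.

- register = [[0,1],[16,0],[32,0],[64,0]]: register bit 0 selects column 1; bits 1-3 select rows 16, 32, 64.
- lane = [[0,0],[0,0],[0,0],[1,0],[2,0]]: lane bits 0-2 contribute nothing (the value is replicated across those lanes); bits 3-4 select rows 1 and 2.
- warp = [[4,0],[8,0]]: warp bits select rows 4 and 8.

Together these span rows 0-127 and columns 0-1, i.e. exactly `tensor<128x2xf32>`, while the split results use a plain `#ttg.slice` of a blocked layout; the test round-trips split and join across that legacy/linear boundary, which the old verifier would have rejected.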
