
Commit cdf49bf

[BACKEND] Allow ranked reduced descriptor load (triton-lang#5880)
This allows us to implement efficient batch matmul with TMAs. Note that this could also be solved by allowing reshapes on the shared descriptor and propagating the layout, but for simplicity we currently handle it with a pattern match.
1 parent ae1a8f1 commit cdf49bf
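To make the change concrete, here is a minimal before/after sketch of the IR this enables, modeled on the lit test added in test/Triton/combine.mlir below (the %desc and %c0 value names are illustrative, not taken from the patch):

// Before: the descriptor load has to produce the descriptor's full block
// shape, and the unit leading dimension is dropped with an explicit reshape.
%l = tt.experimental_descriptor_load %desc[%c0, %c0, %c0] : !tt.tensordesc<tensor<1x128x64xf16>> -> tensor<1x128x64xf16>
%r = tt.reshape %l : tensor<1x128x64xf16> -> tensor<128x64xf16>

// After: the verifier accepts a rank-reduced result type as long as the
// dropped leading dimensions of the block are all 1, and the combine pass
// folds the reshape into the load itself.
%r = tt.experimental_descriptor_load %desc[%c0, %c0, %c0] : !tt.tensordesc<tensor<1x128x64xf16>> -> tensor<128x64xf16>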

File tree

4 files changed, +56 -2 lines changed

lib/Dialect/Triton/IR/Ops.cpp
lib/Dialect/Triton/Transforms/Combine.cpp
test/Triton/combine.mlir
test/Triton/invalid.mlir


lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 13 additions & 1 deletion
@@ -1237,7 +1237,19 @@ static LogicalResult verifyDesciptorLoadStoreType(Operation *op,
                                                   TensorDescType desc,
                                                   RankedTensorType tensor) {
   RankedTensorType block = desc.getBlockType();
-  if (block.getShape() == tensor.getShape() &&
+  ArrayRef<int64_t> blockShape = block.getShape();
+  ArrayRef<int64_t> tensorShape = tensor.getShape();
+  if (blockShape.size() > tensorShape.size()) {
+    // Allow ranked reduced load if the leading dimensions are all 1s.
+    for (int i = 0; i < blockShape.size() - tensorShape.size(); ++i) {
+      if (blockShape[i] != 1)
+        return op->emitOpError(
+            "ranked reduce load only allowed for unit dimension leading dim.");
+    }
+    blockShape = blockShape.take_back(tensorShape.size());
+  }
+
+  if (blockShape == tensorShape &&
       block.getElementType() == tensor.getElementType())
     return success();
   return op->emitOpError("tensor desciptor block and tensor types must match");

lib/Dialect/Triton/Transforms/Combine.cpp

Lines changed: 32 additions & 0 deletions
@@ -210,6 +210,37 @@ class CombineReshapeReducePatterns : public mlir::OpRewritePattern<ReshapeOp> {
   }
 };
 
+class RankedReduceDescriptorLoads : public mlir::OpRewritePattern<ReshapeOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(triton::ReshapeOp reshapeOp,
+                  mlir::PatternRewriter &rewriter) const override {
+    auto loadDef = reshapeOp.getSrc()
+                       .getDefiningOp<triton::ExperimentalDescriptorLoadOp>();
+    if (!loadDef || !loadDef->hasOneUse())
+      return failure();
+    int loadRank = loadDef.getType().getRank();
+    int reshapeRank = reshapeOp.getType().getRank();
+    if (!(reshapeRank < loadRank))
+      return failure();
+    ArrayRef<int64_t> loadShape = loadDef.getType().getShape();
+    ArrayRef<int64_t> reshapeShape = reshapeOp.getType().getShape();
+    for (int i = 0; i < loadRank - reshapeRank; ++i) {
+      // Only rank reduce unit dims.
+      if (loadShape[i] != 1)
+        return failure();
+    }
+    if (loadShape.take_back(reshapeRank) != reshapeShape)
+      return failure();
+    rewriter.modifyOpInPlace(
+        loadDef, [&]() { loadDef.getResult().setType(reshapeOp.getType()); });
+    rewriter.replaceOp(reshapeOp, loadDef.getResult());
+    return success();
+  }
+};
+
 class CombineOpsPass : public TritonCombineOpsBase<CombineOpsPass> {
 public:
   void runOnOperation() override {
@@ -227,6 +258,7 @@ class CombineOpsPass : public TritonCombineOpsBase<CombineOpsPass> {
     patterns.add<CombineAddPtrPattern>(context);
     patterns.add<CombineBroadcastMulReducePattern>(context);
     patterns.add<CombineReshapeReducePatterns>(context);
+    patterns.add<RankedReduceDescriptorLoads>(context);
 
     if (applyPatternsGreedily(m, std::move(patterns)).failed())
       signalPassFailure();

test/Triton/combine.mlir

Lines changed: 10 additions & 0 deletions
@@ -380,3 +380,13 @@ tt.func @test_reshape_reduce(%0: tensor<32x4x2xi32>) -> (i32, tensor<16xi32>) {
   %3 = tt.histogram %1 : tensor<256xi32> -> tensor<16xi32>
   tt.return %2, %3 : i32, tensor<16xi32>
 }
+
+// CHECK-LABEL: test_rank_reduce_desc_load
+tt.func @test_rank_reduce_desc_load(%0: !tt.tensordesc<tensor<1x128x64xf16>>) -> (tensor<128x64xf16>) {
+  %c0 = arith.constant 0 : i32
+  // CHECK: %[[R:.+]] = tt.experimental_descriptor_load {{.*}} : !tt.tensordesc<tensor<1x128x64xf16>> -> tensor<128x64xf16>
+  // CHECK: tt.return %[[R]]
+  %l = tt.experimental_descriptor_load %0[%c0, %c0, %c0] : !tt.tensordesc<tensor<1x128x64xf16>> -> tensor<1x128x64xf16>
+  %r = tt.reshape %l : tensor<1x128x64xf16> -> tensor<128x64xf16>
+  tt.return %r : tensor<128x64xf16>
+}

test/Triton/invalid.mlir

Lines changed: 1 addition & 1 deletion
@@ -409,7 +409,7 @@ tt.func @gather_op(%arg0: tensor<128x16xf32>, %arg1: tensor<512x4xi32>) {
 
 tt.func @invalid_desc_load(%arg0: !tt.tensordesc<tensor<16x16xf32>>) {
   %c = arith.constant 0 : i32
-  // expected-error @below {{tensor desciptor block and tensor types must match}}
+  // expected-error @below {{ranked reduce load only allowed for unit dimension leading dim}}
   tt.experimental_descriptor_load %arg0[%c, %c] : !tt.tensordesc<tensor<16x16xf32>> -> tensor<16xf32>
   tt.return
 }
