Skip to content

Commit 12aa288

Browse files
authored
[BACKEND] Extended combiner regarding dot scaled ops (#9616)
When using `tl.dot_scaled`, changing the code from an explicit accumulator to Python's `+=` causes a big change in how many registers are used. In our tests, the `+=` version uses many more registers. This leads to lower occupancy, more pressure on memory bandwidth, and register spills. ### Version A (explicit `acc=acc`) — uses fewer registers ```python acc = tl.dot_scaled( a, a_scale, A_FMT, b, b_scale, B_FMT, acc=acc, out_dtype=tl.float32, ) ``` The generated `.amdgcn` code shows: ```asm .vgpr_count: 186 .vgpr_spill_count: 0 ``` - Much better performance ### Version B (`+=`) — uses more registers ```python acc += tl.dot_scaled( a, a_scale, A_FMT, b, b_scale, B_FMT, out_dtype=tl.float32, ) ``` The generated `.amdgcn` code shows: ```asm .vgpr_count: 256 .vgpr_spill_count: 45 ``` - Much worse performance ### Expected behavior Both versions do the same thing logically, so they should produce similar compiled code and use about the same number of registers. ### Comparison with `tl.dot` This problem does not happen with `tl.dot`. In that case, the compiler correctly detects the accumulation pattern and merges it, which avoids extra temporary values and keeps register usage low.
1 parent 756afc0 commit 12aa288

File tree

2 files changed

+38
-9
lines changed

2 files changed

+38
-9
lines changed

lib/Dialect/Triton/Transforms/Combine.cpp

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -231,17 +231,18 @@ class RankedReduceDescriptorLoads : public mlir::OpRewritePattern<ReshapeOp> {
231231
}
232232
};
233233

234-
template <typename OpTy>
235-
class CombineDotAddPattern : public mlir::OpRewritePattern<OpTy> {
234+
template <typename DotOpType, typename AddOpType>
235+
class CombineDotAddPattern : public mlir::OpRewritePattern<AddOpType> {
236236
public:
237-
using OpRewritePattern<OpTy>::OpRewritePattern;
237+
using OpRewritePattern<AddOpType>::OpRewritePattern;
238238

239239
mlir::LogicalResult
240-
matchAndRewrite(OpTy addOp, mlir::PatternRewriter &rewriter) const override {
241-
auto dotOp = addOp.getRhs().template getDefiningOp<DotOp>();
240+
matchAndRewrite(AddOpType addOp,
241+
mlir::PatternRewriter &rewriter) const override {
242+
auto dotOp = addOp.getRhs().template getDefiningOp<DotOpType>();
242243
bool isDotLHS = false;
243244
if (!dotOp) {
244-
dotOp = addOp.getLhs().template getDefiningOp<DotOp>();
245+
dotOp = addOp.getLhs().template getDefiningOp<DotOpType>();
245246
if (!dotOp) {
246247
return failure();
247248
}
@@ -252,7 +253,8 @@ class CombineDotAddPattern : public mlir::OpRewritePattern<OpTy> {
252253
}
253254
if (!isZero(dotOp.getC()))
254255
return failure();
255-
if constexpr (std::is_same_v<OpTy, arith::AddFOp>) {
256+
if constexpr (std::is_same_v<DotOpType, DotOp> &&
257+
std::is_same_v<AddOpType, arith::AddFOp>) {
256258
if (dotOp.getMaxNumImpreciseAcc() != 0) {
257259
return failure();
258260
}
@@ -270,8 +272,10 @@ class CombineDotAddPattern : public mlir::OpRewritePattern<OpTy> {
270272
// AddFOp(DotOp(a, b, c), d) and c==0 => DotOp(a, b, d)
271273
// AddIOp(d, DotOp(a, b, c)) and c==0 => DotOp(a, b, d)
272274
// AddFOp(d, DotOp(a, b, c)) and c==0 => DotOp(a, b, d)
273-
using CombineDotAddIPattern = CombineDotAddPattern<arith::AddIOp>;
274-
using CombineDotAddFPattern = CombineDotAddPattern<arith::AddFOp>;
275+
using CombineDotAddIPattern = CombineDotAddPattern<DotOp, arith::AddIOp>;
276+
using CombineDotAddFPattern = CombineDotAddPattern<DotOp, arith::AddFOp>;
277+
using CombineDotScaledAddFPattern =
278+
CombineDotAddPattern<DotScaledOp, arith::AddFOp>;
275279

276280
} // anonymous namespace
277281

@@ -284,6 +288,7 @@ class CombineOpsPass : public impl::TritonCombineOpsBase<CombineOpsPass> {
284288

285289
patterns.add<CombineDotAddIPattern>(context);
286290
patterns.add<CombineDotAddFPattern>(context);
291+
patterns.add<CombineDotScaledAddFPattern>(context);
287292
patterns.add<CombineSelectMaskedLoadPattern>(context);
288293
patterns.add<CombineAddPtrPattern>(context);
289294
patterns.add<CombineBroadcastMulReducePattern>(context);

test/Triton/combine.mlir

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,30 @@ tt.func @test_combine_dot_add_pattern() -> (tensor<128x128xf32>) {
4343
}
4444

4545

46+
// CHECK-LABEL: @test_combine_scale_dot_add_pattern
47+
tt.func @test_combine_scale_dot_add_pattern() -> (tensor<128x128xf32>) {
48+
// CHECK-DAG: %[[a:.*]] = arith.constant dense<1.000000e+00> : tensor<128x128xf8E5M2>
49+
// CHECK-DAG: %[[sa:.*]] = arith.constant dense<1> : tensor<128x4xi8>
50+
// CHECK-DAG: %[[b:.*]] = arith.constant dense<2.000000e+00> : tensor<128x128xf8E5M2>
51+
// CHECK-DAG: %[[sb:.*]] = arith.constant dense<2> : tensor<128x4xi8>
52+
// CHECK-DAG: %[[d:.*]] = arith.constant dense<3.000000e+00> : tensor<128x128xf32>
53+
%a = arith.constant dense<1.0> : tensor<128x128xf8E5M2>
54+
%sa = arith.constant dense<1> : tensor<128x4xi8>
55+
%b = arith.constant dense<2.0> : tensor<128x128xf8E5M2>
56+
%sb = arith.constant dense<2> : tensor<128x4xi8>
57+
%zero = arith.constant dense<0.0> : tensor<128x128xf32>
58+
%d = arith.constant dense<3.0> : tensor<128x128xf32>
59+
60+
%dot_out = tt.dot_scaled %a scale %sa, %b scale %sb, %zero lhs = e5m2 rhs = e5m2 {fastMath = false}
61+
: tensor<128x128xf8E5M2>, tensor<128x4xi8> * tensor<128x128xf8E5M2>, tensor<128x4xi8> -> tensor<128x128xf32>
62+
63+
// CHECK-NEXT: %[[res:.*]] = tt.dot_scaled %[[a]] scale %[[sa]], %[[b]] scale %[[sb]], %[[d]] lhs = e5m2 rhs = e5m2 {fastMath = false} : tensor<128x128xf8E5M2>, tensor<128x4xi8> * tensor<128x128xf8E5M2>, tensor<128x4xi8> -> tensor<128x128xf32>
64+
// CHECK-NEXT: tt.return %[[res]] : tensor<128x128xf32>
65+
%res = arith.addf %dot_out, %d : tensor<128x128xf32>
66+
tt.return %res : tensor<128x128xf32>
67+
}
68+
69+
4670
// CHECK-LABEL: @test_combine_dot_add_rev_pattern
4771
tt.func @test_combine_dot_add_rev_pattern() -> (tensor<128x128xf32>) {
4872
// CHECK-DAG: %[[d:.*]] = arith.constant dense<3.000000e+00> : tensor<128x128xf32>

0 commit comments

Comments
 (0)