
Commit 0ce5d77

Restore maxNumImpreciseAcc guard for AddFOp (triton-lang#8056)
This restores the legacy guard from Combine.td: we only fold `addf(dot, bias)` into `dot(..., C=bias)` when `maxNumImpreciseAcc == 0`. Without this, `use_fast_accum=False` kernels were silently rewritten into the fast-accum form, causing accuracy drift in FP8 tests. This change ensures precise accumulation semantics are preserved while keeping the optimization enabled when imprecise accumulation is allowed.

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because existing tests should cover it.
- Select one of the following.
  - [ ] I have not added any `lit` tests.
  - [x] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent 6f06595 commit 0ce5d77
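
For orientation, here is a minimal before/after sketch of the rewrite this guard constrains, written in the same IR as the tests below. The operand names (`%a`, `%b`, `%bias`) and the shapes are illustrative assumptions, not taken from the change: the combine pass folds a zero-accumulator `tt.dot` followed by `arith.addf` into a single `tt.dot` that accumulates into the bias, and with this patch the fold is skipped whenever `maxNumImpreciseAcc` is nonzero.

```mlir
// Illustrative only: %a, %b, and %bias are assumed f32 tensors defined elsewhere.

// Before: a zero-initialized dot followed by a bias add.
%zero = arith.constant dense<0.0> : tensor<128x128xf32>
%acc = tt.dot %a, %b, %zero {maxNumImpreciseAcc = 0 : i32}
    : tensor<128x128xf32> * tensor<128x128xf32> -> tensor<128x128xf32>
%res = arith.addf %acc, %bias : tensor<128x128xf32>

// After the fold (applied only because maxNumImpreciseAcc == 0):
// the bias becomes the dot accumulator and the addf disappears.
%res = tt.dot %a, %b, %bias {maxNumImpreciseAcc = 0 : i32}
    : tensor<128x128xf32> * tensor<128x128xf32> -> tensor<128x128xf32>
```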

File tree

2 files changed: +41 −0 lines


lib/Dialect/Triton/Transforms/Combine.cpp

Lines changed: 5 additions & 0 deletions
@@ -252,6 +252,11 @@ class CombineDotAddPattern : public mlir::OpRewritePattern<OpTy> {
     }
     if (!isZero(dotOp.getC()))
       return failure();
+    if constexpr (std::is_same_v<OpTy, arith::AddFOp>) {
+      if (dotOp.getMaxNumImpreciseAcc() != 0) {
+        return failure();
+      }
+    }
     rewriter.modifyOpInPlace(dotOp, [&] {
       dotOp.getCMutable().assign(isDotLHS ? addOp.getRhs() : addOp.getLhs());
       dotOp->moveBefore(addOp);

test/Triton/combine.mlir

Lines changed: 36 additions & 0 deletions
@@ -413,3 +413,39 @@ tt.func @test_rank_reduce_desc_load(%0: !tt.tensordesc<tensor<1x128x64xf16>>) ->
     %r = tt.reshape %l : tensor<1x128x64xf16> -> tensor<128x64xf16>
     tt.return %r : tensor<128x64xf16>
 }
+
+// CHECK-LABEL: @test_combine_dot_add_no_fold_when_imprecise_allowed
+tt.func @test_combine_dot_add_no_fold_when_imprecise_allowed() -> (tensor<128x128xf32>) {
+    // CHECK-DAG: %[[D:.*]] = arith.constant dense<3.000000e+00> : tensor<128x128xf32>
+    %a = arith.constant dense<1.0> : tensor<128x128xf32>
+    %b = arith.constant dense<2.0> : tensor<128x128xf32>
+    %zero = arith.constant dense<0.0> : tensor<128x128xf32>
+    %d = arith.constant dense<3.0> : tensor<128x128xf32>
+
+    %dot_out = tt.dot %a, %b, %zero {maxNumImpreciseAcc = 1 : i32}
+        : tensor<128x128xf32> * tensor<128x128xf32> -> tensor<128x128xf32>
+
+    // CHECK: arith.addf %{{.*}}, %[[D]] : tensor<128x128xf32>
+    // CHECK-NEXT: tt.return %{{.*}} : tensor<128x128xf32>
+    %res = arith.addf %dot_out, %d : tensor<128x128xf32>
+    tt.return %res : tensor<128x128xf32>
+}
+
+// CHECK-LABEL: @test_combine_dot_add_fold_when_precise_required
+tt.func @test_combine_dot_add_fold_when_precise_required() -> (tensor<128x128xf32>) {
+    // CHECK-DAG: %[[D:.*]] = arith.constant dense<3.000000e+00> : tensor<128x128xf32>
+    // CHECK-DAG: %[[B:.*]] = arith.constant dense<2.000000e+00> : tensor<128x128xf32>
+    // CHECK-DAG: %[[A:.*]] = arith.constant dense<1.000000e+00> : tensor<128x128xf32>
+    %a = arith.constant dense<1.0> : tensor<128x128xf32>
+    %b = arith.constant dense<2.0> : tensor<128x128xf32>
+    %zero = arith.constant dense<0.0> : tensor<128x128xf32>
+    %d = arith.constant dense<3.0> : tensor<128x128xf32>
+
+    %dot_out = tt.dot %a, %b, %zero {maxNumImpreciseAcc = 0 : i32}
+        : tensor<128x128xf32> * tensor<128x128xf32> -> tensor<128x128xf32>
+
+    // CHECK-NEXT: %[[RES:.*]] = tt.dot %[[A]], %[[B]], %[[D]] : tensor<128x128xf32> * tensor<128x128xf32> -> tensor<128x128xf32>
+    // CHECK-NEXT: tt.return %[[RES]] : tensor<128x128xf32>
+    %res = arith.addf %dot_out, %d : tensor<128x128xf32>
+    tt.return %res : tensor<128x128xf32>
+}
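
Usage note: these FileCheck cases are driven by combine.mlir's existing lit RUN line, which this diff does not show. The invocation sketched below is an assumption about what that line looks like, based on the standard `triton-opt` test driver; the exact pass flags may differ.

```mlir
// Assumed RUN line at the top of test/Triton/combine.mlir (not part of this diff):
// RUN: triton-opt %s -split-input-file -canonicalize -triton-combine | FileCheck %s
```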
