Commit db2155c

Add relative and absolute tolerance as command-line arguments
Differential Revision: D80137287
Pull Request resolved: #329
1 parent 5d3279c commit db2155c
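
This commit threads `--rtol` and `--atol` from the command line into the `accuracy` metric's `torch.testing.assert_close` calls. A hypothetical invocation (the entry-point script and the `--op`/`--metrics` flags are assumed from tritonbench's usual CLI and are not part of this diff):

    python run.py --op gemm --metrics accuracy --rtol 1e-3 --atol 1e-3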

4 files changed: +65 −15 lines

tritonbench/operators/gemm/operator.py

Lines changed: 11 additions & 3 deletions
@@ -313,7 +313,9 @@ def pt2_matmul_maxautotune(self, a, b, bias) -> Callable:
 
     @register_benchmark(enabled=not is_cuda())
     def streamk_matmul(self, a, b, bias) -> Callable:
-        return lambda: streamk_amd_matmul(a, b, bias) if bias else streamk_amd_matmul(a, b)
+        return (
+            lambda: streamk_amd_matmul(a, b, bias) if bias else streamk_amd_matmul(a, b)
+        )
 
     @register_benchmark(enabled=is_cuda())
     def streamk_matmul(self, a, b, bias) -> Callable:
@@ -322,8 +324,14 @@ def streamk_matmul(self, a, b, bias) -> Callable:
         b = b.T.contiguous()
         baseline = streamk_cuda_matmul(a, b)
         if not torch.allclose(streamk, baseline):
-            print(f"StreamK matmul on {a.shape} x {b.shape} result does not match baseline matmul result. Max abs(streamk/baseline - 1): {torch.max(torch.abs(streamk / baseline - 1))}")
-        return lambda: streamk_cuda_matmul(a, b) + bias if bias else streamk_cuda_matmul(a, b)
+            print(
+                f"StreamK matmul on {a.shape} x {b.shape} result does not match baseline matmul result. Max abs(streamk/baseline - 1): {torch.max(torch.abs(streamk / baseline - 1))}"
+            )
+        return (
+            lambda: streamk_cuda_matmul(a, b) + bias
+            if bias
+            else streamk_cuda_matmul(a, b)
+        )
 
     @register_benchmark(enabled=is_cuda())
     def pt2_cutlass_matmul(self, a, b, bias) -> Callable:
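
Note that the mismatch message above reports the largest element-wise relative deviation rather than an absolute difference. A minimal standalone sketch of the same check (tensor values are illustrative):

import torch

streamk = torch.tensor([1.0, 2.001, 3.0])
baseline = torch.tensor([1.0, 2.0, 3.0])
if not torch.allclose(streamk, baseline):
    # largest element-wise relative deviation, as the message above reports
    print(torch.max(torch.abs(streamk / baseline - 1)))  # ≈ 5e-04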

tritonbench/operators/gemm/stream_k.py

Lines changed: 18 additions & 8 deletions
@@ -390,6 +390,7 @@ def streamk_amd_matmul(a, b, bias=None):
     # print(a @ b)
     return c
 
+
 def _matmul_launch_metadata(grid, kernel, args):
     ret = {}
     M, N, K = args["M"], args["N"], args["K"]
@@ -406,19 +407,26 @@ def _matmul_launch_metadata(grid, kernel, args):
 def matmul_get_configs(pre_hook=None):
     return [
         triton.Config(
-            {"BLOCK_M": BM, "BLOCK_N": BN, "BLOCK_K": BK, "SK_BLOCK_K": skBK, "GROUP_M": 8},
+            {
+                "BLOCK_M": BM,
+                "BLOCK_N": BN,
+                "BLOCK_K": BK,
+                "SK_BLOCK_K": skBK,
+                "GROUP_M": 8,
+            },
             num_stages=s,
             num_warps=w,
             pre_hook=pre_hook,
         )  #
         for BM in [128, 256]  #
         for BN in [128, 256]  #
         for BK in [32, 64, 128]  #
-        for skBK in [16, 32, 64, 128] #
+        for skBK in [16, 32, 64, 128]  #
         for s in ([2, 3, 4])  #
         for w in [4, 8]  #
     ]
 
+
 def matmul_tma_set_block_size_hook(nargs):
     BLOCK_M = nargs["BLOCK_M"]
     BLOCK_N = nargs["BLOCK_N"]
@@ -431,6 +439,7 @@ def matmul_tma_set_block_size_hook(nargs):
     nargs["a_desc_sk"].block_shape = [BLOCK_M, SK_BLOCK_K]
     nargs["b_desc_sk"].block_shape = [BLOCK_N, SK_BLOCK_K]
 
+
 @triton.autotune(
     configs=matmul_get_configs(pre_hook=matmul_tma_set_block_size_hook),
     key=["M", "N", "K"],
@@ -494,7 +503,6 @@ def streamk_cuda_gemm(
     total_ddp_tiles = num_pid - NUM_SMS
     streamk_sms = NUM_SMS
 
-
     # ----------------------------------------------------------------------------
     # DDP phase
     # ----------------------------------------------------------------------------
@@ -534,12 +542,12 @@
 
     # `evenly` distribute work units across SMs, with rem tiles assigned contiguously to the first rem programs
     base = total_work_units // streamk_sms
-    rem = total_work_units % streamk_sms
+    rem = total_work_units % streamk_sms
     work = tl.where(worker_id < rem, base + 1, base)
     start = tl.where(
         worker_id < rem,
         worker_id * (base + 1),
-        rem * (base + 1) + (worker_id - rem) * base
+        rem * (base + 1) + (worker_id - rem) * base,
    )
    end = start + work - 1
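The `base`/`rem` arithmetic in this hunk is a standard balanced partition: every worker gets `base` work units, and the first `rem` workers get one extra, assigned contiguously. A plain-Python sketch of the same index math (values are illustrative; the kernel computes this per-program with `tl.where`):

total_work_units, streamk_sms = 10, 4
base, rem = divmod(total_work_units, streamk_sms)
for worker_id in range(streamk_sms):
    work = base + 1 if worker_id < rem else base
    start = (
        worker_id * (base + 1)
        if worker_id < rem
        else rem * (base + 1) + (worker_id - rem) * base
    )
    end = start + work - 1
    print(worker_id, start, end)  # workers 0..3 get units 0-2, 3-5, 6-7, 8-9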

@@ -567,7 +575,9 @@
 
     # compute the start and end K index on this tile for this work unit
     curr_st_k = tl.where(curr_tile == st_tile_streamk, st_k_streamk, 0)
-    curr_en_k = tl.where(curr_tile == en_tile_streamk, en_k_streamk, work_units_per_tile - 1)
+    curr_en_k = tl.where(
+        curr_tile == en_tile_streamk, en_k_streamk, work_units_per_tile - 1
+    )
 
     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
 
@@ -590,6 +600,7 @@
     # NOTE: known correctness issue with atomic_add
     c_desc.atomic_add([offs_am, offs_bn], c)
 
+
 def streamk_cuda_matmul(a, b):
     assert a.dtype == b.dtype, "Incompatible dtypes"
 
@@ -624,7 +635,6 @@ def grid(META):
         streamk_sms = num_sms
         return (total_ddp_tiles + streamk_sms,)
 
-
     streamk_cuda_gemm[grid](
         a_desc,
         b_desc,
@@ -636,6 +646,6 @@ def grid(META):
         K,  #
         FP8_OUTPUT=dtype == torch.float8_e4m3fn,  #
         ENABLE_BUFFER_OPS_ASSUMES=True,  #
-        NUM_SMS=num_sms #
+        NUM_SMS=num_sms  #
     )
     return c
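
`NUM_SMS` sizes both the data-parallel grid and the Stream-K worker pool; `num_sms` is computed earlier in `streamk_cuda_matmul`, outside this diff. A sketch of the usual way such a value is queried (assumes a CUDA device is present; the actual code in stream_k.py may differ):

import torch

# number of streaming multiprocessors (SMs) on the current CUDA device
num_sms = torch.cuda.get_device_properties(0).multi_processor_count
print(num_sms)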

tritonbench/utils/parser.py

Lines changed: 12 additions & 0 deletions
@@ -246,6 +246,18 @@ def get_parser(args=None):
         default=None,
         help="Name of group for benchmarking.",
     )
+    parser.add_argument(
+        "--rtol",
+        type=float,
+        default=None,
+        help="Relative tolerance for accuracy metric.",
+    )
+    parser.add_argument(
+        "--atol",
+        type=float,
+        default=None,
+        help="Absolute tolerance for accuracy metric.",
+    )
 
     # A/B Testing parameters
     parser.add_argument(
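
Both flags default to `None`, so existing runs are unaffected unless they are passed explicitly. A minimal sketch of how they surface (assumes `get_parser` returns an argparse parser and that no other flags are required on this path):

from tritonbench.utils.parser import get_parser

args = get_parser().parse_args(["--rtol", "1e-3", "--atol", "1e-3"])
assert args.rtol == 1e-3 and args.atol == 1e-3  # both default to None when omitted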

tritonbench/utils/triton_op.py

Lines changed: 24 additions & 4 deletions
@@ -1145,14 +1145,34 @@ def accuracy(self, fn: Callable, baseline_fn: Callable) -> bool:
         baseline_output = baseline_fn()
         try:
             if self.mode == Mode.FWD:
-                torch.testing.assert_close(output, baseline_output)
+                torch.testing.assert_close(
+                    output,
+                    baseline_output,
+                    rtol=self.tb_args.rtol,
+                    atol=self.tb_args.atol,
+                )
             elif self.mode == Mode.BWD:
-                torch.testing.assert_close(output.grad, baseline_output.grad)
+                torch.testing.assert_close(
+                    output.grad,
+                    baseline_output.grad,
+                    rtol=self.tb_args.rtol,
+                    atol=self.tb_args.atol,
+                )
             else:
                 fwd_output, loss = output
                 baseline_fwd_output, baseline_loss = baseline_output
-                torch.testing.assert_close(fwd_output, baseline_fwd_output)
-                torch.testing.assert_close(loss.grad, baseline_loss.grad)
+                torch.testing.assert_close(
+                    fwd_output,
+                    baseline_fwd_output,
+                    rtol=self.tb_args.rtol,
+                    atol=self.tb_args.atol,
+                )
+                torch.testing.assert_close(
+                    loss.grad,
+                    baseline_loss.grad,
+                    rtol=self.tb_args.rtol,
+                    atol=self.tb_args.atol,
+                )
             return True
         except Exception:
             # either the output tensor or the loss grad tensor does not match
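
A note on semantics: `torch.testing.assert_close` treats `rtol=None, atol=None` like omitting both and falls back to dtype-based defaults, and it raises if only one of the two is specified, so the new flags should be passed together. A small illustration:

import torch

a = torch.tensor([1.0, 2.0])
b = torch.tensor([1.0001, 2.0001])

try:
    # rtol=None, atol=None behaves like omitting both: dtype-based defaults
    # (float32: rtol=1.3e-6, atol=1e-5), too tight for a 1e-4 difference
    torch.testing.assert_close(a, b, rtol=None, atol=None)
except AssertionError:
    pass

# explicit looser tolerances, as --rtol/--atol now allow, make the check pass
torch.testing.assert_close(a, b, rtol=1e-3, atol=1e-3)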
