 from torch import Tensor
 from torch._inductor import config, utils
 from torch._inductor.test_case import run_tests, TestCase
+from torch._inductor.utils import run_and_get_code
 from torch.testing._internal.common_cuda import (
     PLATFORM_SUPPORTS_FP8,
     PLATFORM_SUPPORTS_MX_GEMM,

     HAS_CPU,
     HAS_CUDA_AND_TRITON,
 )
+from torch.testing._internal.jit_utils import FileCheck
 from torch.utils._triton import has_triton_tma_device

@@ -465,6 +467,86 @@ def linear(x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale, bias):
         # setting a small absolute tolerance in these tests
         torch.testing.assert_close(y_eager, y_compiled, rtol=1e-2, atol=0.05)

+    @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
+    @unittest.skipIf(
+        not has_triton_tma_device(), "Need device-side TMA support in Triton"
+    )
+    @parametrize("dtype", (torch.bfloat16, torch.float32))
+    @parametrize("shape", ("16,32,32", "1024,1024,512"))
+    @parametrize("use_fast_accum", (False, True))
+    def test_tensorwise_scaling_tma_template(
+        self,
+        dtype: torch.dtype,
+        shape: str,
+        use_fast_accum: bool,
+    ):
+        device = "cuda"
+        dtype_float8 = torch.float8_e4m3fn
+        dtype_float8 = _fix_fp8_dtype_for_rocm(dtype_float8, device)
+
+        shape = [int(dim) for dim in shape.split(",")]
+        M, K, N = shape  # Matmul Y = X [M, K] x W [N, K]
+        # input and output dtypes of _scaled_mm do not need to be the same, but
+        # typically in a model they are
+        x = torch.randn(M, K, dtype=dtype, device=device)
+        w = torch.randn(N, K, dtype=dtype, device=device)
+        bias = None
+
+        # quantize weight (prior to inference)
+        w_fp8, w_inverse_scale = _quantize_tensorwise(w, dtype_float8)
+        w_t_fp8 = w_fp8.t()
+
+        # quantize input x
+        x_fp8, x_inverse_scale = _quantize_tensorwise(x, dtype_float8)
+
+        def linear(x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale, bias):
+            y = torch._scaled_mm(
+                x_fp8,
+                w_t_fp8,
+                x_inverse_scale,
+                w_inverse_scale,
+                bias,
+                out_dtype=dtype,
+                use_fast_accum=use_fast_accum,
+            )
+            return y
+
+        y_eager = linear(
+            x_fp8,
+            x_inverse_scale,
+            w_t_fp8,
+            w_inverse_scale,
+            bias,
+        )
+        with config.patch(
+            {
+                "triton.enable_persistent_tma_matmul": True,
+                "test_configs.autotune_choice_name_regex": "triton_scaled_mm_device_tma",
+                "max_autotune_gemm_backends": "TRITON",
+                "max_autotune": True,
+            }
+        ):
+            linear_compiled = torch.compile(
+                linear, backend="inductor", mode="max-autotune"
+            )
+            y_compiled, code = run_and_get_code(
+                linear_compiled,
+                x_fp8,
+                x_inverse_scale,
+                w_t_fp8,
+                w_inverse_scale,
+                bias,
+            )
+
+        FileCheck().check("SCALING_ROWWISE : tl.constexpr = False").run(code[0])
+        self.assertEqual(y_eager.dtype, dtype)
+        self.assertEqual(y_compiled.dtype, dtype)
+        # depending on the kernel config (BLOCK_M size, etc) selected during Inductor
+        # autotuning for the compiled case, the results can be different because of
+        # the way blocks of results are accumulated (float addition not associative), so
+        # setting a small absolute tolerance in these tests
+        torch.testing.assert_close(y_eager, y_compiled, rtol=1e-2, atol=0.05)
+
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
     @parametrize("shape", ("16,16,32", "16,32,32", "1024,1024,512"))
     @parametrize("has_bias", (False, True))

@@ -531,6 +613,81 @@ def linear(x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale, bias):
         self.assertEqual(y_compiled.dtype, dtype)
         torch.testing.assert_close(y_eager, y_compiled, rtol=1e-2, atol=0.05)

+    @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
+    @unittest.skipIf(
+        not has_triton_tma_device(), "Need device-side TMA support in Triton"
+    )
+    @parametrize("shape", ("16,32,32", "1024,1024,512"))
+    @parametrize("use_fast_accum", (False, True))
+    def test_rowwise_scaling_tma_template(
+        self,
+        shape: str,
+        use_fast_accum: bool,
+    ):
+        # Only bf16 output type is supported for row-wise scaling, not fp32
+        dtype: torch.dtype = torch.bfloat16
+        device = "cuda"
+        dtype_float8 = torch.float8_e4m3fn
+        dtype_float8 = _fix_fp8_dtype_for_rocm(dtype_float8, device)
+
+        shape = [int(dim) for dim in shape.split(",")]
+        M, K, N = shape  # Matmul Y = X [M, K] x W [N, K]
+        x = torch.randn(M, K, dtype=dtype, device=device)
+        w = torch.randn(N, K, dtype=dtype, device=device)
+        bias = None
+
+        # quantize weight (prior to inference)
+        w_fp8, w_inverse_scale = _quantize_rowwise(w, dtype_float8)
+        w_t_fp8 = w_fp8.t()
+        w_inverse_scale = w_inverse_scale.t()  # scale_b should be (1, N)
+
+        # quantize input x
+        x_fp8, x_inverse_scale = _quantize_rowwise(x, dtype_float8)
+
+        def linear(x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale, bias):
+            y = torch._scaled_mm(
+                x_fp8,
+                w_t_fp8,
+                x_inverse_scale,
+                w_inverse_scale,
+                bias,
+                out_dtype=dtype,
+                use_fast_accum=use_fast_accum,
+            )
+            return y
+
+        y_eager = linear(
+            x_fp8,
+            x_inverse_scale,
+            w_t_fp8,
+            w_inverse_scale,
+            bias,
+        )
+        with config.patch(
+            {
+                "triton.enable_persistent_tma_matmul": True,
+                "test_configs.autotune_choice_name_regex": "triton_scaled_mm_device_tma",
+                "max_autotune_gemm_backends": "TRITON",
+                "max_autotune": True,
+            }
+        ):
+            linear_compiled = torch.compile(
+                linear, backend="inductor", mode="max-autotune"
+            )
+            y_compiled, code = run_and_get_code(
+                linear_compiled,
+                x_fp8,
+                x_inverse_scale,
+                w_t_fp8,
+                w_inverse_scale,
+                bias,
+            )
+
+        FileCheck().check("SCALING_ROWWISE : tl.constexpr = True").run(code[0])
+        self.assertEqual(y_eager.dtype, dtype)
+        self.assertEqual(y_compiled.dtype, dtype)
+        torch.testing.assert_close(y_eager, y_compiled, rtol=1e-2, atol=0.05)
+
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
     @parametrize("M", (1, 3, 33, 257, 1024))
     @parametrize("K", (16, 32, 1024))
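
Note: the _quantize_tensorwise and _quantize_rowwise helpers called by the new tests are defined earlier in this test file and are not part of the diff. As a minimal, hypothetical sketch of what such helpers typically do, assuming an amax-based scale and float32 inverse scales (the _sketch names and exact formulas below are illustrative, not the file's actual implementation):

import torch

def _quantize_tensorwise_sketch(x: torch.Tensor, float8_dtype: torch.dtype):
    # One scale for the whole tensor: map the largest magnitude onto the
    # fp8 format's maximum representable value, then cast to fp8.
    finfo = torch.finfo(float8_dtype)
    scale = (finfo.max / x.abs().max().clamp(min=1e-12)).to(torch.float32)
    x_fp8 = (x.to(torch.float32) * scale).clamp(-finfo.max, finfo.max).to(float8_dtype)
    # torch._scaled_mm expects the inverse scale as a float32 tensor
    return x_fp8, scale.reciprocal()

def _quantize_rowwise_sketch(x: torch.Tensor, float8_dtype: torch.dtype):
    # One scale per row, kept as an (M, 1) column so it lines up with
    # _scaled_mm's row-wise scale_a; the test transposes the weight's
    # scale to (1, N) before passing it as scale_b.
    finfo = torch.finfo(float8_dtype)
    amax = x.abs().amax(dim=1, keepdim=True).clamp(min=1e-12)
    scale = (finfo.max / amax).to(torch.float32)
    x_fp8 = (x.to(torch.float32) * scale).clamp(-finfo.max, finfo.max).to(float8_dtype)
    return x_fp8, scale.reciprocal()

The inverse scales returned here correspond to the x_inverse_scale / w_inverse_scale arguments that the tests forward to torch._scaled_mm.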