Commit 5911f87

mlazos authored and pytorchmergebot committed
[Cutlass] fp8 dynamic shapes test (pytorch#154829)
Pull Request resolved: pytorch#154829
Approved by: https://github.com/henrylhtsang, https://github.com/eellison
1 parent 606d73b commit 5911f87

File tree

6 files changed (+143, -31 lines):

test/inductor/test_cutlass_backend.py
test/inductor/test_cutlass_evt.py
torch/_inductor/codegen/cuda/cuda_kernel.py
torch/_inductor/codegen/cuda/cuda_template.py
torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py
torch/_inductor/codegen/cuda/gemm_template.py


test/inductor/test_cutlass_backend.py

Lines changed: 107 additions & 13 deletions
@@ -446,21 +446,26 @@ def test_max_autotune_cutlass_backend_regular_mm(
         Main test for mm.
         """
 
-        class MyModel(torch.nn.Module):
-            def forward(self, a, b):
-                return a @ b
-
-        model = MyModel().cuda()
         # M, N, K
         shapes = [
             (128, 128, 16),
             (1024, 1024, 256),
         ]
-        shapes = shapes[0:1] if not dynamic else shapes
+
+        # M, N, K
+        shapes = shapes if dynamic else shapes[0:1]
+
+        class MyModel(torch.nn.Module):
+            def forward(self, a, b):
+                return a @ b
+
+        model = MyModel().cuda()
+
         inputs = [
             (torch.randn(M, K).cuda().to(dtype), torch.randn(K, N).cuda().to(dtype))
             for (M, N, K) in shapes
         ]
+
         dynamic_shapes = (
             {
                 "a": {0: Dim.DYNAMIC, 1: Dim.DYNAMIC},
@@ -483,11 +488,100 @@ def forward(self, a, b):
                     model, inputs, dynamic_shapes=dynamic_shapes
                 )
             else:
-                compiled_model = torch.compile(model, dynamic=dynamic)
+                compiled_model = torch.compile(model, dynamic=True)
                 actual = [compiled_model(*input) for input in inputs]
 
         torch.testing.assert_close(actual, expected)
 
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @parametrize("dynamic", (False, True))
+    @parametrize("use_aoti", (False,))
+    @parametrize("dtype", (torch.float8_e4m3fn,))
+    @mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
+    def test_max_autotune_cutlass_backend_fp8_scaled_mm(
+        self,
+        dynamic: bool,
+        max_autotune_gemm_backends: str = "CUTLASS",
+        use_aoti: bool = False,
+        dtype: torch.dtype = torch.float16,
+    ):
+        """
+        Main test for mm.
+        """
+
+        # M, N, K
+        shapes = [
+            (128, 128, 16),
+            (1024, 1024, 256),
+        ]
+
+        # M, N, K
+        shapes = shapes if dynamic else shapes[0:1]
+
+        inputs = []
+        for shape in shapes:
+            M, N, K = shape
+            output_dtype = torch.bfloat16
+            device = "cuda"
+
+            x = torch.randn(M, K, dtype=output_dtype, device=device)
+            w = torch.randn(N, K, dtype=output_dtype, device=device)
+
+            # quantize weight (prior to inference)
+            w_fp8, w_inverse_scale = _quantize_rowwise(w, dtype)
+            w_t_fp8 = w_fp8.t()
+            w_inverse_scale = w_inverse_scale.t()  # scale_b should be (1, N)
+
+            # quantize input x
+            x_fp8, x_inverse_scale = _quantize_rowwise(x, dtype)
+
+            inputs.append((x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale))
+
+        class MyModel(torch.nn.Module):
+            def forward(self, x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale):
+                y = torch._scaled_mm(
+                    x_fp8,
+                    w_t_fp8,
+                    x_inverse_scale,
+                    w_inverse_scale,
+                    None,
+                    out_dtype=torch.bfloat16,
+                    use_fast_accum=False,
+                )
+                return y
+
+        dynamic_shapes = (
+            {
+                "x_fp8": {0: Dim.DYNAMIC, 1: Dim.DYNAMIC},
+                "x_inverse_scale": {0: Dim.DYNAMIC, 1: 1},
+                "w_t_fp8": {0: Dim.DYNAMIC, 1: Dim.DYNAMIC},
+                "w_inverse_scale": {0: 1, 1: Dim.DYNAMIC},
+            }
+            if dynamic
+            else None
+        )
+        model = MyModel().cuda()
+
+        with config.patch(
+            {
+                "max_autotune": True,
+                "max_autotune_gemm_backends": max_autotune_gemm_backends,
+                "cuda.cutlass_max_profiling_configs": 2,
+                "benchmark_epilogue_fusion": False,  # EVT doesn't support benchmark fusion yet
+                "cuda.cutlass_tma_only": True,
+            }
+        ), dynamo_config.patch({"error_on_recompile": dynamic}):
+            expected = [model(*input) for input in inputs]
+            if use_aoti:
+                actual = AOTIRunnerUtil.run_multiple(
+                    model, inputs, dynamic_shapes=dynamic_shapes
+                )
+            else:
+                compiled_model = torch.compile(model, dynamic=True)
+                actual = [compiled_model(*input) for input in inputs]
+
+        torch.testing.assert_close(actual, expected, rtol=1e-2, atol=0.05)
+
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @parametrize("dynamic", (False, True))
     @parametrize("use_aoti", (False, True))
@@ -1648,9 +1742,9 @@ def test_gemm_operation_serialization(self, arch: str, cuda_version: str):
         "shape",
         (
             (
-                16,
-                16,
-                32,
+                512,
+                128,
+                64,
             ),
         ),
     )
@@ -1720,9 +1814,9 @@ def linear(x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale, bias):
         "shape",
         (
             (
-                16,
-                16,
-                32,
+                512,
+                128,
+                64,
            ),
        ),
    )
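Note: the new test leans on a _quantize_rowwise helper that this diff does not define. Below is a minimal sketch of what such a helper plausibly does, assuming per-row absmax scaling into the fp8 range; the scaling scheme is inferred from the call sites, not confirmed by this diff, and running it requires an fp8-capable GPU.

import torch

def _quantize_rowwise(x: torch.Tensor, fp8_dtype: torch.dtype):
    # Scale each row so its absmax maps onto the fp8 dtype's max value,
    # and keep the inverse scale (shape (M, 1)) for dequantization.
    fp8_max = torch.finfo(fp8_dtype).max
    amax = x.abs().amax(dim=1, keepdim=True).clamp(min=1e-12)
    scale = fp8_max / amax.float()
    x_fp8 = (x.float() * scale).clamp(-fp8_max, fp8_max).to(fp8_dtype)
    return x_fp8, scale.reciprocal()

# Usage mirroring the test: both operands are quantized rowwise over K;
# torch._scaled_mm takes the weight transposed, scale_a as (M, 1) and
# scale_b as (1, N).
x = torch.randn(128, 16, dtype=torch.bfloat16, device="cuda")
w = torch.randn(128, 16, dtype=torch.bfloat16, device="cuda")
x_fp8, x_inverse_scale = _quantize_rowwise(x, torch.float8_e4m3fn)
w_fp8, w_inverse_scale = _quantize_rowwise(w, torch.float8_e4m3fn)
y = torch._scaled_mm(
    x_fp8, w_fp8.t(), x_inverse_scale, w_inverse_scale.t(),
    None, out_dtype=torch.bfloat16, use_fast_accum=False,
)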

test/inductor/test_cutlass_evt.py

Lines changed: 3 additions & 1 deletion
@@ -347,7 +347,9 @@ def test_example_tensor_creation(self):
         )
         buffer_renames = {"buf0": "buf0", "buf1": "buf1", "acc": "buf0"}
         name_to_buffer = {"buf0": row_major_buf0, "buf1": col_major_buf1}
-        result = create_example_tensors(buffer_renames, name_to_buffer)
+        result = create_example_tensors(
+            buffer_renames, name_to_buffer, lambda x: int(x)
+        )
         self.assertEqual(result["acc"].shape, (3, 4, 1))
         self.assertEqual(result["acc"].stride, (4, 1, 0))
         self.assertEqual(

torch/_inductor/codegen/cuda/cuda_kernel.py

Lines changed: 25 additions & 7 deletions
@@ -12,7 +12,7 @@
 from torch import dtype as torch_dtype
 from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
 from torch._inductor.scheduler import BaseSchedulerNode
-from torch._inductor.utils import do_bench_using_profiling, Placeholder
+from torch._inductor.utils import do_bench_using_profiling, OrderedSet, Placeholder
 from torch.utils._sympy.value_ranges import ValueRanges
 
 from .cutlass_utils import DTYPE_TO_CUTLASS_TYPE
@@ -81,6 +81,7 @@ class CUDAKernel(Kernel):
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.layout_args: dict[str, list[LayoutArg]] = defaultdict(list)
+        self.size_args: list[Union[Expr, int]] = []
         # Mapping from arg name to IRNode.
         self.named_nodes: dict[str, IRNode] = {}
 
@@ -172,6 +173,9 @@ def get_ld(node) -> Union[Expr, int]:
         LDD = get_ld(Y)
         return (M, N, K, B, LDA, LDB, LDC, LDD)
 
+    def get_dynamic_shape_args(self) -> list[Union[Expr, int]]:
+        return [*self.get_layout_args(), *self.size_args]
+
     @staticmethod
     def find_ld_idx(node: IRNode) -> int:
         strides = node.get_stride()
@@ -257,6 +261,7 @@ def def_kernel(
             e.g. The template might have input argument defined as [X, W, Bias],
             and the actual input passed into this template could be [Bias, X, W].
             In this case, the `input_reorder` would be [2, 0, 1].
+            additional_size_args: Additional size arguments for epilogue inputs
         """
         names = [x.strip() for x in names_str.strip().split(",")]
         if len(inputs) + len(outputs) != len(names):
@@ -276,17 +281,30 @@ def def_kernel(
                 self.named_nodes[name] = node
                 self.args.input_buffers[node.get_name()] = name
 
+        free_symbols: OrderedSet[Expr] = OrderedSet()
         for name, node in zip(names[len(inputs) : len(inputs) + len(outputs)], outputs):
             if node is not None:
                 self.named_nodes[name] = node
                 self.args.output_buffers[node.get_name()] = name
 
+                if name not in (
+                    "X",
+                    "W",
+                    "Bias",
+                    "Y",
+                ):  # we handle these symbolic shapes explicitly
+                    for expr in itertools.chain(node.get_size(), node.get_stride()):
+                        if isinstance(expr, Expr):
+                            for s in expr.free_symbols:
+                                free_symbols.add(s)  # type: ignore[arg-type]
+
         arg_defs, *_ = self.args.cpp_argdefs(DTYPE_TO_CUTLASS_TYPE)
 
         self.init_layout_args()
-        size_args = [
-            f"const int {s}" for s in ("M", "N", "K", "B", "lda", "ldb", "ldc", "ldd")
-        ]
+        size_vars = ["M", "N", "K", "B", "lda", "ldb", "ldc", "ldd"]
+        size_vars.extend(str(s) for s in free_symbols)
+        self.size_args.extend(free_symbols)
+        size_args = [f"const int {s}" for s in size_vars]
 
         runtime_arg_decls = ",".join(
             [f"{arg.ty} {arg.name}" for arg in self.runtime_arg_info]
@@ -326,11 +344,11 @@ def call_kernel(
         else:
             _, call_args, _, arg_types = self.args.python_argdefs()
 
-        layout_args = self.get_layout_args()
-        call_args.extend(layout_args)  # type: ignore[arg-type]
+        dynamic_shape_args = self.get_dynamic_shape_args()
+        call_args.extend(dynamic_shape_args)  # type: ignore[arg-type]
         for arg in self.runtime_arg_values:
             call_args.append(arg)
-        arg_types.extend("int" for a in layout_args)
+        arg_types.extend("int" for _ in dynamic_shape_args)
         for arg in self.runtime_arg_info:
             arg_types.append(arg.ty)
         # dynamo wraps unspec variable as 0d CPU tensor, need convert to scalar
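The key addition in def_kernel is the free-symbol sweep over epilogue buffers. A self-contained sketch of the same logic, assuming toy buffers whose sizes and strides are sympy expressions; the buffer names and the plain-list stand-in for inductor's OrderedSet are illustrative only:

import itertools
import sympy

s0, s1 = sympy.symbols("s0 s1", integer=True)
# Anything besides X/W/Bias/Y: those are already covered by the explicit
# M/N/K/B and leading-dimension arguments.
epilogue_buffers = {"aux": {"size": (s0, s1), "stride": (s1, 1)}}

free_symbols = []  # stand-in for OrderedSet, preserves first-seen order
for name, buf in epilogue_buffers.items():
    for expr in itertools.chain(buf["size"], buf["stride"]):
        if isinstance(expr, sympy.Expr):
            for s in expr.free_symbols:
                if s not in free_symbols:
                    free_symbols.append(s)

# Each collected symbol becomes an extra `const int` kernel parameter and a
# matching runtime value via get_dynamic_shape_args().
size_vars = ["M", "N", "K", "B", "lda", "ldb", "ldc", "ldd"]
size_vars.extend(str(s) for s in free_symbols)
print([f"const int {s}" for s in size_vars][-2:])  # ['const int s0', 'const int s1']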

torch/_inductor/codegen/cuda/cuda_template.py

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ def generate(  # type: ignore[override]
             expected_args,
         )
         V.graph.sizevars.size_hints(map(sympy.expand, call_args[len(expected_args) :]))
-        size_args = V.graph.sizevars.size_hints(kernel.get_layout_args())
+        size_args = V.graph.sizevars.size_hints(kernel.get_dynamic_shape_args())
         extra_args = tuple(list(size_args) + self.get_runtime_arg_values(**kwargs))
 
         kernel_hash = hashlib.sha256(code.encode("utf-8")).hexdigest()[:8]
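For autotuning, size_hints (plural) resolves every symbolic entry in get_dynamic_shape_args() to a concrete value so the candidate kernel can be benchmarked. A rough sketch of those semantics under the assumption that hints amount to simple symbol substitution; the real SizeVarAllocator does more:

import sympy

s0 = sympy.Symbol("s0", integer=True)
hints = {s0: 1024}  # example value recorded when the graph was traced

def size_hints(exprs):
    # Substitute known hints and cast to int, one entry per argument.
    return tuple(int(sympy.sympify(e).subs(hints)) for e in exprs)

# Layout args plus collected epilogue symbols, e.g. (M, N, K, ..., s0):
extra_args = size_hints([s0, s0 * 2, 256])
print(extra_args)  # (1024, 2048, 256)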

torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py

Lines changed: 6 additions & 9 deletions
@@ -1,4 +1,6 @@
-from typing import Any, Union
+from typing import Any, Callable, Union
+
+from sympy import Expr
 
 from torch._inductor.ir import (
     ComputedBuffer,
@@ -61,18 +63,13 @@
 def create_example_tensors(
     var_name_to_buffer_name: dict[str, str],
     name_to_buffer: dict[str, Buffer],
+    size_hint_fn: Callable[[Union[Expr, int]], int],
 ) -> dict[str, CutlassTensor]:
     def cutlass_tensor_from_buffer(buffer: Buffer) -> CutlassTensor:
         shape = buffer.get_layout().size
         stride = buffer.get_layout().stride
-        assert all(isinstance(x, int) or x.is_integer for x in shape), (
-            f"{buffer.get_name()}'s shape {shape} contains symints which aren't supported for cutlass EVT"
-        )
-        assert all(isinstance(x, int) or x.is_integer for x in stride), (
-            f"{buffer.get_name()}'s stride {stride} contains symints which aren't supported for cutlass EVT"
-        )
-        shape = tuple(int(x) for x in shape)
-        stride = tuple(int(x) for x in stride)
+        shape = tuple(size_hint_fn(x) for x in shape)
+        stride = tuple(size_hint_fn(x) for x in stride)
 
         is_row_major = is_contiguous_strides_for_shape(stride, shape)
         is_column_major = is_contiguous_strides_for_shape(stride[::-1], shape[::-1])
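With the asserts gone, a buffer whose layout carries a symint now flows through size_hint_fn instead of failing. A hypothetical end-to-end illustration, with a simplified stand-in for inductor's is_contiguous_strides_for_shape:

import sympy

s0 = sympy.Symbol("s0", integer=True)
shape, stride = (s0, 4), (4, 1)  # symbolic leading dim

size_hint_fn = lambda x: int(sympy.sympify(x).subs({s0: 3}))
resolved_shape = tuple(size_hint_fn(x) for x in shape)    # (3, 4)
resolved_stride = tuple(size_hint_fn(x) for x in stride)  # (4, 1)

def is_contiguous_strides_for_shape(stride, shape):
    # Simplified row-major check: each stride equals the product of the
    # extents that follow it.
    expected, ok = 1, True
    for extent, st in zip(reversed(shape), reversed(stride)):
        ok = ok and st == expected
        expected *= extent
    return ok

assert is_contiguous_strides_for_shape(resolved_stride, resolved_shape)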

torch/_inductor/codegen/cuda/gemm_template.py

Lines changed: 1 addition & 0 deletions
@@ -1390,6 +1390,7 @@ def _render_evt(
         examples = create_example_tensors(
             var_name_to_buffer_name,
             name_to_buffer,  # type: ignore[arg-type]
+            V.graph.sizevars.size_hint,
         )
         evt_name, evt_args, evt_code = trace(
             evt_py_code,
