
Commit 92f41cc

blaine-rister authored and pytorchmergebot committed
[Inductor] Support precomputed size args in the FX backend. (pytorch#157758)
# Feature

If a Triton kernel has a complicated indexing expression, Inductor may decide to precompute it on the host and pass it to the kernel as an argument. This happens in situations like broadcasts with dynamic shapes. This PR adds support for this feature to Inductor's FX IR backend.

We generate FX IR for precomputed size args in 3 steps:

1. In `PythonWrapperCodegen`, this PR refactors the relevant code to use a `SymbolicCallArgLine` instead of raw Python strings. This stores a (symbol, expr) pair. (Prior to this PR, it was (str, expr), but changing this to a symbol makes it easier to do substitutions later on.)
2. In `WrapperFxCodegen`, keep a dict of {symbol: expr} arg defs which gets updated whenever we see a `SymbolicCallArgLine`.
3. When the FX backend sees a `KernelCallLine`, it uses this dict to replace symbolic call args with their definitions.

In the longer run, it might be desirable to emit FX nodes defining these symbolic call args. That way, we could reuse the size computation when the same kernel is called multiple times. However, I wasn't sure if there was an existing way to generate FX nodes from a sympy expression, and implementing that seemed like overkill for the present purposes.

# Test plan

Added a new CI test exercising this feature.

Pull Request resolved: pytorch#157758
Approved by: https://github.com/jansel
1 parent 95bc3da · commit 92f41cc
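Conceptually, steps 2 and 3 of the description above amount to keeping a symbol-to-expression table and applying it to every kernel call argument before it is turned into a SymInt. A minimal sketch of that idea in plain sympy; the names `arg_defs` and `resolve_call_arg` and the example definition `12 * s1` are illustrative only, not taken from the PR:

```python
import sympy

# Step 2: a table of precomputed size args. Each entry maps the kernel's
# call-arg symbol to the host-side expression that defines it
# (the values below are made up for illustration).
s1 = sympy.Symbol("s1", integer=True, positive=True)
ks0 = sympy.Symbol("ks0", integer=True, positive=True)
arg_defs = {ks0: 12 * s1}

# Step 3: when a kernel call is emitted, symbolic call args are replaced by
# their definitions, so only the graph's "real" symbols remain.
def resolve_call_arg(expr):
    if not isinstance(expr, sympy.Expr):
        return expr  # plain ints pass through unchanged
    return expr.subs(arg_defs)

print(resolve_call_arg(ks0))      # 12*s1
print(resolve_call_arg(ks0 + 1))  # 12*s1 + 1
print(resolve_call_arg(7))        # 7
```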

File tree: 3 files changed (+39, -13 lines)

test/inductor/test_fxir_backend.py

Lines changed: 16 additions & 0 deletions
@@ -393,6 +393,22 @@ def get_input():
         ]
         self.assertEqual(placeholder.meta["val"], symbol)
 
+    def test_dynamic_shapes_precomputed_size(self):
+        """
+        Test dynamic shapes where a kernel's size arg is precomputed.
+        """
+        func = torch.add
+        args = [
+            torch.randn(shape, device=self.device) for shape in [(7, 12, 9), (7, 1, 1)]
+        ]
+        (gm,) = self._compile_and_check(func, args, compile_kwargs={"dynamic": True})
+
+        # Check for the precomputed size arg.
+        (triton_node,) = gm.graph.find_nodes(
+            op="call_function", target=triton_kernel_wrapper_mutation
+        )
+        self.assertIn("ks0", triton_node.kwargs["kwargs"])
+
     @config.patch({"trace.enabled": True})
     @unittest.mock.patch("torch._inductor.debug.DebugFormatter.output_code")
     def test_debug(self, mock_output_code):
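For a local run, the new test case should be selectable with the usual test filters, e.g. `pytest test/inductor/test_fxir_backend.py -k test_dynamic_shapes_precomputed_size` (or the equivalent `-k` filter when invoking the test file directly with Python); this assumes a local PyTorch build where Inductor's Triton backend is available.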

torch/_inductor/codegen/wrapper.py

Lines changed: 7 additions & 5 deletions
@@ -322,7 +322,7 @@ def traverse(cur_kernel):
 
 @dataclasses.dataclass
 class SymbolicCallArg:
-    inner: str
+    inner: sympy.Symbol
     # the original symbolic expression represented by inner
     inner_expr: sympy.Expr
 
@@ -1726,7 +1726,8 @@ def ensure_size_computed(self, sym: sympy.Symbol):
             return
         self.computed_sizes.add(sym)
         expr = V.graph.sizevars.inv_precomputed_replacements[sym]
-        self.writeline(f"{sym} = {pexpr(expr)}")
+        arg = SymbolicCallArg(sym, expr)
+        self.writeline(SymbolicCallArgLine(self, arg, V.graph))
 
     def finalize_prefix(self):
         pass
@@ -2257,9 +2258,10 @@ def rename_sizes_for_launcher(expr: Union[int, sympy.Expr]) -> sympy.Expr:
         return name, triton_meta, extra_launcher_call_args
 
     def generate_numel_expr(self, kernel_name: str, tree, suffix: Optional[str] = None):
-        expr = f"{kernel_name}_{tree.prefix}numel"
+        sym_name = f"{kernel_name}_{tree.prefix}numel"
         if suffix is not None:
-            expr += f"_{suffix}"
+            sym_name += f"_{suffix}"
+        sym = sympy.Symbol(sym_name, is_integer=True, is_positive=True)
 
         # We can get symbolic expressions here, like s0*64
         # It is fine to have them here, but we need to handle them correctly as their own type
@@ -2268,7 +2270,7 @@ def generate_numel_expr(self, kernel_name: str, tree, suffix: Optional[str] = No
         # This is handled in `generate_args_decl` which has a correct comment of: TODO: only works for
         # constant now, need type info. I agree, this needs type info, and while this is not true type info
         # it suffices as a type hint for the purposes of producing the correct code for this type.
-        arg = SymbolicCallArg(expr, tree.numel)
+        arg = SymbolicCallArg(sym, tree.numel)
         self.writeline(SymbolicCallArgLine(self, arg, V.graph))
 
         return arg
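As the PR description notes, storing a `sympy.Symbol` in `SymbolicCallArg.inner` (first hunk above) rather than the old `str` is what makes later substitution straightforward: downstream code can substitute the symbol directly instead of re-deriving it from a name. A small, self-contained illustration in plain sympy; the symbol names and the `s0 * 64` definition are made up for the example:

```python
import sympy

# A numel-style call arg and a stand-in for the expression it represents.
xnumel = sympy.Symbol("triton_poi_fused_add_0_xnumel", integer=True, positive=True)
s0 = sympy.Symbol("s0", integer=True, positive=True)

# With the Symbol in hand, replacing the call arg by its definition is a
# direct substitution on any expression that mentions it.
expr = xnumel + 1
print(expr.subs({xnumel: s0 * 64}))  # prints: 64*s0 + 1
```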

torch/_inductor/codegen/wrapper_fxir.py

Lines changed: 16 additions & 8 deletions
@@ -17,7 +17,7 @@
 from torch._inductor.codecache import PyCodeCache
 from torch._inductor.runtime.triton_heuristics import CachingAutotuner
 from torch._inductor.select_algorithm import extern_kernels  # noqa: F401
-from torch._inductor.utils import sympy_product
+from torch._inductor.utils import sympy_product, sympy_subs
 from torch._inductor.virtualized import V
 from torch._library.triton import wrap_triton
 from torch.fx import GraphModule
@@ -155,6 +155,9 @@ def __post_init__(self) -> None:
             Optional[str], torch.fx.Node
         ] = {}  # Symbol table for codegen.
         self.kernels: dict[str, TritonKernel] = {}  # Table to store Triton kernels.
+        self.symbolic_arg_defs: dict[
+            sympy.Symbol, sympy.Expr
+        ] = {}  # Call arg definitions.
         self._unique_symbol_ids: Counter[str] = Counter()
 
     def _import_kernel(self, code: str, kernel_name: str) -> CachingAutotuner:
@@ -576,12 +579,15 @@ def replace_floor_div(expr: sympy.Expr) -> sympy.Expr:
            else:
                return sympy.floor(expr)
 
-        def expr_to_symint(expr: Union[int, sympy.Expr]) -> Union[int, sympy.Expr]:
-            return (
-                convert_to_symint(expr.replace(sympy.floor, replace_floor_div))
-                if isinstance(expr, sympy.Expr)
-                else expr
-            )
+        def expr_to_symint(
+            expr: Union[int, torch.fx.Node, sympy.Expr],
+        ) -> Union[int, torch.fx.Node, sympy.Expr]:
+            if not isinstance(expr, sympy.Expr):
+                return expr
+
+            expr = expr.replace(sympy.floor, replace_floor_div)
+            expr = sympy_subs(expr, self.symbolic_arg_defs)
+            return convert_to_symint(expr)
 
         # Convert sympy expressions to symints.
         # Use FloorDiv over sympy.floor, so we can get nicer Python code from FX.
@@ -691,4 +697,6 @@ def _generate_kernel_definition(self, line: WrapperLine) -> None:
 
     def _generate_symbolic_call_arg(self, line: WrapperLine) -> None:
         assert isinstance(line, SymbolicCallArgLine)
-        # No need for an FX node, as we will pass the arg to kernels via a SymInt.
+        # Store the arg: expr mapping for later use.
+        arg = line.arg
+        self.symbolic_arg_defs[arg.inner] = arg.inner_expr
