
Commit 0661ecd

bobrenjc93 authored and pytorchmergebot committed
add support for hint_override in mark_unbacked (pytorch#162652)
Very similar to pytorch#161007 except now for mark_unbacked.

Pull Request resolved: pytorch#162652
Approved by: https://github.com/laithsakka
1 parent 7a0f933 commit 0661ecd
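
As a rough usage sketch (inferred from the added test and docstring below; the shapes, device, and function here are illustrative, not part of the commit): hint_override keeps the dimension unbacked while steering Inductor's size-hint heuristics, such as autotuning, toward a different size than the example input's.

import torch

@torch.compile
def f(x):
    return x.sum(dim=0)

x = torch.randn(4096, 512, device="cuda")  # example input; device is illustrative
# Dim 0 stays unbacked (data-dependent), but Inductor may tune as if it were ~40k.
torch._dynamo.decorators.mark_unbacked(x, 0, hint_override=4096 * 10)
f(x)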

File tree

test/inductor/test_torchinductor.py
torch/_dynamo/decorators.py
torch/_inductor/sizevars.py
torch/fx/experimental/symbolic_shapes.py

4 files changed: +92 -12 lines changed

test/inductor/test_torchinductor.py

Lines changed: 38 additions & 0 deletions
@@ -10696,6 +10696,44 @@ def override(x):

         self.assertEqual(no_override(x_small), override(x_small))

+    @requires_gpu()
+    @skip_if_not_triton
+    @unittest.skipIf(
+        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
+    )
+    @config.patch({"force_disable_caches": True})
+    def test_mark_unbacked_with_hint_override(self):
+        @torch.compile
+        def no_override(x):
+            return x.sum(dim=0)
+
+        @torch.compile
+        def override(x):
+            return x.sum(dim=0)
+
+        @torch.compile(fullgraph=True)
+        def branching(x):
+            if x.shape[0] > 4096:
+                return 1
+            return 2
+
+        x_small = torch.randn(4096, 512, device=GPU_TYPE)
+        torch._dynamo.decorators.mark_unbacked(x_small, 0)
+        code1 = run_and_get_triton_code(no_override, x_small)
+
+        torch._dynamo.reset_code_caches()
+
+        torch._dynamo.decorators.mark_unbacked(x_small, 0, hint_override=4096 * 10)
+        code2 = run_and_get_triton_code(override, x_small)
+        self.assertNotEqual(code1, code2)
+
+        self.assertEqual(no_override(x_small), override(x_small))
+
+        with self.assertRaisesRegex(
+            RuntimeError, "Could not guard on data-dependent expression"
+        ):
+            branching(x_small)
+
     @requires_gpu()
     def test_stride_preservation_with_stride_modifying_fx_pass(self):
         def f(x):

torch/_dynamo/decorators.py

Lines changed: 26 additions & 9 deletions
@@ -536,19 +536,26 @@ class directly; instead, use :func:`mark_dynamic`.
 def mark_unbacked(
     t: Any,
     index: Union[int, list[Any], tuple[Any]],
+    hint_override: Optional[int] = None,
     strict: bool = False,
     specialize_on: Optional[list[Any]] = None,
 ) -> None:
     """
-    Mark a tensor as having an unbacked dim. This changes the semantics of operations,
-    we will always report the size does not equal zero/one, we will turn asserts
-    on this index into runtime asserts, and if you try to get the real value we will
-    raise an exception. In other words, we will treat this dimension as if it was
-    data dependent (we do not know anything about its value.)
+    Mark a tensor as having an unbacked dimension. This changes the semantics of operations:
+    - The size of the specified dimension will always be reported as not equal to zero or one.
+    - Assertions on this index will be turned into runtime asserts.
+    - Attempting to get the real value of this dimension will raise an exception.
+    - In effect, this dimension is treated as data-dependent (its value is unknown).

-    For historical reasons, by default if an unbacked dim is specialized, we will
-    happily specialize it and continue. If you want to error in these cases, pass
-    strict=True.
+    Args:
+        t (Any): The tensor to mark as having an unbacked dimension.
+        index (int or list/tuple of int): The dimension(s) to mark as unbacked. Can be a single integer or a list/tuple of integers.
+        hint_override (Optional[int], default=None): An optional integer to override the size hint for this dimension.
+            This is only used by the inductor backend for size hint queries, such as during autotuning.
+        strict (bool, default=False): If True, an error will be raised if the unbacked dimension is specialized.
+            By default (strict=False), specialization is allowed and will proceed without error.
+        specialize_on (Optional[list[Any]], default=None): A list of specialization criteria (e.g., lambdas) for this dimension.
+            If provided, Dynamo will generate specialized compiled regions for each criterion in addition to a generic trace.
     """
     # You could have copied the mark_dynamic behavior but I'm not convinced
     # it's what you want

@@ -567,6 +574,12 @@ def mark_unbacked(
     if not hasattr(t, "_dynamo_unbacked_indices"):
         t._dynamo_unbacked_indices = set()

+    if not hasattr(t, "_dynamo_hint_overrides"):
+        t._dynamo_hint_overrides = {}
+
+    if hint_override:
+        t._dynamo_hint_overrides[index] = hint_override
+
     # FX tracers don't respect @forbid_in_graph and choke on the following error since it passes in proxies:
     # TypeError: 'Attribute' object does not support item assignment
     if isinstance(t._specialize_on, dict):

@@ -612,7 +625,10 @@ def mark_dynamic(
     4) Attempts to trace this function will explicitly raise. As such, all calls to mark_dynamic must be made
     before torch.compile.

-    5) If specialize_on is passed in, we will perform a single generic Dynamo trace followed by
+    5) If hint_override is passed, the hint_override for the specified dimension will replace the provided value
+    from the first example input as the official size hint.
+
+    6) If specialize_on is passed in, we will perform a single generic Dynamo trace followed by
     multiple specialized compilations in addition to a single generic compilation. NB: For now we only support
     per dimension specialization, or in other words we do not generate a cross product of specializations.
     At runtime, we will dispatch to a specialized compiled region if the input matches the specialization criteria.

@@ -626,6 +642,7 @@ def mark_dynamic(
     This approach results in one Dynamo trace and two backend compilations. When the input dimension equals 8 or 16
     at runtime, execution will be directed to the specialized compiled region. Performance measurements indicate
     2-8x speedups depending on the specific specialization and model architecture.
+
     """
     if is_traceable_wrapper_subclass(t):
         # default behavior: mirror mark_dynamic() on all inner tensors with same dim as t
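
For orientation, a small sketch (not part of the diff) of the per-tensor bookkeeping the new mark_unbacked lines perform; the tensor, dim, and value below are made up for illustration:

import torch

t = torch.randn(8, 3)
torch._dynamo.decorators.mark_unbacked(t, 0, hint_override=65536)

# The decorator records the override on the tensor itself, keyed by dim index,
# alongside the existing unbacked-index bookkeeping that Dynamo consumes later.
print(t._dynamo_unbacked_indices)  # expected to contain 0
print(t._dynamo_hint_overrides)    # expected: {0: 65536}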

torch/_inductor/sizevars.py

Lines changed: 22 additions & 3 deletions
@@ -74,6 +74,7 @@ def __init__(self, shape_env=None) -> None:
             shape_env = ShapeEnv()
         self.shape_env = shape_env
         self.var_to_val = self.shape_env.var_to_val
+        self.var_to_hint_override = self.shape_env.var_to_hint_override
         self.replacements: dict[sympy.Symbol, Expr] = self.shape_env.replacements
         self.unbacked_replacements: Optional[dict[Expr, Expr]] = None
         # Maps of dynamic sizes that have to be precomputed on the host to the kernel args.

@@ -544,7 +545,13 @@ def remove_precomputed_replacements(self, expr: Expr) -> Expr:
         return expr

     def symbolic_hint(
-        self, expr: Union[Expr, int], hint_override: Optional[int] = None
+        self,
+        expr: Union[Expr, int],
+        hint_override: Optional[int] = None,
+        # Only flip this flag if you don't plan on guarding/adding runtime
+        # asserts based on this value and promise to only use this value
+        # in a heuristic nature.
+        use_user_provided_hint_override: bool = False,
     ) -> Union[Expr, int]:
         if isinstance(expr, int):
             return expr

@@ -564,6 +571,10 @@ def symbolic_hint(
             return hint_override

         expr = self.remove_precomputed_replacements(expr)
+
+        if use_user_provided_hint_override:
+            expr = sympy_subs(expr, self.var_to_hint_override)
+
         return sympy_subs(expr, self.var_to_val)

     def size_hint(

@@ -573,7 +584,11 @@ def size_hint(
         fallback: Optional[int] = None,
         hint_override: Optional[int] = None,
     ) -> int:
-        out = self.symbolic_hint(expr, hint_override=hint_override)
+        out = self.symbolic_hint(
+            expr,
+            hint_override=hint_override,
+            use_user_provided_hint_override=fallback is not None,
+        )
         if not isinstance(out, (int, sympy.Integer)) and fallback is not None:
             # Use the provided heuristic fallback hint
             unbacked_sym_vrs = {

@@ -610,7 +625,11 @@ def size_hints(
         hint_override: Optional[int] = None,
     ) -> tuple[int, ...]:
         return tuple(
-            self.size_hint(x, fallback=fallback, hint_override=hint_override)
+            self.size_hint(
+                x,
+                fallback=fallback,
+                hint_override=hint_override,
+            )
             for x in exprs
         )
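
To summarize the sizevars change (a simplified sketch under stated assumptions, not the real SizeVarAllocator code): user-provided overrides are only consulted on the heuristic path, i.e. when the caller supplies a fallback and therefore promises not to guard on the result.

import sympy

def heuristic_size_hint(expr, var_to_val, var_to_hint_override, fallback=None):
    # Sketch: substitute user hint overrides only when a fallback is given,
    # then substitute known values; otherwise return the heuristic fallback.
    if fallback is not None:
        expr = expr.subs(var_to_hint_override)
    expr = expr.subs(var_to_val)
    return int(expr) if expr.is_Integer else fallback

u0 = sympy.Symbol("u0")  # stands in for an unbacked size symbol
print(heuristic_size_hint(2 * u0, {}, {u0: 40960}, fallback=8192))  # 81920
print(heuristic_size_hint(2 * u0, {}, {}, fallback=8192))           # 8192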

torch/fx/experimental/symbolic_shapes.py

Lines changed: 6 additions & 0 deletions
@@ -3718,6 +3718,7 @@ def _init(
         self.source_name_to_debug_name: dict[str, str] = {}
         self.var_to_sources: dict[sympy.Symbol, list[Source]] = {}
         self.var_to_stack: dict[sympy.Symbol, CapturedTraceback] = {}
+        self.var_to_hint_override: dict[sympy.Symbol, int] = {}
         # Maps a source to the *original* symbol that was assigned to it
         self.source_to_var: dict[str, sympy.Symbol] = {}
         # Maps from sympy ints to expressions representing them

@@ -4582,6 +4583,11 @@ def _create_symbolic_sizes_strides_storage_offset(
             )
             for i, (sym, hint) in enumerate(zip(size, ex_size))
         ]
+
+        for i, sym in enumerate(sym_sizes):
+            if isinstance(sym, torch.SymInt) and i in hint_overrides:
+                self.var_to_hint_override[sym.node.expr] = hint_overrides[i]
+
         sym_stride = []
         for i, stride_expr in enumerate(stride):
             # NB: Don't duck size the stride; instead use the expression
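
In effect (an assumed illustration of the resulting data flow, not code from the commit), the ShapeEnv ends up holding a map from the unbacked size symbol of the marked dimension to the user's integer hint, which Inductor's SizeVarAllocator later reads as var_to_hint_override:

import sympy

hint_overrides = {0: 4096 * 10}            # dim index -> user-provided hint
u0 = sympy.Symbol("u0")                    # stands in for the dim-0 size symbol
sym_size_exprs = [u0, sympy.Integer(512)]  # symbolic sizes, one per dim

var_to_hint_override: dict = {}
for i, expr in enumerate(sym_size_exprs):
    if i in hint_overrides and expr.free_symbols:  # only symbolic (non-static) dims
        var_to_hint_override[expr] = hint_overrides[i]

print(var_to_hint_override)  # {u0: 40960}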
