Skip to content

Commit 3f83e89

Browse files
sevenEng authored and pytorchmergebot committed
[inductor] fix issue for example value with unbacked strides (pytorch#163660)
## Issue During autotune, we're not applying size hints atomically for the example inputs used for benchmarking. If there is unbacked symint showing up in inputs' strides, this might lead to CUDA IMA, and this could be reproduced by the added unittest, with stride being `[128 * u0, 128, 1]` and unbacked fallback being 8192, after calling `benchmark_example_value`, we get back a tensor with stride as `[8192, 128, 1]` as opposed to `[128 * 8192, 128, 1]` ## Fix Using the atomic API when trying to apply size hints to input tensor' strides. Pull Request resolved: pytorch#163660 Approved by: https://github.com/ColinPeppler
1 parent d7e3f49 commit 3f83e89

File tree

3 files changed

+43
-9
lines changed

3 files changed

+43
-9
lines changed

test/inductor/test_unbacked_symints.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,28 @@ def fn(x):
653653
expected = fn(*example_inputs)
654654
torch.testing.assert_close(actual, expected)
655655

656+
@skipGPUIf(not HAS_GPU, "requires gpu and triton")
657+
@inductor_config.patch({"max_autotune": True})
658+
@dynamo_config.patch({"capture_scalar_outputs": True})
659+
def test_autotune_with_unbacked_stride(self, device):
660+
def fn(x, y, a):
661+
u0 = a.item()
662+
torch._check(u0 != 1)
663+
unbacked = x.expand(8, u0, *x.shape).clone()
664+
unbacked = torch.permute(unbacked, [0, 2, 1])
665+
y = y.expand(8, *y.shape)
666+
bmm = torch.ops.aten.bmm(unbacked, y)
667+
return bmm
668+
669+
example_inputs = (
670+
torch.randn((32,), dtype=torch.bfloat16, device=device),
671+
torch.randn((128, 64), dtype=torch.bfloat16, device=device),
672+
torch.tensor(128, device=device),
673+
)
674+
actual = torch.compile(fn, fullgraph=True)(*example_inputs)
675+
expected = fn(*example_inputs)
676+
torch.testing.assert_close(actual, expected)
677+
656678

657679
instantiate_device_type_tests(TestUnbackedSymints, globals(), allow_xpu=True)
658680

torch/_inductor/select_algorithm.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3622,10 +3622,13 @@ def benchmark_example_value(node, hint_override: Optional[int] = None):
36223622
fallback=config.unbacked_symint_fallback,
36233623
hint_override=hint_override,
36243624
),
3625-
V.graph.sizevars.size_hints(
3626-
node.get_stride(),
3627-
fallback=config.unbacked_symint_fallback,
3628-
hint_override=hint_override,
3625+
tuple(
3626+
V.graph.sizevars.atomically_apply_size_hint(
3627+
stride,
3628+
fallback=config.unbacked_symint_fallback,
3629+
hint_override=hint_override,
3630+
)
3631+
for stride in node.get_stride()
36293632
),
36303633
node.get_device(),
36313634
node.get_dtype(),
@@ -3677,9 +3680,12 @@ def key_of(node):
36773680
node.get_size(),
36783681
fallback=config.unbacked_symint_fallback,
36793682
),
3680-
*sizevars.size_hints(
3681-
node.get_stride(),
3682-
fallback=config.unbacked_symint_fallback,
3683+
*tuple(
3684+
V.graph.sizevars.atomically_apply_size_hint(
3685+
stride,
3686+
fallback=config.unbacked_symint_fallback,
3687+
)
3688+
for stride in node.get_stride()
36833689
),
36843690
sizevars.size_hint(
36853691
node.get_layout().offset,

torch/_inductor/sizevars.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -908,7 +908,11 @@ def _sub_unbacked_exprs(self, expr: Expr) -> Expr:
908908
return expr
909909

910910
def atomically_apply_size_hint(
911-
self, expr: Union[Expr, int], *, fallback: Optional[int] = None
911+
self,
912+
expr: Union[Expr, int],
913+
*,
914+
fallback: Optional[int] = None,
915+
hint_override: Optional[int] = None,
912916
) -> Union[Expr, int]:
913917
if isinstance(expr, (int, sympy.Integer)):
914918
return int(expr)
@@ -925,7 +929,9 @@ def atomically_apply_size_hint(
925929
assert isinstance(expr, Expr), type(expr)
926930
free_symbols = expr.free_symbols
927931
size_dict = {
928-
symbol: V.graph.sizevars.size_hint(symbol, fallback=fallback)
932+
symbol: V.graph.sizevars.size_hint(
933+
symbol, fallback=fallback, hint_override=hint_override
934+
)
929935
for symbol in free_symbols
930936
}
931937
return expr.subs(size_dict)

0 commit comments

Comments (0)