
Commit 4b7abce

aorenste authored and pytorchmergebot committed
Fix fake tensor caching when output has unbacked (pytorch#153034)
We handle fake tensor caching in two ways:

1. If the inputs have no symbols (SymInt, etc.) then we cache on the FakeTensorMode.
2. If the inputs have symbols then we cache on the ShapeEnv.

This way the symbols in the inputs and outputs are associated with the guards in place at the time of the call.

However, it's possible to have an op where there are no symbols in the inputs but there is an unbacked symbol in the output. In this case we shouldn't cache at all, because what would that really mean? So this PR changes the caching behavior: if there's a symbol in the output which doesn't come in some way from the input, we refuse to cache that op. Added a test which checks for this case.

While in there I also made a couple of other related changes:

1. Added negative caching - if we see that an (op, args) pair failed to cache previously, we don't even bother trying to cache it again.
2. Reworked the inner behavior of `_cached_dispatch_impl` a little to make it clearer which bits we expect to be able to throw `_BypassDispatchCache`, and added some comments.

The latest version of this also:

1. Addresses the problem that caused pytorch#153891. The issue was that with caching, ops are required to support `__eq__`. Unfortunately `_RecordFunction` is minimalistic and doesn't support that, so in the off chance that two keys hash to the same value, the `__eq__` check would raise an exception. Apparently this was much more common on MacOS, where memory patterns end up with more reuse (so the object IDs are the same and give you the same hash value for objects that use pointer hashing). Tested locally on MacOS, where running
```
python test/inductor/test_torchinductor.py GPUTests
```
was pretty much guaranteed to fail (at least for me) somewhere around test 100-200, and passed all 800 tests after this change. Another way to test this is to run the inductor tests with `torch._subclasses.fake_tensor._DispatchCacheKey.__hash__` monkey-patched to return a constant, causing all values to hash-collide (see the sketch below), but this can't really be checked in since it turns the cache lookup into an O(n) scan, which takes a crazy long time to run through all the tests...
2. Folds in pytorch#153780 to ensure that exceptions raised from the op don't include the context from the cache key bypass.

Pull Request resolved: pytorch#153034
Approved by: https://github.com/masnesral, https://github.com/tugsbayasgalan
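The hash-collision experiment mentioned above can be reproduced with a one-line monkey-patch. A rough sketch of that throwaway experiment, assuming `_DispatchCacheKey` keeps the name and module given in the message (not something to check in):

```python
# Throwaway stress test sketched from the commit message: force every dispatch
# cache key to hash to the same value so that __eq__ is exercised between
# unrelated keys. This turns cache lookups into O(n) scans, so it is only a
# local experiment, never a checked-in test.
import torch._subclasses.fake_tensor as fake_tensor

fake_tensor._DispatchCacheKey.__hash__ = lambda self: 0  # all keys collide

# Then run the inductor suite, e.g.:
#   python test/inductor/test_torchinductor.py GPUTests
```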
1 parent 866142f commit 4b7abce

File tree: 3 files changed (+155 -48 lines)


test/test_fake_tensor.py

Lines changed: 27 additions & 3 deletions
```diff
@@ -2265,13 +2265,10 @@ def count_invoke_subgraph_keys():
         gc.collect()
         self.assertTrue(count_invoke_subgraph_keys() == 0)
 
-
-
     @skipIfTorchDynamo("cache hit/miss changes with invoke_subgraph caching")
     def test_invoke_subgraph_cacheable_inplace(self):
         invoke_subgraph = torch._higher_order_ops.invoke_subgraph
 
-
         def fn(x, y):
             # aten ops are used so that eager backend graph is suitable for fake
             # tensor testing
@@ -2317,5 +2314,32 @@ def fn(x, y):
             extract_tensor_metadata(b),
         )
 
+    @skipIfTorchDynamo("cache hit/miss changes with invoke_subgraph caching")
+    def test_unbacked_output(self):
+        # The point of this test is to have an op which has no symbols as input
+        # but a symbol as an output and make sure that we skip caching it.
+        class LengthsGather(torch.nn.Module):
+            def forward(
+                self,
+                input: torch.Tensor,
+                lengths: torch.Tensor,
+                indices: torch.Tensor,
+                offsets: torch.Tensor,
+            ) -> torch.Tensor:
+                bias = torch.gather(offsets, 0, indices)
+                lengths_selected = torch.gather(lengths, 0, indices)
+                index = torch.repeat_interleave(bias, lengths_selected, dim=0)
+                return index
+
+        input = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        lengths = torch.tensor([0, 2, 3, 1, 4])
+        indices = torch.tensor([2, 3, 4, 6, 7, 8, 9])
+        offsets = torch.cumsum(lengths, 0)
+        ep = torch.export.export(LengthsGather(), (input, lengths, indices, offsets), strict=False)
+
+        FakeTensorMode.cache_clear()
+        ep.run_decompositions({})
+        self.assertBypasses("unrepresented symbol in output", 2)
+
 if __name__ == "__main__":
     run_tests()
```
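For context, `assertBypasses` in this test presumably checks the class-level bypass counters that the `fake_tensor.py` hunks below increment. A quick way to inspect those counters directly, assuming they keep the names used in the diff (`cache_clear`, `cache_bypasses`, `cache_hits`, `cache_misses`):

```python
# Minimal sketch: run some ops under FakeTensorMode and dump the dispatch-cache
# counters, including the per-reason bypass counts the new test asserts on.
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

FakeTensorMode.cache_clear()
with FakeTensorMode():
    x = torch.empty(4, 4)   # factory ops produce fake tensors under the mode
    y = x + x               # dispatched through the fake-tensor cache

print(dict(FakeTensorMode.cache_bypasses))          # reason -> count
print(FakeTensorMode.cache_hits, FakeTensorMode.cache_misses)
```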

torch/_subclasses/_fake_tensor_utils.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -218,6 +218,11 @@ class _CacheKeyState:
     # matches one of the inputs so we can uncache it properly.
     sym_node_lookup: dict[int, int]  # id(SymNode) -> index
 
+    # This is a list of all seen input sympy.Symbols. We use it when building
+    # the cache entry to see if the output value has any symbols that we didn't
+    # see on input. See _has_unrepresented_symbols().
+    known_symbols: set[sympy.Symbol]
+
     # There are cases where we're asked to perform an op when we have no
     # ShapeEnv on the FakeTensorMode - but for SymNodes we MUST have a
     # ShapeEnv. So as we scan if we see a SymNode (with a ShapeEnv) we record it
@@ -226,6 +231,7 @@ class _CacheKeyState:
 
     def __init__(self, shape_env: Optional[ShapeEnv] = None) -> None:
         self.sym_node_lookup = {}
+        self.known_symbols = set()
         self.shape_env = shape_env
 
     def cache_on_shape_env(self) -> bool:
@@ -247,6 +253,7 @@ def convert_sym_int(self, result: list[object], arg: SymInt) -> None:
             result.append(_InputBackref(self.sym_node_lookup[node_id]))
         else:
             self.sym_node_lookup[node_id] = len(result)
+            self.known_symbols.update(arg.node.expr.free_symbols)
             if self.shape_env is None:
                 self.shape_env = arg.node.shape_env
             result.append(_PySymInputStub(arg))
```
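A minimal stand-alone sketch of the bookkeeping this hunk adds, using plain sympy rather than the real `_CacheKeyState` (the actual check lives in `_has_unrepresented_symbols`, added in `fake_tensor.py` below): symbols seen in input expressions are collected, and an output expression is later checked against that set.

```python
# Plain-sympy sketch: collect the free symbols of every input expression, then
# flag an output expression that mentions a symbol never seen on input.
import sympy

s0, s1, u0 = sympy.symbols("s0 s1 u0")

known_symbols: set = set()
known_symbols.update((2 * s0 + s1).free_symbols)   # symbols seen on input

output_expr = s0 + u0                              # u0 is new (e.g. unbacked)
unrepresented = any(sym not in known_symbols
                    for sym in output_expr.free_symbols)
print(unrepresented)  # True -> this output would bypass the cache
```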

torch/_subclasses/fake_tensor.py

Lines changed: 121 additions & 45 deletions
```diff
@@ -74,12 +74,6 @@
         raise e
 
 
-class _Unassigned:
-    pass
-
-
-_UNASSIGNED = _Unassigned()
-
 DimList = list
 
 pytree = torch.utils._pytree
@@ -1118,7 +1112,7 @@ class _DispatchCacheEntryOutputInfo:
 
 @dataclass_slots
 @dataclass(frozen=True)
-class _DispatchCacheEntry:
+class _DispatchCacheValidEntry:
     """
     Entry type for the FakeTensor dispatch cache. It supports two types of outputs
     1) tensor
@@ -1131,6 +1125,20 @@ class _DispatchCacheEntry:
     is_output_tuple: bool = False
 
 
+@dataclass_slots
+@dataclass(frozen=True)
+class _DispatchCacheBypassEntry:
+    """
+    Entry type for a negative cache entry.
+    """
+
+    reason: str
+
+
+if TYPE_CHECKING:
+    _DispatchCacheEntry = Union[_DispatchCacheValidEntry, _DispatchCacheBypassEntry]
+
+
 @dataclass_slots
 @dataclass(frozen=True)
 class _BypassDispatchCache(Exception):
@@ -1418,37 +1426,72 @@ def _cached_dispatch_impl(
         Lookup a cache entry for the given arguments. If none exists, dispatch
         and cache the result (if the result is eligible for caching).
         """
-        output: object = _UNASSIGNED
+        state = None
+        key = None
         try:
             state = _CacheKeyState(self.shape_env)
             key = self._cache_key(state, func, args, kwargs)
-            if state.cache_on_shape_env():
-                assert state.shape_env is not None
-                cache = state.shape_env.fake_tensor_cache
-            else:
-                cache = FakeTensorMode.cache
-            entry = cache.get(key, None)
-            if entry is not None:
-                output = self._output_from_cache_entry(state, entry, key, func, args)
-                FakeTensorMode.cache_hits += 1
-                if self.cache_crosscheck_enabled:
-                    # For debugging / testing: Validate that the output synthesized
-                    # from the cache matches the output created by normal dispatch.
-                    with disable_fake_tensor_cache(self):
-                        self._crosscheck_cache_output(output, func, types, args, kwargs)
-            else:
-                self._validate_cache_key(func, args, kwargs)
-                output = self._dispatch_impl(func, types, args, kwargs)
-                entry = self._make_cache_entry(state, key, func, args, kwargs, output)
-                key.strip_shape_env()
-                cache[key] = entry
-                FakeTensorMode.cache_misses += 1
         except _BypassDispatchCache as e:
+            # We couldn't create the cache key at all
             FakeTensorMode.cache_bypasses[e.reason] += 1
 
-        if output is _UNASSIGNED:
-            output = self._dispatch_impl(func, types, args, kwargs)
+        if key is None:
+            # Do this dispatch outside the above except handler so if it
+            # generates its own exception there won't be a __context__ caused by
+            # the caching mechanism.
+            return self._dispatch_impl(func, types, args, kwargs)
+
+        assert state is not None
+        if state.cache_on_shape_env():
+            assert state.shape_env is not None
+            cache = state.shape_env.fake_tensor_cache
+            set_cache_key = _set_cache_key_for_shape_env
+        else:
+            cache = FakeTensorMode.cache
+            set_cache_key = _set_cache_key
+        entry = cache.get(key, None)
+
+        if entry is not None:
+            if isinstance(entry, _DispatchCacheBypassEntry):
+                # This represents a negative cache entry - we already saw that the
+                # output is uncachable. Compute it from first principles.
+                FakeTensorMode.cache_bypasses[entry.reason] += 1
+                return self._dispatch_impl(func, types, args, kwargs)
+
+            # We have a cache entry.
+            output = self._output_from_cache_entry(state, entry, key, func, args)
+            FakeTensorMode.cache_hits += 1
+            if self.cache_crosscheck_enabled:
+                # For debugging / testing: Validate that the output synthesized
+                # from the cache matches the output created by normal dispatch.
+                with disable_fake_tensor_cache(self):
+                    self._crosscheck_cache_output(output, func, types, args, kwargs)
+            return output
+
+        # We don't have a cache entry.
+        output = self._dispatch_impl(func, types, args, kwargs)
 
+        try:
+            self._validate_cache_key(func, args, kwargs)
+        except _BypassDispatchCache as e:
+            # We ran "extra" checks on the cache key and determined that it's no
+            # good. Record the reason and mark it so we don't bother validating
+            # again.
+            FakeTensorMode.cache_bypasses[e.reason] += 1
+            set_cache_key(cache, key, _DispatchCacheBypassEntry(e.reason))
+            return output
+
+        try:
+            entry = self._make_cache_entry(state, key, func, args, kwargs, output)
+        except _BypassDispatchCache as e:
+            # We had trouble making the cache entry. Record the reason and mark
+            # it.
+            FakeTensorMode.cache_bypasses[e.reason] += 1
+            set_cache_key(cache, key, _DispatchCacheBypassEntry(e.reason))
+            return output
+
+        set_cache_key(cache, key, entry)
+        FakeTensorMode.cache_misses += 1
         return output
 
     def _cache_key(
@@ -1634,17 +1677,17 @@ def _validate_output_for_cache_entry(
         kwargs: Mapping[str, object],
         output: Optional[FakeTensor],
     ) -> None:
-        from torch.fx.experimental.symbolic_shapes import has_free_unbacked_symbols
-
+        # Is this even possible? According to the signature this can be None but
+        # not `int`. So either the signature is a lie or (part of) this line is
+        # unnecessary...
         if isinstance(output, (int, type(None))):
             return
 
-        if isinstance(output, torch.SymInt):
-            if has_free_unbacked_symbols(output):
-                # This is unreachable but adding the check for extra safety in
-                # case we change code path in future.
-                raise _BypassDispatchCache("unbacked symbol in output")
-            return
+        if _has_unrepresented_symbols(state, output):
+            # Unbacked symbols are fine - but only if they're also represented
+            # in the input. If there are any new unbacked symbols then we can't
+            # cache this output.
+            raise _BypassDispatchCache("unrepresented symbol in output")
 
         # Some ops return tuples of Tensors, but it's rare, so avoid
         # the complexity of caching other types.
@@ -1718,7 +1761,7 @@ def _get_output_info_for_cache_entry(
         # we can synthesize a tensor here and do the checks on that instance.
         # This approach keeps the (more frequent) cache-hit path as lightweight
         # as possible.
-        entry_for_synth_output = _DispatchCacheEntry(
+        entry_for_synth_output = _DispatchCacheValidEntry(
            output_infos=(entry,), is_output_tuple=False
         )
         synth_output = self._output_from_cache_entry(
@@ -1742,7 +1785,7 @@ def _make_cache_entry(
         args: Sequence[object],
         kwargs: Mapping[str, object],
         output: Optional[FakeTensor],
-    ) -> _DispatchCacheEntry:
+    ) -> _DispatchCacheValidEntry:
         """
         Make a cache entry object for the given 'output' Tensor. Raises
         _BypassDispatchCache if the output tensor has characteristics that
@@ -1773,7 +1816,7 @@ def _make_cache_entry(
            output_info = _DispatchCacheEntryOutputInfo(
                inplace_idx=None, metadata=None, view_idx=None, constant_value=output
            )
-            return _DispatchCacheEntry(
+            return _DispatchCacheValidEntry(
                output_infos=(output_info,), is_output_tuple=False
            )
 
@@ -1794,15 +1837,15 @@ def _make_cache_entry(
                )
                for out_elem in output
            ]
-            return _DispatchCacheEntry(
+            return _DispatchCacheValidEntry(
                output_infos=tuple(output_infos), is_output_tuple=True
            )
 
        else:
            output_info = self._get_output_info_for_cache_entry(
                state, key, func, args, kwargs, output
            )
-            return _DispatchCacheEntry(
+            return _DispatchCacheValidEntry(
                output_infos=(output_info,), is_output_tuple=False
            )
 
@@ -1882,7 +1925,7 @@ def check_value(
    def _output_from_cache_entry(
        self,
        state: _CacheKeyState,
-        entry: _DispatchCacheEntry,
+        entry: _DispatchCacheValidEntry,
        key: _DispatchCacheKey,
        func: OpOverload,
        args: Sequence[object],
@@ -2886,6 +2929,19 @@ def from_tensor(
 _StoragePointer = object
 
 
+def _has_unrepresented_symbols(
+    state: _CacheKeyState, output: Optional[FakeTensor]
+) -> bool:
+    from torch.fx.experimental.symbolic_shapes import _iterate_exprs
+
+    for s in _iterate_exprs(output):
+        for symbol in s.free_symbols:
+            if symbol not in state.known_symbols:
+                return True
+
+    return False
+
+
 # NB: returns fake tensors
 def run_fallback_kernel(
     fake_mode: FakeTensorMode,
@@ -2951,6 +3007,23 @@ def map_out(e: T) -> Union[T, FakeTensor]:
     return pytree.tree_map(map_out, r)
 
 
+def _set_cache_key_for_shape_env(
+    cache: dict[_DispatchCacheKey, _DispatchCacheEntry],
+    key: _DispatchCacheKey,
+    entry: _DispatchCacheEntry,
+) -> None:
+    key.strip_shape_env()
+    cache[key] = entry
+
+
+def _set_cache_key(
+    cache: dict[_DispatchCacheKey, _DispatchCacheEntry],
+    key: _DispatchCacheKey,
+    entry: _DispatchCacheEntry,
+) -> None:
+    cache[key] = entry
+
+
 # Just for use to allow copying a module to fake tensors,
 # does not apply elsewhere
 class FakeCopyMode(TorchFunctionMode):
@@ -3042,6 +3115,9 @@ def _check_for_subclass_arg(x: object) -> bool:
     torch.ops.aten.is_coalesced.default,
     torch.ops.aten.dense_dim.default,
     torch.ops.aten.sparse_dim.default,
+    # _RecordFunction doesn't support __eq__ so make sure not to attempt to
+    # cache it.
+    torch.ops.profiler._record_function_exit._RecordFunction,
 )
 
 from torch._subclasses.fake_impls import (  # noqa: F401
```
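To make the rewritten `_cached_dispatch_impl` control flow easier to follow, here is a minimal stand-alone sketch of the negative-caching pattern it implements, written with generic Python types rather than the real `_DispatchCacheValidEntry`/`_DispatchCacheBypassEntry` machinery:

```python
# Sketch of negative caching: once an attempt to build a cache entry fails,
# a bypass marker is stored so later calls with the same key skip straight to
# recomputation instead of re-running the failing validation.
from dataclasses import dataclass
from typing import Callable, Union


@dataclass(frozen=True)
class ValidEntry:
    value: object


@dataclass(frozen=True)
class BypassEntry:
    reason: str


cache: dict[object, Union[ValidEntry, BypassEntry]] = {}


def cached_call(
    key: object,
    compute: Callable[[], object],
    make_entry: Callable[[object], ValidEntry],
) -> object:
    entry = cache.get(key)
    if isinstance(entry, BypassEntry):
        return compute()          # known-uncachable: recompute, don't retry caching
    if isinstance(entry, ValidEntry):
        return entry.value        # cache hit
    result = compute()            # cache miss: compute first, then try to cache
    try:
        cache[key] = make_entry(result)
    except ValueError as e:       # stand-in for _BypassDispatchCache
        cache[key] = BypassEntry(str(e))
    return result
```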
