
Commit 192b96e

Revert "[AOTI] Fix a GPU memory leak caused by reference circle (pytorch#168063)"
This reverts commit cdca10b. Reverted pytorch#168063 on behalf of https://github.com/yangw-dev because it broke an internal test; the author was contacted to revert and re-land with a fix. The failing test was test_codegen_int_array_var_fix_memory_leak: self.assertTrue(allocated_memory[1] == allocated_memory[2]) raised AssertionError: False is not true ([comment](pytorch#168063 (comment)))
1 parent c055ebe commit 192b96e

2 files changed: +2, -66 lines changed


test/inductor/test_aot_inductor.py

Lines changed: 0 additions & 44 deletions
@@ -7437,50 +7437,6 @@ def forward(self, x):
             "RAIIAtenTensorHandle buf0(buf0_handle_restrided);"
         ).run(code)
 
-    def test_codegen_int_array_var_fix_memory_leak(self):
-        """
-        Fix https://github.com/pytorch/pytorch/issues/167630
-        """
-        if self.device != "cuda":
-            raise unittest.SkipTest("test is only for cuda")
-
-        def make_mlp(in_dim=128, hidden=256, out_dim=64, depth=3):
-            layers = []
-            d = in_dim
-            for _ in range(depth):
-                layers += [nn.Linear(d, hidden), nn.ReLU()]
-                d = hidden
-            layers += [nn.Linear(d, out_dim)]
-            return nn.Sequential(*layers)
-
-        batch = 32
-        in_dim = 2048
-        hidden = 512
-        out_dim = 10
-        depth = 6
-
-        import gc
-
-        allocated_memory = []
-        for _ in range(3):
-            torch.cuda.reset_peak_memory_stats()
-
-            model = make_mlp(in_dim, hidden, out_dim, depth).to(self.device)
-            example_inputs = (torch.randn(batch, in_dim, device=self.device),)
-            ep = torch.export.export(
-                model,
-                example_inputs,
-            )
-            torch._inductor.aoti_compile_and_package(ep)
-
-            del model, example_inputs, ep
-            torch.cuda.synchronize()
-            torch.cuda.empty_cache()
-            gc.collect()
-            allocated_memory.append(torch.cuda.memory_allocated())
-
-        self.assertTrue(allocated_memory[1] == allocated_memory[2])
-
     @unittest.skipIf(IS_MACOS, "might have no readelf on Mac")
     def test_libtorch_free_so(self):
         class Model(torch.nn.Module):
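For reference, the removed test exercised the leak by repeatedly exporting and AOT-compiling a small MLP, then asserting that steady-state GPU memory (iterations 2 and 3, after the warm-up run) stops growing. Below is a minimal sketch of that measurement idiom only; the run_once callable is a hypothetical stand-in for the export-plus-compile step and is not part of the original test.

import gc

import torch


def steady_state_cuda_memory(run_once, iters=3):
    # Run an arbitrary CUDA workload `iters` times and record how much GPU
    # memory is still allocated after each iteration, once the CUDA caching
    # allocator is drained and Python garbage has been collected.
    readings = []
    for _ in range(iters):
        run_once()  # e.g. torch.export.export(...) followed by aoti_compile_and_package(...)
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        gc.collect()
        readings.append(torch.cuda.memory_allocated())
    return readings


# readings[0] may include one-time allocations (library workspaces, caches);
# readings[1] == readings[2] is the per-iteration "nothing leaked" signal that
# the reverted assertion checked and that failed in the internal run.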

torch/_inductor/codegen/cpp_wrapper_cpu.py

Lines changed: 2 additions & 22 deletions
@@ -96,7 +96,6 @@ def __init__(self):
         self.include_extra_header = functools.lru_cache(None)(  # type: ignore[method-assign]
             self._include_extra_header
         )
-        self.codegen_int_array_var_cache = {}
 
     @staticmethod
     def create(
@@ -1637,33 +1636,14 @@ def codegen_memory_format(self, memory_format):
         self.used_cached_memory_formats.add(memory_format_str)
         return f"cached_torch_memory_format_{memory_format_str}"
 
+    @functools.cache  # noqa: B019
     def codegen_int_array_var(
         self,
         int_array: str,
         writeline: Callable[..., None],
         known_statically=False,
        graph=None,  # for per-graph caching
-    ) -> str:
-        # Use id(graph) for caching to avoid circular references
-        cache_key = (
-            int_array,
-            id(writeline),
-            known_statically,
-            id(graph) if graph else None,
-        )
-        if cache_key not in self.codegen_int_array_var_cache:
-            self.codegen_int_array_var_cache[cache_key] = (
-                self._codegen_int_array_var_impl(int_array, writeline, known_statically)
-            )
-
-        return self.codegen_int_array_var_cache[cache_key]
-
-    def _codegen_int_array_var_impl(
-        self,
-        int_array: str,
-        writeline: Callable[..., None],
-        known_statically: bool,
-    ) -> str:
+    ):
         # Used for size/stride declaration
         #
         # Because the memory planning is done in two passes (see the implementation
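Context on the restored decorator, as a hedged sketch rather than the PyTorch source itself: @functools.cache on an instance method keys its cache on (self, args) and stores that cache on the class-level function object, so the wrapper instance and anything reachable from the cached arguments (for example a graph that holds GPU buffers) stays alive for as long as the class exists. That lifetime hazard is what the restored "# noqa: B019" comment suppresses, and it is the reason the now-reverted patch switched to a per-instance dict keyed by id(graph). The class and method names below are illustrative only.

import functools


class LeakyWrapper:
    # The cache lives on the class-level wrapped function and holds `self`
    # (and every argument) as strong keys, pinning instances indefinitely.
    @functools.cache  # the pattern flake8-bugbear B019 warns about
    def codegen_int_array(self, name: str) -> str:
        return f"const int64_t {name}[] = {{0}};"


class PerInstanceCacheWrapper:
    # The approach taken by the reverted fix: a cache owned by the instance,
    # keyed by id(...) so it holds no strong reference to the graph object.
    def __init__(self):
        self._cache = {}

    def codegen_int_array(self, name: str, graph=None) -> str:
        key = (name, id(graph) if graph is not None else None)
        if key not in self._cache:
            self._cache[key] = f"const int64_t {name}[] = {{0}};"
        return self._cache[key]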
