
Commit 192b96e

Revert "[AOTI] Fix a GPU memory leak caused by reference circle (pytorch#168063)"
This reverts commit cdca10b. Reverted pytorch#168063 on behalf of https://github.com/yangw-dev because it broke an internal test; the author was contacted to revert and re-land with a fix. The failing test was test_codegen_int_array_var_fix_memory_leak: self.assertTrue(allocated_memory[1] == allocated_memory[2]) raised AssertionError: False is not true ([comment](pytorch#168063 (comment)))
1 parent c055ebe commit 192b96e

2 files changed: +2, -66 lines changed


test/inductor/test_aot_inductor.py

Lines changed: 0 additions & 44 deletions
@@ -7437,50 +7437,6 @@ def forward(self, x):
             "RAIIAtenTensorHandle buf0(buf0_handle_restrided);"
         ).run(code)
 
-    def test_codegen_int_array_var_fix_memory_leak(self):
-        """
-        Fix https://github.com/pytorch/pytorch/issues/167630
-        """
-        if self.device != "cuda":
-            raise unittest.SkipTest("test is only for cuda")
-
-        def make_mlp(in_dim=128, hidden=256, out_dim=64, depth=3):
-            layers = []
-            d = in_dim
-            for _ in range(depth):
-                layers += [nn.Linear(d, hidden), nn.ReLU()]
-                d = hidden
-            layers += [nn.Linear(d, out_dim)]
-            return nn.Sequential(*layers)
-
-        batch = 32
-        in_dim = 2048
-        hidden = 512
-        out_dim = 10
-        depth = 6
-
-        import gc
-
-        allocated_memory = []
-        for _ in range(3):
-            torch.cuda.reset_peak_memory_stats()
-
-            model = make_mlp(in_dim, hidden, out_dim, depth).to(self.device)
-            example_inputs = (torch.randn(batch, in_dim, device=self.device),)
-            ep = torch.export.export(
-                model,
-                example_inputs,
-            )
-            torch._inductor.aoti_compile_and_package(ep)
-
-            del model, example_inputs, ep
-            torch.cuda.synchronize()
-            torch.cuda.empty_cache()
-            gc.collect()
-            allocated_memory.append(torch.cuda.memory_allocated())
-
-        self.assertTrue(allocated_memory[1] == allocated_memory[2])
-
     @unittest.skipIf(IS_MACOS, "might have no readelf on Mac")
     def test_libtorch_free_so(self):
         class Model(torch.nn.Module):
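For reference, the removed test exercised the leak by repeatedly exporting and AOT-compiling a small MLP, then asserting that steady-state GPU memory (iterations 2 and 3, after the warm-up run) stops growing. Below is a minimal sketch of that measurement idiom only; the run_once callable is a hypothetical stand-in for the export-plus-compile step and is not part of the original test.

import gc

import torch


def steady_state_cuda_memory(run_once, iters=3):
    # Run an arbitrary CUDA workload `iters` times and record how much GPU
    # memory is still allocated after each iteration, once the CUDA caching
    # allocator is drained and Python garbage has been collected.
    readings = []
    for _ in range(iters):
        run_once()  # e.g. torch.export.export(...) followed by aoti_compile_and_package(...)
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        gc.collect()
        readings.append(torch.cuda.memory_allocated())
    return readings


# readings[0] may include one-time allocations (library workspaces, caches);
# readings[1] == readings[2] is the per-iteration "nothing leaked" signal that
# the reverted assertion checked and that failed in the internal run.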

torch/_inductor/codegen/cpp_wrapper_cpu.py

Lines changed: 2 additions & 22 deletions
@@ -96,7 +96,6 @@ def __init__(self):
         self.include_extra_header = functools.lru_cache(None)(  # type: ignore[method-assign]
             self._include_extra_header
         )
-        self.codegen_int_array_var_cache = {}
 
     @staticmethod
     def create(
@@ -1637,33 +1636,14 @@ def codegen_memory_format(self, memory_format):
         self.used_cached_memory_formats.add(memory_format_str)
         return f"cached_torch_memory_format_{memory_format_str}"
 
+    @functools.cache  # noqa: B019
     def codegen_int_array_var(
         self,
         int_array: str,
         writeline: Callable[..., None],
         known_statically=False,
        graph=None,  # for per-graph caching
-    ) -> str:
-        # Use id(graph) for caching to avoid circular references
-        cache_key = (
-            int_array,
-            id(writeline),
-            known_statically,
-            id(graph) if graph else None,
-        )
-        if cache_key not in self.codegen_int_array_var_cache:
-            self.codegen_int_array_var_cache[cache_key] = (
-                self._codegen_int_array_var_impl(int_array, writeline, known_statically)
-            )
-
-        return self.codegen_int_array_var_cache[cache_key]
-
-    def _codegen_int_array_var_impl(
-        self,
-        int_array: str,
-        writeline: Callable[..., None],
-        known_statically: bool,
-    ) -> str:
+    ):
         # Used for size/stride declaration
         #
         # Because the memory planning is done in two passes (see the implementation
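Context on the restored decorator, as a hedged sketch rather than the PyTorch source itself: @functools.cache on an instance method keys its cache on (self, args) and stores that cache on the class-level function object, so the wrapper instance and anything reachable from the cached arguments (for example a graph that holds GPU buffers) stays alive for as long as the class exists. That lifetime hazard is what the restored "# noqa: B019" comment suppresses, and it is the reason the now-reverted patch switched to a per-instance dict keyed by id(graph). The class and method names below are illustrative only.

import functools


class LeakyWrapper:
    # The cache lives on the class-level wrapped function and holds `self`
    # (and every argument) as strong keys, pinning instances indefinitely.
    @functools.cache  # the pattern flake8-bugbear B019 warns about
    def codegen_int_array(self, name: str) -> str:
        return f"const int64_t {name}[] = {{0}};"


class PerInstanceCacheWrapper:
    # The approach taken by the reverted fix: a cache owned by the instance,
    # keyed by id(...) so it holds no strong reference to the graph object.
    def __init__(self):
        self._cache = {}

    def codegen_int_array(self, name: str, graph=None) -> str:
        key = (name, id(graph) if graph is not None else None)
        if key not in self._cache:
            self._cache[key] = f"const int64_t {name}[] = {{0}};"
        return self._cache[key]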
