@@ -264,6 +264,10 @@ def backward(ctx, grad_output):
  self.assertExpected(x_grad_desc, "x_grad_desc")
  self.assertExpected(y_grad_desc, "y_grad_desc")

+ # Avoid leaking memory
+ x.grad = None
+ y.grad = None
+
  def test_once_differentiable(self):
  class MyFunction(Function):
  @staticmethod
@@ -293,6 +297,10 @@ def backward(ctx, grad_output):
293297 "CopyBackwards(None, Error(AccumulateGrad(), None, AccumulateGrad()))",
294298 )
295299
300+ # Avoid leaking memory
301+ x.grad = None
302+ y.grad = None
303+
296304 def test_function_returns_input(self):
297305 class MyFunction(Function):
298306 @staticmethod
@@ -640,8 +648,8 @@ def fn(x):
  for g in should_not_execute:
  self.assertFalse(torch._C._will_engine_execute_node(g))

- b.register_hook(fn)
- c.register_hook(fn)
+ h1 = b.register_hook(fn)
+ h2 = c.register_hook(fn)

  # .backward(inputs=) is OK
  out = c.sum()
@@ -668,7 +676,7 @@ def fn(x):
  counter[0] += 1
  self.assertTrue(torch._C._will_engine_execute_node(b.grad_fn))

- b.register_hook(fn)
+ h3 = b.register_hook(fn)
  counter[0] = 0
  torch.autograd.grad(b.sum(), (a,))
  self.assertEqual(counter[0], 1)
@@ -680,6 +688,11 @@ def fn(x):
  with self.assertRaisesRegex(RuntimeError, "expects an grad_fn"):
  torch._C._will_engine_execute_node(out)

+ # Ensure we don't leak memory
+ h1.remove()
+ h2.remove()
+ h3.remove()
+
  def test_custom_function_vmap_defaults(self):
  class MySquare(Function):
  @staticmethod
@@ -899,6 +912,10 @@ def test_hessian_vector(self):
  self.assertEqual(x.grad, x_grad + x_hv)
  self.assertEqual(y.grad, y_grad + y_hv)

+ # Avoid leaking memory
+ x.grad = None
+ y.grad = None
+
  def test_grad(self):
  x = torch.randn(2, 2, requires_grad=True)
  y = torch.randn(2, 2, requires_grad=True)
@@ -924,6 +941,10 @@ def test_grad(self):
  self.assertEqual(x.grad, x_grad)
  self.assertEqual(y.grad, y_grad)

+ # Avoid leaking memory
+ x.grad = None
+ y.grad = None
+
  # Test that grad_outputs and outputs have the same shape
  grad_out = torch.ones(2)
  try:
@@ -1071,6 +1092,7 @@ def test_grad_fn_input_metadata(self):
  layout=torch.jagged,
  requires_grad=True,
  )
+
  nt_metadata = nt.clone().grad_fn._input_metadata[0]

  self.assertIsInstance(nt_metadata.shape[1], torch.SymInt)
@@ -2209,16 +2231,21 @@ def fn2(grad):

  b = torch.rand(3, 3, requires_grad=True)
  out1, out2 = fn(b)
- out1.register_hook(fn0)
- out2.register_hook(fn1)
+ h1 = out1.register_hook(fn0)
+ h2 = out2.register_hook(fn1)
  # node refers to two hook dicts
  # out1 no longer no longer points to its old hook dict
  out1.mul_(2)
  # fn2 is registered to out1's new hook dict
- out1.register_hook(fn2)
+ h3 = out1.register_hook(fn2)
  (out1 + out2 * 3).sum().backward()
  self.assertEqual(counts, [1, 1, 1])

+ # Avoid leaking memory
+ h1.remove()
+ h2.remove()
+ h3.remove()
+
  def test_tensor_hooks_inplace_over_view(self):
  # There might be a better UX here, but this is the way it is now
  count = [0]
@@ -2484,6 +2511,11 @@ def test_backward_with_nonleaf_inputs(self):
  )
  self.assertIsNone(z.grad)

+ # Avoid leaking memory
+ x.grad = None
+ y.grad = None
+ x_nonleaf.grad = None
+
  def test_dependent_backward(self):
  x = torch.randn(10, requires_grad=True)
  y = x**2
@@ -4445,6 +4477,7 @@ def hook(_):

  def test_current_graph_task_execution_order(self):
  predicted = [None]
+ all_hooks = []

  def hook(_):
  predicted[0] = torch._C._current_graph_task_execution_order()
@@ -4473,11 +4506,11 @@ def hook(t_):
  return hook

  for i, t in enumerate(tensors):
- t.register_hook(get_hook(i))
+ all_hooks.append(t.register_hook(get_hook(i)))

  # Basic example: single path
  t = torch.tensor(1.0, requires_grad=True).clone().sin().exp()
- t.register_hook(hook)
+ all_hooks.append(t.register_hook(hook))
  with torch.autograd.set_multithreading_enabled(False):
  t.backward()
  self.assertExpectedInline(
@@ -4494,7 +4527,7 @@ def hook(t_):
  d = a.cos()
  out = c * d
  register_logging_hooks(a, b, c, d, out)
- out.register_hook(hook)
+ all_hooks.append(out.register_hook(hook))
  with torch.autograd.set_multithreading_enabled(False):
  out.backward()
  self.assertEqual(predicted[0], grad_fns(*actual))
@@ -4506,7 +4539,7 @@ def hook(t_):
  c = a.cos()
  out = b * c
  register_logging_hooks(a, b, c, out)
- out.register_hook(hook)
+ all_hooks.append(out.register_hook(hook))
  with torch.autograd.set_multithreading_enabled(False):
  out.backward()
  self.assertEqual(predicted[0], grad_fns(*actual))
@@ -4519,7 +4552,7 @@ def hook(t_):
  out2 = b.cos()
  out3 = b.cos()
  register_logging_hooks(a, b, out, out2, out3)
- out3.register_hook(hook)
+ all_hooks.append(out3.register_hook(hook))
  with torch.autograd.set_multithreading_enabled(False):
  torch.autograd.grad((out, out3, out2), inputs=(a,))
  self.assertExpectedInline(
@@ -4537,7 +4570,7 @@ def hook(t_):
  b = a * 2
  out = b.sin()
  register_logging_hooks(a, b, out)
- out.register_hook(hook)
+ all_hooks.append(out.register_hook(hook))
  with torch.autograd.set_multithreading_enabled(False):
  out.backward()
  self.assertEqual(predicted[0], grad_fns(*actual))
@@ -4548,7 +4581,7 @@ def hook(t_):
  b = a * 2
  out = b.sin()
  register_logging_hooks(a, b, out)
- out.register_hook(hook)
+ all_hooks.append(out.register_hook(hook))
  with torch.autograd.set_multithreading_enabled(False):
  torch.autograd.grad((out,), inputs=(a, b))
  self.assertEqual(
@@ -4567,7 +4600,7 @@ def hook(t_):
  c = a * b
  out = c.sin()
  register_logging_hooks(a, b, c, out)
- out.register_hook(hook)
+ all_hooks.append(out.register_hook(hook))
  with torch.autograd.set_multithreading_enabled(False):
  torch.autograd.grad((out,), inputs=(a,))
  self.assertEqual(
@@ -4588,13 +4621,17 @@ def hook(t_):

  # Errors when context manager not enabled
  t = torch.tensor(1.0, requires_grad=True).clone().sin().exp()
- t.register_hook(hook)
+ all_hooks.append(t.register_hook(hook))
  with self.assertRaisesRegex(
  RuntimeError,
  "expects the current backward to be executed with multithreading disabled",
  ):
  t.backward()

+ # Avoid leaking memory
+ for h in all_hooks:
+     h.remove()
+
  @skipIfWindows(msg="node name demangling inconsistent on windows")
  def test_backward_hook_relative_ordering(self):
  order = []
@@ -12927,7 +12964,7 @@ def hook(grads):
  else:
  self.assertEqual(res, grad_is_none)

- torch.autograd.graph.register_multi_grad_hook((t1, t2, t3, t4), hook)
+ handle = torch.autograd.graph.register_multi_grad_hook((t1, t2, t3, t4), hook)

  out = (t2 * t3).sum()

@@ -12976,6 +13013,8 @@ def backward_retain_graph(out, t2, t3):
  self.assertEqual(err_count[0], 1)
  self.assertEqual(res, [False, True, True, False])

+ handle.remove()
+
  def test_multi_grad_any_hooks(self):
  # Multihooks should behave independently per execution of backward
  # Test that the hook fired the number of times we ran backward