
Commit d5f325a

Fix hanging trainer.test() (#142)
Closes #132.

Fixes an issue where trainer.test() hangs when the test DataLoader uses multiple workers. The issue is a bit strange, as I was only able to reproduce the hang with that exact setup. It does not occur with trainer.train() and multiple workers in the training DataLoader, and it also does not occur when num_workers for the test DataLoader is set to 0.

I'm not exactly sure what's going on, but testing actually finishes and the program hangs at torch.distributed.destroy_process_group(). It may be related to pytorch/pytorch#75097. The difference between trainer.train() and trainer.test() is that the former wraps the model in DDP while the latter doesn't (but still creates the process group).

In any case, the shutdown_remote cleanup code is not actually necessary: the CUDA cache cleanup is already called by the parent DDPSpawnPlugin on each worker, and torch.distributed.destroy_process_group() is not a public API (and is not called by PyTorch Lightning either).

The test added in this PR hangs prior to the changes to ray_ddp.py and passes after them.
1 parent 3c3e9d4 commit d5f325a
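For context, a minimal reproduction sketch of the hanging setup. It mirrors the test added in this PR; the import paths, the explicit ray.init(), and the use of a temporary directory in place of pytest's tmpdir fixture are assumptions, not part of the actual test code.

# Minimal reproduction sketch (mirrors the test added below); import paths
# and the standalone setup are assumptions based on the repo's test utilities.
import tempfile

import ray
from ray_lightning import RayPlugin
from ray_lightning.tests.utils import BoringModel, get_trainer

ray.init(num_cpus=2)

# BoringModel's test_dataloader uses num_workers=1 (see the utils.py change
# below), which is the combination that made trainer.test() hang.
model = BoringModel()
plugin = RayPlugin(num_workers=1, use_gpu=False)
trainer = get_trainer(
    tempfile.mkdtemp(), limit_train_batches=20, max_epochs=1, plugins=[plugin])

# Before this change: testing finishes, then the program hangs in the remote
# torch.distributed.destroy_process_group() call. After: returns normally.
trainer.test(model)

ray.shutdown()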

File tree

3 files changed: +11 -7 lines changed


ray_lightning/ray_ddp.py

Lines changed: 0 additions & 6 deletions

@@ -380,12 +380,6 @@ def post_dispatch(self, trainer: "pl.Trainer"):
                 .best_model_path = best_path
         # DDPSpawnPlugin.__recover_child_process_weights_end

-        def shutdown_remote():
-            torch.distributed.destroy_process_group()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
-        ray.get([w.execute.remote(shutdown_remote) for w in self.workers])
         for w in self.workers:
             ray.kill(w, no_restart=True)
             del w

ray_lightning/tests/test_ddp.py

Lines changed: 9 additions & 0 deletions

@@ -227,6 +227,15 @@ def test_train_client(tmpdir, start_ray_client_server_2_cpus, num_workers):
     train_test(trainer, model)


+def test_test_with_dataloader_workers(tmpdir, ray_start_2_cpus, seed):
+    """Tests trainer.test with >0 workers for data loading."""
+    model = BoringModel()
+    plugin = RayPlugin(num_workers=1, use_gpu=False)
+    trainer = get_trainer(
+        tmpdir, limit_train_batches=20, max_epochs=1, plugins=[plugin])
+    trainer.test(model)
+
+
 @pytest.mark.parametrize("num_workers", [1, 2])
 def test_load(tmpdir, ray_start_2_cpus, num_workers):
     """Tests if model checkpoint can be loaded."""

ray_lightning/tests/utils.py

Lines changed: 2 additions & 1 deletion

@@ -86,7 +86,8 @@ def val_dataloader(self):
         return torch.utils.data.DataLoader(RandomDataset(32, 64))

     def test_dataloader(self):
-        return torch.utils.data.DataLoader(RandomDataset(32, 64))
+        return torch.utils.data.DataLoader(
+            RandomDataset(32, 64), num_workers=1)

     def on_save_checkpoint(self, checkpoint):
         checkpoint["val_epoch"] = self.val_epoch
