
Commit 7b3bf48

awaelchli authored and lexierule committed
1.3.8 release commit
1 parent a6beac9 commit 7b3bf48

File tree

9 files changed: +24 -10 lines changed

CHANGELOG.md

Lines changed: 13 additions & 4 deletions

```diff
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 
+## [1.3.8] - 2021-06-30
+
+- Fixed a sync deadlock when checkpointing a `LightningModule` that uses a torchmetrics 0.4 `Metric` ([#8218](https://github.com/PyTorchLightning/pytorch-lightning/pull/8218))
+- Fixed compatibility with TorchMetrics v0.4 ([#8206](https://github.com/PyTorchLightning/pytorch-lightning/pull/8206))
+- Added a torchelastic check when sanitizing GPUs ([#8095](https://github.com/PyTorchLightning/pytorch-lightning/pull/8095))
+- Fixed a DDP info message that was never shown ([#8111](https://github.com/PyTorchLightning/pytorch-lightning/pull/8111))
+- Fixed the metrics deprecation message at module import level ([#8163](https://github.com/PyTorchLightning/pytorch-lightning/pull/8163))
+- Fixed a bug where infinite recursion was triggered when using the `BaseFinetuning` callback on a model that contains a `ModuleDict` ([#8170](https://github.com/PyTorchLightning/pytorch-lightning/pull/8170))
+- Added a mechanism to detect a DDP deadlock when only one process triggers an `Exception`; the mechanism kills the processes when this happens ([#8167](https://github.com/PyTorchLightning/pytorch-lightning/pull/8167))
+- Fixed an NCCL error when selecting non-consecutive device ids ([#8165](https://github.com/PyTorchLightning/pytorch-lightning/pull/8165))
+- Fixed SWA to also work with `IterableDataset` ([#8172](https://github.com/PyTorchLightning/pytorch-lightning/pull/8172))
+
+
 ## [1.3.7] - 2021-06-22
 
 - Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/PyTorchLightning/pytorch-lightning/pull/7975))
@@ -24,10 +37,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed setting `worker_init_fn` to seed dataloaders correctly when using DDP ([#7942](https://github.com/PyTorchLightning/pytorch-lightning/pull/7942))
 - Fixed `BaseFinetuning` callback to properly handle parent modules w/ parameters ([#7931](https://github.com/PyTorchLightning/pytorch-lightning/pull/7931))
 
-## [1.3.6] - 2021-06-DD
-
-- Fix compatibility TorchMetrics v0.4 ([#8206](https://github.com/PyTorchLightning/pytorch-lightning/pull/8206))
-
 
 ## [1.3.5] - 2021-06-08
 
```
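Of the entries above, the DDP deadlock detection (#8167) is the most mechanism-like: when a single rank raises while its peers are blocked in a collective, the healthy ranks would otherwise hang forever, so some out-of-band signal is needed to tear them down. A minimal sketch of that general idea, assuming a filesystem path visible to all ranks; the helper names and marker-file layout are illustrative, not Lightning's actual implementation:

```python
import os
import time
from pathlib import Path

# Illustrative shared directory visible to every rank; Lightning's real
# mechanism differs, this only sketches the idea in the changelog entry.
SYNC_DIR = Path("/tmp/ddp_failure_markers")


def run_guarded(rank: int, work):
    """Run `work()`; on failure, drop a marker so peer ranks can bail out."""
    SYNC_DIR.mkdir(parents=True, exist_ok=True)
    try:
        return work()
    except Exception:
        (SYNC_DIR / f"rank_{rank}.failed").touch()
        raise


def poll_for_peer_failure(interval: float = 1.0, timeout: float = 60.0) -> None:
    """Call while waiting at a rendezvous; kill the process if a peer failed."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if any(SYNC_DIR.glob("rank_*.failed")):
            os._exit(1)  # exit instead of hanging in the collective forever
        time.sleep(interval)
```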
pl_examples/basic_examples/autoencoder.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -115,7 +115,7 @@ def test_dataloader(self):
 
 
 def cli_main():
-    cli = LightningCLI(LitAutoEncoder, MyDataModule, seed_everything_default=1234)
+    cli = LightningCLI(LitAutoEncoder, MyDataModule, seed_everything_default=1234, save_config_callback=None)
     result = cli.trainer.test(cli.model, datamodule=cli.datamodule)
     print(result)
 
```
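The same one-line change appears in all three example scripts (this file and the two below): `save_config_callback=None` stops `LightningCLI` from registering its `SaveConfigCallback`, which would otherwise write a `config.yaml` into the trainer's log directory on every run. A self-contained sketch of the pattern, with `DemoModel` and `DemoData` as illustrative stand-ins for the examples' classes:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning import LightningDataModule, LightningModule
from pytorch_lightning.utilities.cli import LightningCLI


class DemoModel(LightningModule):  # illustrative stand-in, not from the repo
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


class DemoData(LightningDataModule):  # illustrative stand-in
    def train_dataloader(self):
        dataset = TensorDataset(torch.randn(64, 32), torch.randn(64, 2))
        return DataLoader(dataset, batch_size=8)


if __name__ == '__main__':
    # save_config_callback=None: skip writing config.yaml to the log dir
    cli = LightningCLI(DemoModel, DemoData, seed_everything_default=1234, save_config_callback=None)
```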
pl_examples/basic_examples/backbone_image_classifier.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -128,7 +128,7 @@ def test_dataloader(self):
 
 
 def cli_main():
-    cli = LightningCLI(LitClassifier, MyDataModule, seed_everything_default=1234)
+    cli = LightningCLI(LitClassifier, MyDataModule, seed_everything_default=1234, save_config_callback=None)
     result = cli.trainer.test(cli.model, datamodule=cli.datamodule)
     print(result)
 
```

pl_examples/basic_examples/simple_image_classifier.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -76,7 +76,7 @@ def configure_optimizers(self):
 
 
 def cli_main():
-    cli = LightningCLI(LitClassifier, MNISTDataModule, seed_everything_default=1234)
+    cli = LightningCLI(LitClassifier, MNISTDataModule, seed_everything_default=1234, save_config_callback=None)
     result = cli.trainer.test(cli.model, datamodule=cli.datamodule)
     print(result)
 
```

pytorch_lightning/__about__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,7 +1,7 @@
 import time
 
 _this_year = time.strftime("%Y")
-__version__ = '1.3.7post0'
+__version__ = '1.3.8'
 __author__ = 'William Falcon et al.'
 __author_email__ = '[email protected]'
 __license__ = 'Apache-2.0'
```

pytorch_lightning/plugins/training_type/ddp.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -296,7 +296,7 @@ def post_dispatch(self) -> None:
         self.cluster_environment.teardown()
 
     def barrier(self, *args, **kwargs) -> None:
-        if not torch_distrib.is_initialized():
+        if not (torch_distrib.is_available() and torch_distrib.is_initialized()):
             return
         if _TORCH_GREATER_EQUAL_1_8 and torch.distributed.get_backend() == "nccl":
             torch_distrib.barrier(device_ids=self.determine_ddp_device_ids())
```

pytorch_lightning/plugins/training_type/ddp_spawn.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -272,7 +272,7 @@ def __recover_child_process_weights(self, best_path, last_path):
         self.lightning_module.load_state_dict(ckpt)
 
     def barrier(self, *args, **kwargs) -> None:
-        if not torch_distrib.is_initialized():
+        if not (torch_distrib.is_available() and torch_distrib.is_initialized()):
             return
         if _TORCH_GREATER_EQUAL_1_8 and torch.distributed.get_backend() == "nccl":
             torch_distrib.barrier(device_ids=self.determine_ddp_device_ids())
```
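This guard, added identically in `ddp.py` above and `ddp_spawn.py` here, matters because `torch.distributed` is not compiled into every PyTorch build; on such builds the process-group state cannot be queried at all, so a collective call has to short-circuit instead of erroring. A standalone sketch of the same guard (`safe_barrier` is an illustrative name, not a Lightning function):

```python
import torch.distributed as torch_distrib


def safe_barrier() -> None:
    # Check is_available() first: on builds without distributed support the
    # process-group state cannot be queried, and in a plain single-process
    # run the default group was simply never initialized.
    if not (torch_distrib.is_available() and torch_distrib.is_initialized()):
        return  # quietly no-op rather than crash
    torch_distrib.barrier()
```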

pytorch_lightning/utilities/distributed.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -105,6 +105,10 @@ def rank_zero_info(*args, stacklevel: int = 4, **kwargs):
     _info(*args, stacklevel=stacklevel, **kwargs)
 
 
+def distributed_available() -> bool:
+    return torch.distributed.is_available() and torch.distributed.is_initialized() or tpu_distributed()
+
+
 def gather_all_tensors(result: Union[torch.Tensor], group: Optional[Any] = None):
     """
     Function to gather all tensors from several ddp processes onto a list that
```
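Under Python's operator precedence (`and` binds tighter than `or`), the new helper reads as `(is_available() and is_initialized()) or tpu_distributed()`, so the TPU branch acts as an independent escape hatch. A sketch of how such a guard is typically consumed; `broadcast_if_distributed` is illustrative, not part of this commit:

```python
import torch
import torch.distributed as torch_distrib


def distributed_available() -> bool:
    # same shape as the helper added above, minus the TPU branch
    return torch_distrib.is_available() and torch_distrib.is_initialized()


def broadcast_if_distributed(t: torch.Tensor, src: int = 0) -> torch.Tensor:
    """Broadcast `t` from rank `src` if a process group exists; no-op otherwise."""
    if distributed_available():
        torch_distrib.broadcast(t, src=src)
    return t
```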

requirements.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -10,3 +10,4 @@ tensorboard>=2.2.0, !=2.5.0  # 2.5.0 GPU CI error: 'Couldn't build proto file in
 torchmetrics>=0.2.0
 pyDeprecate==0.3.0
 packaging>=17.0
+pillow!=8.3.0  # TODO: Delete line after https://github.com/python-pillow/Pillow/issues/5571
```
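An exclusion pin like `pillow!=8.3.0` blocks exactly one broken release while accepting everything else. The equivalent check can be replicated at runtime with the `packaging` dependency already listed above; this import-time guard is an illustration, not something the commit adds:

```python
import PIL
from packaging.version import Version

# Mirror the requirements pin: refuse the one release known to be broken.
if Version(PIL.__version__) == Version("8.3.0"):
    raise RuntimeError(
        "Pillow 8.3.0 is excluded, see https://github.com/python-pillow/Pillow/issues/5571"
    )
```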
