
Commit 41a5c2a

carmocca authored and lexierule committed

Fix XLAEnvironment detection on TPU pod (#16806)

tpu fixes

1 parent fc5bab6
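
In effect, on a TPU pod the connector was overwriting the XLAEnvironment that the TPU strategy selects for itself, so a wrong environment plugin was used whenever `accelerator=tpu` and `devices > 1`. A hedged sketch of the behavior this commit establishes (illustrative only: it needs a TPU host with torch_xla installed, and it mirrors the assertions added to the tests below):

    from pytorch_lightning import Trainer
    from lightning_fabric.plugins.environments import XLAEnvironment

    trainer = Trainer(accelerator="tpu", devices=8)
    # Previously the connector replaced the strategy's XLAEnvironment with its
    # generic default; now the strategy's own environment is kept.
    assert isinstance(trainer.strategy.cluster_environment, XLAEnvironment)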

File tree

10 files changed: +90 −23 lines


src/lightning_fabric/CHANGELOG.md

Lines changed: 20 additions & 0 deletions
@@ -5,6 +5,26 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


+## [1.9.3] - 2023-MM-DD
+
+### Added
+
+
+### Changed
+
+
+### Deprecated
+
+-
+
+
+### Removed
+
+### Fixed
+
+- Fixed an issue causing a wrong environment plugin to be selected when `accelerator=tpu` and `devices > 1` ([#16806](https://github.com/Lightning-AI/lightning/pull/16806))
+
+
 ## [1.9.2] - 2023-02-15

 - Fixed an attribute error and improved input validation for invalid strategy types being passed to Fabric ([#16693](https://github.com/Lightning-AI/lightning/pull/16693))

src/lightning_fabric/connector.py

Lines changed: 3 additions & 1 deletion
@@ -511,7 +511,9 @@ def _lazy_init_strategy(self) -> None:
         if self.checkpoint_io:
             self.strategy.checkpoint_io = self.checkpoint_io
         if hasattr(self.strategy, "cluster_environment"):
-            self.strategy.cluster_environment = self.cluster_environment
+            if self.strategy.cluster_environment is None:
+                self.strategy.cluster_environment = self.cluster_environment
+            self.cluster_environment = self.strategy.cluster_environment
         if hasattr(self.strategy, "parallel_devices"):
             if self.strategy.parallel_devices:
                 self._parallel_devices = self.strategy.parallel_devices
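
The overwritten assignment in this hunk was the root cause: it unconditionally replaced whatever cluster environment the strategy already carried (the XLA strategy installs an XLAEnvironment for itself). A minimal sketch of the guard pattern, using stand-in classes rather than Lightning's real ones:

    class XLAEnvironment: ...        # stand-in: what an XLA strategy picks itself
    class LightningEnvironment: ...  # stand-in: the connector's generic default

    class Strategy:
        def __init__(self, cluster_environment=None):
            self.cluster_environment = cluster_environment

    def lazy_init(strategy, connector_default):
        # Fall back to the connector's default only when the strategy has not
        # already chosen an environment of its own.
        if strategy.cluster_environment is None:
            strategy.cluster_environment = connector_default
        # Mirror the final choice back so connector and strategy agree.
        return strategy.cluster_environment

    xla = Strategy(cluster_environment=XLAEnvironment())
    assert isinstance(lazy_init(xla, LightningEnvironment()), XLAEnvironment)

    plain = Strategy()
    assert isinstance(lazy_init(plain, LightningEnvironment()), LightningEnvironment)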

src/pytorch_lightning/CHANGELOG.md

Lines changed: 8 additions & 0 deletions
@@ -5,12 +5,20 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


+## [1.9.3] - YYYY-MM-DD
+
+### Fixed
+
+- Fixed an issue causing a wrong environment plugin to be selected when `accelerator=tpu` and `devices > 1` ([#16806](https://github.com/Lightning-AI/lightning/pull/16806))
+
+
 ## [1.9.2] - 2023-02-15

 ### Changed

 - Disabled strict loading in multiprocessing launcher ("ddp_spawn", etc.) when loading weights back into the main process ([#16365](https://github.com/Lightning-AI/lightning/pull/16365))

+
 ### Fixed

 - Fixed an attribute error and improved input validation for invalid strategy types being passed to Trainer ([#16693](https://github.com/Lightning-AI/lightning/pull/16693))

src/pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 4 additions & 2 deletions
@@ -276,7 +276,7 @@ def _check_config_and_set_final_flags(
         if strategy is not None and strategy not in self._registered_strategies and not isinstance(strategy, Strategy):
             raise ValueError(
                 f"You selected an invalid strategy name: `strategy={strategy!r}`."
-                " It must be either a string or an instance of `lightning.pytorch.strategies.Strategy`."
+                " It must be either a string or an instance of `pytorch_lightning.strategies.Strategy`."
                 " Example choices: ddp, ddp_spawn, deepspeed, dp, ..."
                 " Find a complete list of options in our documentation at https://lightning.ai"
             )
@@ -821,7 +821,9 @@ def _lazy_init_strategy(self) -> None:
         if self.checkpoint_io:
             self.strategy.checkpoint_io = self.checkpoint_io
         if hasattr(self.strategy, "cluster_environment"):
-            self.strategy.cluster_environment = self.cluster_environment
+            if self.strategy.cluster_environment is None:
+                self.strategy.cluster_environment = self.cluster_environment
+            self.cluster_environment = self.strategy.cluster_environment
         if hasattr(self.strategy, "parallel_devices"):
             if self.strategy.parallel_devices:
                 self._parallel_devices = self.strategy.parallel_devices
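
The first hunk is a message-only fix: the 1.9 branch ships the `pytorch_lightning` namespace, so the error text should not point at `lightning.pytorch`. The guarded path is easy to exercise (assuming pytorch_lightning 1.9 is installed):

    import pytest
    from pytorch_lightning import Trainer

    with pytest.raises(ValueError, match="You selected an invalid strategy name"):
        Trainer(strategy="not_a_strategy")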

tests/tests_fabric/test_connector.py

Lines changed: 3 additions & 0 deletions
@@ -36,6 +36,7 @@
     LSFEnvironment,
     SLURMEnvironment,
     TorchElasticEnvironment,
+    XLAEnvironment,
 )
 from lightning_fabric.plugins.io import TorchCheckpointIO
 from lightning_fabric.strategies import (
@@ -69,6 +70,8 @@ def test_accelerator_choice_tpu(accelerator, devices):
         # accelerator=tpu, devices=None (default) maps to devices=auto (8) and then chooses XLAStrategy
         # This behavior may change in the future: https://github.com/Lightning-AI/lightning/issues/10606
         assert isinstance(connector.strategy, XLAStrategy)
+        assert isinstance(connector.strategy.cluster_environment, XLAEnvironment)
+        assert isinstance(connector.cluster_environment, XLAEnvironment)
     else:
         assert isinstance(connector.strategy, SingleTPUStrategy)

tests/tests_pytorch/accelerators/test_tpu.py

Lines changed: 12 additions & 6 deletions
@@ -81,7 +81,8 @@ def test_if_test_works_after_train(tmpdir):


 @RunIf(skip_windows=True)
-def test_accelerator_cpu_with_tpu_cores_flag(tpu_available):
+@mock.patch("pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.set_world_ranks")
+def test_accelerator_cpu_with_tpu_cores_flag(_, tpu_available):
     assert TPUAccelerator.is_available()

     trainer = Trainer(accelerator="cpu", devices=8)
@@ -94,7 +95,8 @@ def test_accelerator_cpu_with_tpu_cores_flag(tpu_available):

 @RunIf(skip_windows=True)
 @pytest.mark.parametrize(["accelerator", "devices"], [("auto", 8), ("auto", "auto"), ("tpu", None)])
-def test_accelerator_tpu(accelerator, devices, tpu_available):
+@mock.patch("pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.set_world_ranks")
+def test_accelerator_tpu(_, accelerator, devices, tpu_available):
     assert TPUAccelerator.is_available()

     trainer = Trainer(accelerator=accelerator, devices=devices)
@@ -104,7 +106,8 @@ def test_accelerator_tpu(accelerator, devices, tpu_available):


 @RunIf(skip_windows=True)
-def test_accelerator_tpu_with_tpu_cores_priority(tpu_available):
+@mock.patch("pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.set_world_ranks")
+def test_accelerator_tpu_with_tpu_cores_priority(_, tpu_available):
     """Test for checking `tpu_cores` flag takes priority over `devices`."""
     tpu_cores = 8
     with pytest.warns(UserWarning, match="The flag `devices=1` will be ignored,"):
@@ -115,7 +118,8 @@ def test_accelerator_tpu_with_tpu_cores_priority(tpu_available):


 @RunIf(skip_windows=True)
-def test_set_devices_if_none_tpu(tpu_available):
+@mock.patch("pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.set_world_ranks")
+def test_set_devices_if_none_tpu(_, tpu_available):
     with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed in v2.0."):
         trainer = Trainer(accelerator="tpu", tpu_cores=8)
     assert isinstance(trainer.accelerator, TPUAccelerator)
@@ -202,7 +206,8 @@ def test_strategy_choice_tpu_str_ddp_spawn(tpu_available):


 @RunIf(skip_windows=True)
-def test_strategy_choice_tpu_str_tpu_spawn_debug(tpu_available):
+@mock.patch("pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.set_world_ranks")
+def test_strategy_choice_tpu_str_tpu_spawn_debug(_, tpu_available):
     trainer = Trainer(strategy="tpu_spawn_debug", accelerator="tpu", devices=8)
     assert isinstance(trainer.strategy, TPUSpawnStrategy)

@@ -286,7 +291,8 @@ def test_tpu_invalid_raises_set_precision_with_strategy(tpu_available):


 @RunIf(skip_windows=True)
-def test_xla_checkpoint_plugin_being_default(tpu_available):
+@mock.patch("pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.set_world_ranks")
+def test_xla_checkpoint_plugin_being_default(_, tpu_available):
     trainer = Trainer(accelerator="tpu", devices=8)
     assert isinstance(trainer.strategy.checkpoint_io, XLACheckpointIO)

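A note on the recurring change in this file: these tests only simulate TPU availability (the `tpu_available` fixture), and with the strategy now keeping an XLAEnvironment, `set_world_ranks` would presumably reach for the real XLA runtime to resolve ranks, so it is stubbed out. A self-contained sketch of the same decorator pattern on a toy class:

    from unittest import mock

    class Strategy:
        def set_world_ranks(self):
            raise RuntimeError("would ask the XLA runtime for ranks")

    @mock.patch.object(Strategy, "set_world_ranks")
    def test_world_ranks_stubbed(patched):
        Strategy().set_world_ranks()  # intercepted; the real method never runs
        patched.assert_called_once()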

tests/tests_pytorch/deprecated_api/test_remove_2-0.py

Lines changed: 2 additions & 1 deletion
@@ -90,7 +90,8 @@ def test_v2_0_0_deprecated_gpus(cuda_count_4):


 @RunIf(skip_windows=True)
-def test_v2_0_0_deprecated_tpu_cores(tpu_available):
+@mock.patch("pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.set_world_ranks")
+def test_v2_0_0_deprecated_tpu_cores(_, tpu_available):
     with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed in v2.0."):
         _ = Trainer(tpu_cores=8)


tests/tests_pytorch/strategies/test_registry.py

Lines changed: 4 additions & 1 deletion
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from unittest import mock
+
 import pytest

 from pytorch_lightning import Trainer
@@ -56,7 +58,8 @@ def test_deepspeed_strategy_registry_with_trainer(tmpdir, strategy):


 @RunIf(skip_windows=True)
-def test_tpu_spawn_debug_strategy_registry(xla_available):
+@mock.patch("pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.set_world_ranks")
+def test_tpu_spawn_debug_strategy_registry(_, xla_available):
     strategy = "tpu_spawn_debug"

     assert strategy in StrategyRegistry

tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py

Lines changed: 24 additions & 1 deletion
@@ -28,12 +28,14 @@
     LSFEnvironment,
     SLURMEnvironment,
     TorchElasticEnvironment,
+    XLAEnvironment,
 )
 from pytorch_lightning import Trainer
 from pytorch_lightning.accelerators.accelerator import Accelerator
 from pytorch_lightning.accelerators.cpu import CPUAccelerator
 from pytorch_lightning.accelerators.cuda import CUDAAccelerator
 from pytorch_lightning.accelerators.mps import MPSAccelerator
+from pytorch_lightning.accelerators.tpu import TPUAccelerator
 from pytorch_lightning.plugins import DoublePrecisionPlugin, LayerSync, NativeSyncBatchNorm, PrecisionPlugin
 from pytorch_lightning.plugins.io import TorchCheckpointIO
 from pytorch_lightning.strategies import (
@@ -45,6 +47,8 @@
     DDPStrategy,
     DeepSpeedStrategy,
     SingleDeviceStrategy,
+    SingleTPUStrategy,
+    TPUSpawnStrategy,
 )
 from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES
 from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy
@@ -59,6 +63,24 @@ def test_accelerator_choice_cpu(tmpdir):
     assert isinstance(trainer.strategy, SingleDeviceStrategy)


+@RunIf(tpu=True, standalone=True)
+@pytest.mark.parametrize(
+    ["accelerator", "devices"], [("tpu", None), ("tpu", 1), ("tpu", [1]), ("tpu", 8), ("auto", 1), ("auto", 8)]
+)
+@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
+def test_accelerator_choice_tpu(accelerator, devices):
+    connector = AcceleratorConnector(accelerator=accelerator, devices=devices)
+    assert isinstance(connector.accelerator, TPUAccelerator)
+    if devices is None or (isinstance(devices, int) and devices > 1):
+        # accelerator=tpu, devices=None (default) maps to devices=auto (8) and then chooses TPUSpawnStrategy
+        # This behavior may change in the future: https://github.com/Lightning-AI/lightning/issues/10606
+        assert isinstance(connector.strategy, TPUSpawnStrategy)
+        assert isinstance(connector.strategy.cluster_environment, XLAEnvironment)
+        assert isinstance(connector.cluster_environment, XLAEnvironment)
+    else:
+        assert isinstance(connector.strategy, SingleTPUStrategy)
+
+
 def test_accelerator_invalid_choice():
     with pytest.raises(ValueError, match="You selected an invalid accelerator name: `accelerator='invalid'`"):
         Trainer(accelerator="invalid")
@@ -265,7 +287,8 @@ def test_interactive_compatible_dp_strategy_gpu(mps_count_0, cuda_count_2, monkeypatch):


 @RunIf(skip_windows=True)
-def test_interactive_compatible_strategy_tpu(tpu_available, monkeypatch):
+@mock.patch("pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.set_world_ranks")
+def test_interactive_compatible_strategy_tpu(_, tpu_available, monkeypatch):
     monkeypatch.setattr(pytorch_lightning.trainer.connectors.accelerator_connector, "_IS_INTERACTIVE", True)
     trainer = Trainer(accelerator="tpu")
     assert trainer.strategy.launcher.is_interactive_compatible
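
Unlike the mocked tests above, the new `test_accelerator_choice_tpu` runs standalone on real TPUs; `mock.patch.dict(os.environ, os.environ.copy(), clear=True)` hands it a throwaway copy of the environment, so variables set while the connector and XLA runtime initialize are rolled back on exit. A minimal demonstration of that rollback (the variable name is hypothetical):

    import os
    from unittest import mock

    @mock.patch.dict(os.environ, os.environ.copy(), clear=True)
    def mutate_environment():
        os.environ["HYPOTHETICAL_TPU_VAR"] = "1"  # visible only inside the patch

    mutate_environment()
    assert "HYPOTHETICAL_TPU_VAR" not in os.environ  # restored afterwards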

tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py

Lines changed: 10 additions & 11 deletions
@@ -15,7 +15,6 @@
 import logging
 import os
 from unittest import mock
-from unittest.mock import PropertyMock

 import pytest
 import torch
@@ -152,19 +151,19 @@ def test_num_stepping_batches_with_tpu_single():
     assert trainer.estimated_stepping_batches == len(model.train_dataloader())


+class MultiprocessModel(BoringModel):
+    def on_train_start(self):
+        assert self.trainer.world_size == 8
+        assert self.trainer.estimated_stepping_batches == len(self.train_dataloader()) // 8
+
+
 @RunIf(tpu=True)
-@mock.patch(
-    "pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.root_device",
-    new_callable=PropertyMock,
-    return_value=torch.device("xla:0"),
-)
-def test_num_stepping_batches_with_tpu_multi(_):
+@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
+def test_num_stepping_batches_with_tpu_multi():
     """Test stepping batches with the TPU strategy across multiple devices."""
     trainer = Trainer(accelerator="tpu", devices=8, max_epochs=1)
-    model = BoringModel()
-    trainer._data_connector.attach_data(model)
-    trainer.strategy.connect(model)
-    assert trainer.estimated_stepping_batches == len(model.train_dataloader()) // 8
+    model = MultiprocessModel()
+    trainer.fit(model)


 @mock.patch("pytorch_lightning.accelerators.ipu.IPUAccelerator.is_available", return_value=True)
