From 9e46f024af5bb0235811e4eff15e0119781c4e0e Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 21 Aug 2025 13:53:06 +0200 Subject: [PATCH 01/12] add missing device id for pytorch 2.8 --- src/lightning/fabric/strategies/ddp.py | 7 ++++++- src/lightning/fabric/strategies/fsdp.py | 7 ++++++- src/lightning/fabric/strategies/model_parallel.py | 7 ++++++- src/lightning/pytorch/strategies/ddp.py | 7 ++++++- src/lightning/pytorch/strategies/fsdp.py | 7 ++++++- src/lightning/pytorch/strategies/model_parallel.py | 7 ++++++- 6 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index ce47e4e403c34..1e9e163533d6a 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -212,7 +212,12 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 972cb0a2cd840..38b540a5c0178 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -663,7 +663,12 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index ace23a9c7a2c5..8340ce55c88bc 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -302,7 +302,12 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index fd3f66ef42471..4b612464c9315 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -200,7 +200,12 @@ def setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() 
assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 55ea354a5cb60..df282f19dc5f6 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -260,7 +260,12 @@ def setup_environment(self) -> None: self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) # if 'device_mesh' in the `kwargs` is provided as a tuple, update it into the `DeviceMesh` object here if isinstance(self.kwargs.get("device_mesh"), tuple): diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index 82fec205af731..ec140c86e8914 100644 --- a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -350,7 +350,12 @@ def _setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) From c2532802b44ce4661aff37eb4241c7ad1cd81a90 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Mon, 25 Aug 2025 13:56:33 +0200 Subject: [PATCH 02/12] skip device id for older pytorch versions --- src/lightning/fabric/strategies/ddp.py | 7 +++++-- src/lightning/fabric/strategies/fsdp.py | 6 ++++-- src/lightning/fabric/strategies/model_parallel.py | 6 ++++-- src/lightning/pytorch/strategies/ddp.py | 8 +++++--- src/lightning/pytorch/strategies/fsdp.py | 8 +++++--- src/lightning/pytorch/strategies/model_parallel.py | 8 +++++--- 6 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index 1e9e163533d6a..da3249e83289d 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -41,6 +41,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.rank_zero import rank_zero_only _DDP_FORK_ALIASES = ( @@ -212,11 +213,13 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if 
_TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 38b540a5c0178..87261c799e535 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -663,11 +663,13 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index 8340ce55c88bc..4938cd18f9432 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -302,11 +302,13 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index 4b612464c9315..8c5a4e7079b3f 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -36,7 +36,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import _IS_WINDOWS +from lightning.fabric.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.optimizer import _optimizers_to_device from lightning.fabric.utilities.seed import reset_seed from lightning.fabric.utilities.types import ReduceOp @@ -200,11 +200,13 @@ def setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index df282f19dc5f6..52197a270d983 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -61,7 +61,7 @@ _sync_ddp_if_available, ) from 
lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.init import _has_meta_device_parameters_or_buffers from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors from lightning.fabric.utilities.optimizer import _optimizers_to_device @@ -260,11 +260,13 @@ def setup_environment(self) -> None: self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) # if 'device_mesh' in the `kwargs` is provided as a tuple, update it into the `DeviceMesh` object here diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index ec140c86e8914..6381cf88a9306 100644 --- a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -39,7 +39,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_4 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3, _TORCH_GREATER_EQUAL_2_4 from lightning.fabric.utilities.init import _materialize_distributed_module from lightning.fabric.utilities.load import _METADATA_FILENAME from lightning.fabric.utilities.optimizer import _optimizers_to_device @@ -350,11 +350,13 @@ def _setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) def _get_process_group_backend(self) -> str: From 927167ef0b97403f8be52b54ad887a9c2f79a9e4 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Mon, 25 Aug 2025 13:57:12 +0200 Subject: [PATCH 03/12] add testing --- tests/tests_fabric/strategies/test_ddp.py | 41 +++++++++++++++++++++- tests/tests_pytorch/strategies/test_ddp.py | 29 ++++++++++++++- 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index fa5c975228a5e..897dc725de0d6 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -169,5 +169,44 @@ def test_set_timeout(init_process_group_mock): global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + ) + + +@mock.patch("torch.distributed.init_process_group") +def 
test_device_id_passed_for_cuda_devices(init_process_group_mock): + """Test that device_id is passed to init_process_group for CUDA devices but not for CPU.""" + # Test with CPU device - device_id should be None + cpu_strategy = DDPStrategy(parallel_devices=[torch.device("cpu")]) + cpu_strategy.cluster_environment = LightningEnvironment() + cpu_strategy.accelerator = Mock() + cpu_strategy.setup_environment() + + process_group_backend = cpu_strategy._get_process_group_backend() + global_rank = cpu_strategy.cluster_environment.global_rank() + world_size = cpu_strategy.cluster_environment.world_size() + + init_process_group_mock.assert_called_with( + process_group_backend, rank=global_rank, world_size=world_size, timeout=cpu_strategy._timeout, device_id=None + ) + + init_process_group_mock.reset_mock() + + # Test with CUDA device - device_id should be the device + cuda_device = torch.device("cuda", 0) + cuda_strategy = DDPStrategy(parallel_devices=[cuda_device]) + cuda_strategy.cluster_environment = LightningEnvironment() + cuda_strategy.accelerator = Mock() + cuda_strategy.setup_environment() + + process_group_backend = cuda_strategy._get_process_group_backend() + global_rank = cuda_strategy.cluster_environment.global_rank() + world_size = cuda_strategy.cluster_environment.world_size() + + init_process_group_mock.assert_called_with( + process_group_backend, + rank=global_rank, + world_size=world_size, + timeout=cuda_strategy._timeout, + device_id=cuda_device, ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 915e57440b40f..de02ec8f96699 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -133,7 +133,34 @@ def test_set_timeout(mock_init_process_group): global_rank = trainer.strategy.cluster_environment.global_rank() world_size = trainer.strategy.cluster_environment.world_size() mock_init_process_group.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + ) + + +@mock.patch("torch.distributed.init_process_group") +def test_device_id_passed_for_cuda_devices_pytorch(mock_init_process_group): + """Test that device_id is passed to init_process_group for CUDA devices but not for CPU.""" + # Test with CPU device - device_id should be None + model = BoringModel() + ddp_strategy = DDPStrategy() + trainer = Trainer( + max_epochs=1, + accelerator="cpu", + strategy=ddp_strategy, + ) + trainer.strategy.connect(model) + trainer.lightning_module.trainer = trainer + trainer.strategy.setup_environment() + + process_group_backend = trainer.strategy._get_process_group_backend() + global_rank = trainer.strategy.cluster_environment.global_rank() + world_size = trainer.strategy.cluster_environment.world_size() + mock_init_process_group.assert_called_with( + process_group_backend, + rank=global_rank, + world_size=world_size, + timeout=trainer.strategy._timeout, + device_id=None, ) From b1a7e1c89fc509059614505b7963aecdf11d7071 Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 07:00:56 +0000 Subject: [PATCH 04/12] fix mypy without touching submodule --- src/lightning/fabric/strategies/ddp.py | 2 +- src/lightning/fabric/strategies/fsdp.py | 2 +- src/lightning/fabric/strategies/model_parallel.py | 2 +- src/lightning/pytorch/strategies/ddp.py | 2 +- src/lightning/pytorch/strategies/fsdp.py | 2 +- 
src/lightning/pytorch/strategies/model_parallel.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index da3249e83289d..b1b7862e6e1e0 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -213,7 +213,7 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 87261c799e535..f4ac0ee8f3338 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -663,7 +663,7 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index 4938cd18f9432..906fbeff1c7dc 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -302,7 +302,7 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index 8c5a4e7079b3f..246cf0f3f8795 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -200,7 +200,7 @@ def setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 52197a270d983..a2fab2520e8ac 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -260,7 +260,7 @@ def setup_environment(self) -> None: self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index 6381cf88a9306..bbfb7c1e1cbd1 100644 --- 
a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -350,7 +350,7 @@ def _setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( From 1ff12053846d07bacd0fdb2d6f05bf22678651d0 Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 07:25:29 +0000 Subject: [PATCH 05/12] fix failing tests --- tests/tests_fabric/strategies/test_ddp.py | 6 +++++- tests/tests_fabric/strategies/test_fsdp.py | 2 +- tests/tests_fabric/strategies/test_model_parallel.py | 2 +- tests/tests_pytorch/strategies/test_ddp.py | 6 +++++- tests/tests_pytorch/strategies/test_fsdp.py | 2 +- tests/tests_pytorch/strategies/test_model_parallel.py | 2 +- 6 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index 897dc725de0d6..c6cab24a2eb44 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -25,6 +25,7 @@ from lightning.fabric.plugins.environments import LightningEnvironment from lightning.fabric.strategies import DDPStrategy from lightning.fabric.strategies.ddp import _DDPBackwardSyncControl +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from tests_fabric.helpers.runif import RunIf @@ -168,8 +169,11 @@ def test_set_timeout(init_process_group_mock): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index d5f82752a9176..ec8cdffb1482a 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -382,7 +382,7 @@ def test_set_timeout(init_process_group_mock): global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None ) diff --git a/tests/tests_fabric/strategies/test_model_parallel.py b/tests/tests_fabric/strategies/test_model_parallel.py index d044626bf8389..b773f0bf428b6 100644 --- a/tests/tests_fabric/strategies/test_model_parallel.py +++ b/tests/tests_fabric/strategies/test_model_parallel.py @@ -317,7 +317,7 @@ def test_set_timeout(init_process_group_mock, _): global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, 
device_id=None ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index de02ec8f96699..d16050389cf4c 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -20,6 +20,7 @@ from torch.nn.parallel import DistributedDataParallel from lightning.fabric.plugins.environments import LightningEnvironment +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.plugins import DoublePrecision, HalfPrecision, Precision @@ -155,12 +156,15 @@ def test_device_id_passed_for_cuda_devices_pytorch(mock_init_process_group): process_group_backend = trainer.strategy._get_process_group_backend() global_rank = trainer.strategy.cluster_environment.global_rank() world_size = trainer.strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None mock_init_process_group.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=trainer.strategy._timeout, - device_id=None, + **kwargs, ) diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index 560ab19f823ca..561cde8fb8a25 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -533,7 +533,7 @@ def test_set_timeout(init_process_group_mock): global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None ) diff --git a/tests/tests_pytorch/strategies/test_model_parallel.py b/tests/tests_pytorch/strategies/test_model_parallel.py index 86a95944ac20d..93fa7e5f9124d 100644 --- a/tests/tests_pytorch/strategies/test_model_parallel.py +++ b/tests/tests_pytorch/strategies/test_model_parallel.py @@ -203,7 +203,7 @@ def test_set_timeout(init_process_group_mock, _): global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None ) From 0a76e7f9ccb9f95c306a58c6831327efb4a8e8d3 Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 07:41:36 +0000 Subject: [PATCH 06/12] update --- tests/tests_fabric/strategies/test_ddp.py | 6 ++++-- tests/tests_pytorch/strategies/test_ddp.py | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index c6cab24a2eb44..c755a52d5d08d 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -206,11 +206,13 @@ def test_device_id_passed_for_cuda_devices(init_process_group_mock): process_group_backend = cuda_strategy._get_process_group_backend() global_rank = cuda_strategy.cluster_environment.global_rank() world_size = cuda_strategy.cluster_environment.world_size() - + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( 
process_group_backend, rank=global_rank, world_size=world_size, timeout=cuda_strategy._timeout, - device_id=cuda_device, + **kwargs, ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index d16050389cf4c..a46ec79ca3399 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -133,8 +133,11 @@ def test_set_timeout(mock_init_process_group): process_group_backend = trainer.strategy._get_process_group_backend() global_rank = trainer.strategy.cluster_environment.global_rank() world_size = trainer.strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None mock_init_process_group.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) From 319cb16d94051e70ce834bcfea8586a5590ee1bf Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 07:59:14 +0000 Subject: [PATCH 07/12] maybe --- tests/tests_fabric/strategies/test_ddp.py | 6 ++++-- tests/tests_fabric/strategies/test_fsdp.py | 7 +++++-- tests/tests_fabric/strategies/test_model_parallel.py | 6 +++++- tests/tests_pytorch/strategies/test_fsdp.py | 7 +++++-- tests/tests_pytorch/strategies/test_model_parallel.py | 6 +++++- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index c755a52d5d08d..b10223f42fbaf 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -189,9 +189,11 @@ def test_device_id_passed_for_cuda_devices(init_process_group_mock): process_group_backend = cpu_strategy._get_process_group_backend() global_rank = cpu_strategy.cluster_environment.global_rank() world_size = cpu_strategy.cluster_environment.world_size() - + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=cpu_strategy._timeout, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=cpu_strategy._timeout, **kwargs ) init_process_group_mock.reset_mock() diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index ec8cdffb1482a..439278d71cc22 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -31,7 +31,7 @@ _get_full_state_dict_context, _is_sharded_checkpoint, ) -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3 def test_custom_mixed_precision(): @@ -381,8 +381,11 @@ def test_set_timeout(init_process_group_mock): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_fabric/strategies/test_model_parallel.py 
b/tests/tests_fabric/strategies/test_model_parallel.py index b773f0bf428b6..1cfbb1c5a7a61 100644 --- a/tests/tests_fabric/strategies/test_model_parallel.py +++ b/tests/tests_fabric/strategies/test_model_parallel.py @@ -25,6 +25,7 @@ from lightning.fabric.strategies import ModelParallelStrategy from lightning.fabric.strategies.fsdp import _is_sharded_checkpoint from lightning.fabric.strategies.model_parallel import _ParallelBackwardSyncControl +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from tests_fabric.helpers.runif import RunIf @@ -316,8 +317,11 @@ def test_set_timeout(init_process_group_mock, _): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index 561cde8fb8a25..ec1f7a5a27029 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -18,7 +18,7 @@ from lightning.fabric.plugins.environments import LightningEnvironment from lightning.fabric.strategies.fsdp import _is_sharded_checkpoint -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.load import _load_distributed_checkpoint from lightning.pytorch import Trainer from lightning.pytorch.callbacks import ModelCheckpoint @@ -532,8 +532,11 @@ def test_set_timeout(init_process_group_mock): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_pytorch/strategies/test_model_parallel.py b/tests/tests_pytorch/strategies/test_model_parallel.py index 93fa7e5f9124d..58083af255655 100644 --- a/tests/tests_pytorch/strategies/test_model_parallel.py +++ b/tests/tests_pytorch/strategies/test_model_parallel.py @@ -22,6 +22,7 @@ import torch.nn as nn from lightning.fabric.strategies.model_parallel import _is_sharded_checkpoint +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.pytorch import LightningModule from lightning.pytorch.plugins.environments import LightningEnvironment from lightning.pytorch.strategies import ModelParallelStrategy @@ -202,8 +203,11 @@ def test_set_timeout(init_process_group_mock, _): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, 
device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) From 58bd573420e5f384a5fcc8103e2cc9ea9f2daae5 Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 08:14:22 +0000 Subject: [PATCH 08/12] meow --- tests/tests_fabric/strategies/test_ddp.py | 6 +++--- tests/tests_fabric/strategies/test_fsdp.py | 2 +- tests/tests_fabric/strategies/test_model_parallel.py | 2 +- tests/tests_pytorch/strategies/test_ddp.py | 4 ++-- tests/tests_pytorch/strategies/test_fsdp.py | 2 +- tests/tests_pytorch/strategies/test_model_parallel.py | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index b10223f42fbaf..f302da5d1bc4f 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -171,7 +171,7 @@ def test_set_timeout(init_process_group_mock): world_size = strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) @@ -191,7 +191,7 @@ def test_device_id_passed_for_cuda_devices(init_process_group_mock): world_size = cpu_strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = cpu_strategy.root_device if cpu_strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=cpu_strategy._timeout, **kwargs ) @@ -210,7 +210,7 @@ def test_device_id_passed_for_cuda_devices(init_process_group_mock): world_size = cuda_strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = cuda_strategy.root_device if cuda_strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index 439278d71cc22..6be379d36582c 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -383,7 +383,7 @@ def test_set_timeout(init_process_group_mock): world_size = strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_fabric/strategies/test_model_parallel.py b/tests/tests_fabric/strategies/test_model_parallel.py index 1cfbb1c5a7a61..0e38f6e7777d1 100644 --- a/tests/tests_fabric/strategies/test_model_parallel.py +++ b/tests/tests_fabric/strategies/test_model_parallel.py @@ -319,7 +319,7 @@ def test_set_timeout(init_process_group_mock, _): world_size = strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, 
timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index a46ec79ca3399..823d77d0d5848 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -135,7 +135,7 @@ def test_set_timeout(mock_init_process_group): world_size = trainer.strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = trainer.strategy.root_device if trainer.strategy.root_device.type != "cpu" else None mock_init_process_group.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) @@ -161,7 +161,7 @@ def test_device_id_passed_for_cuda_devices_pytorch(mock_init_process_group): world_size = trainer.strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = trainer.strategy.root_device if trainer.strategy.root_device.type != "cpu" else None mock_init_process_group.assert_called_with( process_group_backend, rank=global_rank, diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index ec1f7a5a27029..f7c15b5930be8 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -534,7 +534,7 @@ def test_set_timeout(init_process_group_mock): world_size = strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_pytorch/strategies/test_model_parallel.py b/tests/tests_pytorch/strategies/test_model_parallel.py index 58083af255655..c803c10afa4b4 100644 --- a/tests/tests_pytorch/strategies/test_model_parallel.py +++ b/tests/tests_pytorch/strategies/test_model_parallel.py @@ -205,7 +205,7 @@ def test_set_timeout(init_process_group_mock, _): world_size = strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) From cf7fbe328951d28d821cb812436782c123440977 Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 09:44:11 +0000 Subject: [PATCH 09/12] update --- tests/tests_pytorch/strategies/test_ddp_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/strategies/test_ddp_integration.py b/tests/tests_pytorch/strategies/test_ddp_integration.py index 048403366ebc7..fc3a8cfebbac0 100644 --- a/tests/tests_pytorch/strategies/test_ddp_integration.py +++ b/tests/tests_pytorch/strategies/test_ddp_integration.py @@ -66,7 +66,7 @@ def test_multi_gpu_model_ddp_fit_test(tmp_path): assert out["test_acc"] > 0.7 -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, max_torch="2.7") @mock.patch("torch.cuda.set_device") @mock.patch("lightning.pytorch.accelerators.cuda._check_cuda_matmul_precision") @mock.patch("lightning.pytorch.accelerators.cuda._clear_cuda_memory") From c326a34e603639f15ace902a3adc1d2a16635c19 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Mon, 1 Sep 
2025 13:09:10 +0200 Subject: [PATCH 10/12] chlog --- src/lightning/fabric/CHANGELOG.md | 4 ++-- src/lightning/pytorch/CHANGELOG.md | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 16cd42adc90d9..cb37fcc6a937f 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -24,7 +24,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- +- Fixed missing `device_id` when initializing the distributed process group with PyTorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105)) --- @@ -33,7 +33,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- Added support for NVIDIA H200 GPUs in `get_available_flops` ([#20913](https://github.com/Lightning-AI/pytorch-lightning/pull/21119)) +- Added support for NVIDIA H200 GPUs in `get_available_flops` ([#21119](https://github.com/Lightning-AI/pytorch-lightning/pull/21119)) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 176e34273d776..03664c8e2d1ad 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -31,6 +31,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed callbacks by defer step/time-triggered `ModelCheckpoint` saves until validation metrics are available ([#21106](https://github.com/Lightning-AI/pytorch-lightning/pull/21106)) +- Fixed missing `device_id` when initializing the distributed process group with PyTorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105)) + --- From 00608a3818370391bd585960fa6cce18d21617b2 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Mon, 1 Sep 2025 13:10:24 +0200 Subject: [PATCH 11/12] Apply suggestions from code review --- src/lightning/fabric/strategies/ddp.py | 2 +- src/lightning/fabric/strategies/fsdp.py | 2 +- src/lightning/fabric/strategies/model_parallel.py | 2 +- src/lightning/pytorch/strategies/ddp.py | 2 +- src/lightning/pytorch/strategies/fsdp.py | 2 +- src/lightning/pytorch/strategies/model_parallel.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index b1b7862e6e1e0..330dc5c6795e1 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -219,7 +219,7 @@ def _setup_distributed(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index f4ac0ee8f3338..5904f7634a360 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -669,7 +669,7 @@ def _setup_distributed(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index 906fbeff1c7dc..f67661acff2b1 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -308,7 +308,7 @@ def _setup_distributed(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) def 
_get_process_group_backend(self) -> str: diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index 246cf0f3f8795..e7630f9662735 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -206,7 +206,7 @@ def setup_distributed(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index a2fab2520e8ac..a4e5b6e52c949 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -266,7 +266,7 @@ def setup_environment(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) # if 'device_mesh' in the `kwargs` is provided as a tuple, update it into the `DeviceMesh` object here diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index bbfb7c1e1cbd1..aa21f7be2963c 100644 --- a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -356,7 +356,7 @@ def _setup_distributed(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) def _get_process_group_backend(self) -> str: From a8b89487a5095c152dfc72239b212127d36aa620 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 Sep 2025 11:11:57 +0000 Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/strategies/ddp.py | 6 +----- src/lightning/fabric/strategies/fsdp.py | 6 +----- src/lightning/fabric/strategies/model_parallel.py | 6 +----- src/lightning/pytorch/strategies/ddp.py | 6 +----- src/lightning/pytorch/strategies/fsdp.py | 6 +----- src/lightning/pytorch/strategies/model_parallel.py | 6 +----- 6 files changed, 6 insertions(+), 30 deletions(-) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index 330dc5c6795e1..e826b910c16d3 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -216,11 +216,7 @@ def _setup_distributed(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 5904f7634a360..baaee74af0ec9 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -666,11 +666,7 @@ def _setup_distributed(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, 
self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index f67661acff2b1..0d49ddf91a0bc 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -305,11 +305,7 @@ def _setup_distributed(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index e7630f9662735..92206e1accc31 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -203,11 +203,7 @@ def setup_distributed(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index a4e5b6e52c949..3fbd0f9cd5f0a 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -263,11 +263,7 @@ def setup_environment(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) # if 'device_mesh' in the `kwargs` is provided as a tuple, update it into the `DeviceMesh` object here if isinstance(self.kwargs.get("device_mesh"), tuple): diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index aa21f7be2963c..e0286dbe2e0e6 100644 --- a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -353,11 +353,7 @@ def _setup_distributed(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device)
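
For readers reviewing the series, the gating that every strategy now applies can be reproduced outside of Lightning in a few lines. The sketch below is illustrative only and is not part of the patches: it assumes a single-process env:// rendezvous on localhost and uses `packaging.version` in place of Lightning's `_TORCH_GREATER_EQUAL_2_3` flag.

    # Standalone sketch of the version-gated device_id pattern (assumptions noted above).
    import os
    from typing import Any

    import torch
    import torch.distributed as dist
    from packaging.version import Version


    def init_distributed(root_device: torch.device) -> None:
        # env:// rendezvous needs an address and port even for a single process.
        os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
        os.environ.setdefault("MASTER_PORT", "29500")

        backend = "nccl" if root_device.type == "cuda" else "gloo"
        kwargs: dict[str, Any] = {}
        # device_id is only accepted by newer torch releases, and it must stay unset
        # for CPU process groups -- the same two checks the strategies make above.
        if Version(torch.__version__).release >= (2, 3) and root_device.type != "cpu":
            kwargs["device_id"] = root_device

        dist.init_process_group(backend, rank=0, world_size=1, **kwargs)


    if __name__ == "__main__":
        device = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")
        init_distributed(device)
        dist.destroy_process_group()

The design point mirrored from the patches: `device_id` is forwarded only when the installed torch accepts the argument and the root device is an accelerator, so CPU runs and older torch versions keep the previous `init_process_group` call signature.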