From 9e46f024af5bb0235811e4eff15e0119781c4e0e Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 21 Aug 2025 13:53:06 +0200 Subject: [PATCH 01/12] add missing device id for pytorch 2.8 --- src/lightning/fabric/strategies/ddp.py | 7 ++++++- src/lightning/fabric/strategies/fsdp.py | 7 ++++++- src/lightning/fabric/strategies/model_parallel.py | 7 ++++++- src/lightning/pytorch/strategies/ddp.py | 7 ++++++- src/lightning/pytorch/strategies/fsdp.py | 7 ++++++- src/lightning/pytorch/strategies/model_parallel.py | 7 ++++++- 6 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index ce47e4e403c34..1e9e163533d6a 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -212,7 +212,12 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 972cb0a2cd840..38b540a5c0178 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -663,7 +663,12 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index ace23a9c7a2c5..8340ce55c88bc 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -302,7 +302,12 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index fd3f66ef42471..4b612464c9315 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -200,7 +200,12 @@ def setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() 
assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 55ea354a5cb60..df282f19dc5f6 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -260,7 +260,12 @@ def setup_environment(self) -> None: self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) # if 'device_mesh' in the `kwargs` is provided as a tuple, update it into the `DeviceMesh` object here if isinstance(self.kwargs.get("device_mesh"), tuple): diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index 82fec205af731..ec140c86e8914 100644 --- a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -350,7 +350,12 @@ def _setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + _init_dist_connection( + self.cluster_environment, + self._process_group_backend, + timeout=self._timeout, + device_id=self.root_device if self.root_device.type != "cpu" else None, + ) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) From c2532802b44ce4661aff37eb4241c7ad1cd81a90 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Mon, 25 Aug 2025 13:56:33 +0200 Subject: [PATCH 02/12] skip device id for older pytorch versions --- src/lightning/fabric/strategies/ddp.py | 7 +++++-- src/lightning/fabric/strategies/fsdp.py | 6 ++++-- src/lightning/fabric/strategies/model_parallel.py | 6 ++++-- src/lightning/pytorch/strategies/ddp.py | 8 +++++--- src/lightning/pytorch/strategies/fsdp.py | 8 +++++--- src/lightning/pytorch/strategies/model_parallel.py | 8 +++++--- 6 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index 1e9e163533d6a..da3249e83289d 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -41,6 +41,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.rank_zero import rank_zero_only _DDP_FORK_ALIASES = ( @@ -212,11 +213,13 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if 
_TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 38b540a5c0178..87261c799e535 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -663,11 +663,13 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index 8340ce55c88bc..4938cd18f9432 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -302,11 +302,13 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index 4b612464c9315..8c5a4e7079b3f 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -36,7 +36,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import _IS_WINDOWS +from lightning.fabric.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.optimizer import _optimizers_to_device from lightning.fabric.utilities.seed import reset_seed from lightning.fabric.utilities.types import ReduceOp @@ -200,11 +200,13 @@ def setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index df282f19dc5f6..52197a270d983 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -61,7 +61,7 @@ _sync_ddp_if_available, ) from 
lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.init import _has_meta_device_parameters_or_buffers from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors from lightning.fabric.utilities.optimizer import _optimizers_to_device @@ -260,11 +260,13 @@ def setup_environment(self) -> None: self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) # if 'device_mesh' in the `kwargs` is provided as a tuple, update it into the `DeviceMesh` object here diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index ec140c86e8914..6381cf88a9306 100644 --- a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -39,7 +39,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_4 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3, _TORCH_GREATER_EQUAL_2_4 from lightning.fabric.utilities.init import _materialize_distributed_module from lightning.fabric.utilities.load import _METADATA_FILENAME from lightning.fabric.utilities.optimizer import _optimizers_to_device @@ -350,11 +350,13 @@ def _setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None + kwargs = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( self.cluster_environment, self._process_group_backend, - timeout=self._timeout, - device_id=self.root_device if self.root_device.type != "cpu" else None, + **kwargs, ) def _get_process_group_backend(self) -> str: From 927167ef0b97403f8be52b54ad887a9c2f79a9e4 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Mon, 25 Aug 2025 13:57:12 +0200 Subject: [PATCH 03/12] add testing --- tests/tests_fabric/strategies/test_ddp.py | 41 +++++++++++++++++++++- tests/tests_pytorch/strategies/test_ddp.py | 29 ++++++++++++++- 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index fa5c975228a5e..897dc725de0d6 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -169,5 +169,44 @@ def test_set_timeout(init_process_group_mock): global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + ) + + +@mock.patch("torch.distributed.init_process_group") +def 
test_device_id_passed_for_cuda_devices(init_process_group_mock): + """Test that device_id is passed to init_process_group for CUDA devices but not for CPU.""" + # Test with CPU device - device_id should be None + cpu_strategy = DDPStrategy(parallel_devices=[torch.device("cpu")]) + cpu_strategy.cluster_environment = LightningEnvironment() + cpu_strategy.accelerator = Mock() + cpu_strategy.setup_environment() + + process_group_backend = cpu_strategy._get_process_group_backend() + global_rank = cpu_strategy.cluster_environment.global_rank() + world_size = cpu_strategy.cluster_environment.world_size() + + init_process_group_mock.assert_called_with( + process_group_backend, rank=global_rank, world_size=world_size, timeout=cpu_strategy._timeout, device_id=None + ) + + init_process_group_mock.reset_mock() + + # Test with CUDA device - device_id should be the device + cuda_device = torch.device("cuda", 0) + cuda_strategy = DDPStrategy(parallel_devices=[cuda_device]) + cuda_strategy.cluster_environment = LightningEnvironment() + cuda_strategy.accelerator = Mock() + cuda_strategy.setup_environment() + + process_group_backend = cuda_strategy._get_process_group_backend() + global_rank = cuda_strategy.cluster_environment.global_rank() + world_size = cuda_strategy.cluster_environment.world_size() + + init_process_group_mock.assert_called_with( + process_group_backend, + rank=global_rank, + world_size=world_size, + timeout=cuda_strategy._timeout, + device_id=cuda_device, ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 915e57440b40f..de02ec8f96699 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -133,7 +133,34 @@ def test_set_timeout(mock_init_process_group): global_rank = trainer.strategy.cluster_environment.global_rank() world_size = trainer.strategy.cluster_environment.world_size() mock_init_process_group.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + ) + + +@mock.patch("torch.distributed.init_process_group") +def test_device_id_passed_for_cuda_devices_pytorch(mock_init_process_group): + """Test that device_id is passed to init_process_group for CUDA devices but not for CPU.""" + # Test with CPU device - device_id should be None + model = BoringModel() + ddp_strategy = DDPStrategy() + trainer = Trainer( + max_epochs=1, + accelerator="cpu", + strategy=ddp_strategy, + ) + trainer.strategy.connect(model) + trainer.lightning_module.trainer = trainer + trainer.strategy.setup_environment() + + process_group_backend = trainer.strategy._get_process_group_backend() + global_rank = trainer.strategy.cluster_environment.global_rank() + world_size = trainer.strategy.cluster_environment.world_size() + mock_init_process_group.assert_called_with( + process_group_backend, + rank=global_rank, + world_size=world_size, + timeout=trainer.strategy._timeout, + device_id=None, ) From b1a7e1c89fc509059614505b7963aecdf11d7071 Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 07:00:56 +0000 Subject: [PATCH 04/12] fix mypy without touching submodule --- src/lightning/fabric/strategies/ddp.py | 2 +- src/lightning/fabric/strategies/fsdp.py | 2 +- src/lightning/fabric/strategies/model_parallel.py | 2 +- src/lightning/pytorch/strategies/ddp.py | 2 +- src/lightning/pytorch/strategies/fsdp.py | 2 +- 
src/lightning/pytorch/strategies/model_parallel.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index da3249e83289d..b1b7862e6e1e0 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -213,7 +213,7 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 87261c799e535..f4ac0ee8f3338 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -663,7 +663,7 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index 4938cd18f9432..906fbeff1c7dc 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -302,7 +302,7 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index 8c5a4e7079b3f..246cf0f3f8795 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -200,7 +200,7 @@ def setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 52197a270d983..a2fab2520e8ac 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -260,7 +260,7 @@ def setup_environment(self) -> None: self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index 6381cf88a9306..bbfb7c1e1cbd1 100644 --- 
a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -350,7 +350,7 @@ def _setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - kwargs = {"timeout": self._timeout} + kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None _init_dist_connection( From 1ff12053846d07bacd0fdb2d6f05bf22678651d0 Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 07:25:29 +0000 Subject: [PATCH 05/12] fix failing tests --- tests/tests_fabric/strategies/test_ddp.py | 6 +++++- tests/tests_fabric/strategies/test_fsdp.py | 2 +- tests/tests_fabric/strategies/test_model_parallel.py | 2 +- tests/tests_pytorch/strategies/test_ddp.py | 6 +++++- tests/tests_pytorch/strategies/test_fsdp.py | 2 +- tests/tests_pytorch/strategies/test_model_parallel.py | 2 +- 6 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index 897dc725de0d6..c6cab24a2eb44 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -25,6 +25,7 @@ from lightning.fabric.plugins.environments import LightningEnvironment from lightning.fabric.strategies import DDPStrategy from lightning.fabric.strategies.ddp import _DDPBackwardSyncControl +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from tests_fabric.helpers.runif import RunIf @@ -168,8 +169,11 @@ def test_set_timeout(init_process_group_mock): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index d5f82752a9176..ec8cdffb1482a 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -382,7 +382,7 @@ def test_set_timeout(init_process_group_mock): global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None ) diff --git a/tests/tests_fabric/strategies/test_model_parallel.py b/tests/tests_fabric/strategies/test_model_parallel.py index d044626bf8389..b773f0bf428b6 100644 --- a/tests/tests_fabric/strategies/test_model_parallel.py +++ b/tests/tests_fabric/strategies/test_model_parallel.py @@ -317,7 +317,7 @@ def test_set_timeout(init_process_group_mock, _): global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, 
device_id=None ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index de02ec8f96699..d16050389cf4c 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -20,6 +20,7 @@ from torch.nn.parallel import DistributedDataParallel from lightning.fabric.plugins.environments import LightningEnvironment +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.plugins import DoublePrecision, HalfPrecision, Precision @@ -155,12 +156,15 @@ def test_device_id_passed_for_cuda_devices_pytorch(mock_init_process_group): process_group_backend = trainer.strategy._get_process_group_backend() global_rank = trainer.strategy.cluster_environment.global_rank() world_size = trainer.strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None mock_init_process_group.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=trainer.strategy._timeout, - device_id=None, + **kwargs, ) diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index 560ab19f823ca..561cde8fb8a25 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -533,7 +533,7 @@ def test_set_timeout(init_process_group_mock): global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None ) diff --git a/tests/tests_pytorch/strategies/test_model_parallel.py b/tests/tests_pytorch/strategies/test_model_parallel.py index 86a95944ac20d..93fa7e5f9124d 100644 --- a/tests/tests_pytorch/strategies/test_model_parallel.py +++ b/tests/tests_pytorch/strategies/test_model_parallel.py @@ -203,7 +203,7 @@ def test_set_timeout(init_process_group_mock, _): global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None ) From 0a76e7f9ccb9f95c306a58c6831327efb4a8e8d3 Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 07:41:36 +0000 Subject: [PATCH 06/12] update --- tests/tests_fabric/strategies/test_ddp.py | 6 ++++-- tests/tests_pytorch/strategies/test_ddp.py | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index c6cab24a2eb44..c755a52d5d08d 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -206,11 +206,13 @@ def test_device_id_passed_for_cuda_devices(init_process_group_mock): process_group_backend = cuda_strategy._get_process_group_backend() global_rank = cuda_strategy.cluster_environment.global_rank() world_size = cuda_strategy.cluster_environment.world_size() - + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( 
process_group_backend, rank=global_rank, world_size=world_size, timeout=cuda_strategy._timeout, - device_id=cuda_device, + **kwargs, ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index d16050389cf4c..a46ec79ca3399 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -133,8 +133,11 @@ def test_set_timeout(mock_init_process_group): process_group_backend = trainer.strategy._get_process_group_backend() global_rank = trainer.strategy.cluster_environment.global_rank() world_size = trainer.strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None mock_init_process_group.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) From 319cb16d94051e70ce834bcfea8586a5590ee1bf Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 07:59:14 +0000 Subject: [PATCH 07/12] maybe --- tests/tests_fabric/strategies/test_ddp.py | 6 ++++-- tests/tests_fabric/strategies/test_fsdp.py | 7 +++++-- tests/tests_fabric/strategies/test_model_parallel.py | 6 +++++- tests/tests_pytorch/strategies/test_fsdp.py | 7 +++++-- tests/tests_pytorch/strategies/test_model_parallel.py | 6 +++++- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index c755a52d5d08d..b10223f42fbaf 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -189,9 +189,11 @@ def test_device_id_passed_for_cuda_devices(init_process_group_mock): process_group_backend = cpu_strategy._get_process_group_backend() global_rank = cpu_strategy.cluster_environment.global_rank() world_size = cpu_strategy.cluster_environment.world_size() - + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=cpu_strategy._timeout, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=cpu_strategy._timeout, **kwargs ) init_process_group_mock.reset_mock() diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index ec8cdffb1482a..439278d71cc22 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -31,7 +31,7 @@ _get_full_state_dict_context, _is_sharded_checkpoint, ) -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3 def test_custom_mixed_precision(): @@ -381,8 +381,11 @@ def test_set_timeout(init_process_group_mock): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_fabric/strategies/test_model_parallel.py 
b/tests/tests_fabric/strategies/test_model_parallel.py index b773f0bf428b6..1cfbb1c5a7a61 100644 --- a/tests/tests_fabric/strategies/test_model_parallel.py +++ b/tests/tests_fabric/strategies/test_model_parallel.py @@ -25,6 +25,7 @@ from lightning.fabric.strategies import ModelParallelStrategy from lightning.fabric.strategies.fsdp import _is_sharded_checkpoint from lightning.fabric.strategies.model_parallel import _ParallelBackwardSyncControl +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from tests_fabric.helpers.runif import RunIf @@ -316,8 +317,11 @@ def test_set_timeout(init_process_group_mock, _): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index 561cde8fb8a25..ec1f7a5a27029 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -18,7 +18,7 @@ from lightning.fabric.plugins.environments import LightningEnvironment from lightning.fabric.strategies.fsdp import _is_sharded_checkpoint -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.load import _load_distributed_checkpoint from lightning.pytorch import Trainer from lightning.pytorch.callbacks import ModelCheckpoint @@ -532,8 +532,11 @@ def test_set_timeout(init_process_group_mock): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_pytorch/strategies/test_model_parallel.py b/tests/tests_pytorch/strategies/test_model_parallel.py index 93fa7e5f9124d..58083af255655 100644 --- a/tests/tests_pytorch/strategies/test_model_parallel.py +++ b/tests/tests_pytorch/strategies/test_model_parallel.py @@ -22,6 +22,7 @@ import torch.nn as nn from lightning.fabric.strategies.model_parallel import _is_sharded_checkpoint +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.pytorch import LightningModule from lightning.pytorch.plugins.environments import LightningEnvironment from lightning.pytorch.strategies import ModelParallelStrategy @@ -202,8 +203,11 @@ def test_set_timeout(init_process_group_mock, _): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, 
device_id=None + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) From 58bd573420e5f384a5fcc8103e2cc9ea9f2daae5 Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 08:14:22 +0000 Subject: [PATCH 08/12] meow --- tests/tests_fabric/strategies/test_ddp.py | 6 +++--- tests/tests_fabric/strategies/test_fsdp.py | 2 +- tests/tests_fabric/strategies/test_model_parallel.py | 2 +- tests/tests_pytorch/strategies/test_ddp.py | 4 ++-- tests/tests_pytorch/strategies/test_fsdp.py | 2 +- tests/tests_pytorch/strategies/test_model_parallel.py | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index b10223f42fbaf..f302da5d1bc4f 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -171,7 +171,7 @@ def test_set_timeout(init_process_group_mock): world_size = strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) @@ -191,7 +191,7 @@ def test_device_id_passed_for_cuda_devices(init_process_group_mock): world_size = cpu_strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = cpu_strategy.root_device if cpu_strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=cpu_strategy._timeout, **kwargs ) @@ -210,7 +210,7 @@ def test_device_id_passed_for_cuda_devices(init_process_group_mock): world_size = cuda_strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = cuda_strategy.root_device if cuda_strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index 439278d71cc22..6be379d36582c 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -383,7 +383,7 @@ def test_set_timeout(init_process_group_mock): world_size = strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_fabric/strategies/test_model_parallel.py b/tests/tests_fabric/strategies/test_model_parallel.py index 1cfbb1c5a7a61..0e38f6e7777d1 100644 --- a/tests/tests_fabric/strategies/test_model_parallel.py +++ b/tests/tests_fabric/strategies/test_model_parallel.py @@ -319,7 +319,7 @@ def test_set_timeout(init_process_group_mock, _): world_size = strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, 
timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index a46ec79ca3399..823d77d0d5848 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -135,7 +135,7 @@ def test_set_timeout(mock_init_process_group): world_size = trainer.strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = trainer.strategy.root_device if trainer.strategy.root_device.type != "cpu" else None mock_init_process_group.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) @@ -161,7 +161,7 @@ def test_device_id_passed_for_cuda_devices_pytorch(mock_init_process_group): world_size = trainer.strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = trainer.strategy.root_device if trainer.strategy.root_device.type != "cpu" else None mock_init_process_group.assert_called_with( process_group_backend, rank=global_rank, diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index ec1f7a5a27029..f7c15b5930be8 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -534,7 +534,7 @@ def test_set_timeout(init_process_group_mock): world_size = strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_pytorch/strategies/test_model_parallel.py b/tests/tests_pytorch/strategies/test_model_parallel.py index 58083af255655..c803c10afa4b4 100644 --- a/tests/tests_pytorch/strategies/test_model_parallel.py +++ b/tests/tests_pytorch/strategies/test_model_parallel.py @@ -205,7 +205,7 @@ def test_set_timeout(init_process_group_mock, _): world_size = strategy.cluster_environment.world_size() kwargs = {} if _TORCH_GREATER_EQUAL_2_3: - kwargs["device_id"] = None + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) From cf7fbe328951d28d821cb812436782c123440977 Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 1 Sep 2025 09:44:11 +0000 Subject: [PATCH 09/12] update --- tests/tests_pytorch/strategies/test_ddp_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/strategies/test_ddp_integration.py b/tests/tests_pytorch/strategies/test_ddp_integration.py index 048403366ebc7..fc3a8cfebbac0 100644 --- a/tests/tests_pytorch/strategies/test_ddp_integration.py +++ b/tests/tests_pytorch/strategies/test_ddp_integration.py @@ -66,7 +66,7 @@ def test_multi_gpu_model_ddp_fit_test(tmp_path): assert out["test_acc"] > 0.7 -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, max_torch="2.7") @mock.patch("torch.cuda.set_device") @mock.patch("lightning.pytorch.accelerators.cuda._check_cuda_matmul_precision") @mock.patch("lightning.pytorch.accelerators.cuda._clear_cuda_memory") From c326a34e603639f15ace902a3adc1d2a16635c19 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Mon, 1 Sep 
2025 13:09:10 +0200 Subject: [PATCH 10/12] chlog --- src/lightning/fabric/CHANGELOG.md | 4 ++-- src/lightning/pytorch/CHANGELOG.md | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 16cd42adc90d9..cb37fcc6a937f 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -24,7 +24,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- +- Fixed missing `device_id` when initializing the distributed process group with PyTorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105)) --- @@ -33,7 +33,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- Added support for NVIDIA H200 GPUs in `get_available_flops` ([#20913](https://github.com/Lightning-AI/pytorch-lightning/pull/21119)) +- Added support for NVIDIA H200 GPUs in `get_available_flops` ([#21119](https://github.com/Lightning-AI/pytorch-lightning/pull/21119)) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 176e34273d776..03664c8e2d1ad 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -31,6 +31,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed callbacks by defer step/time-triggered `ModelCheckpoint` saves until validation metrics are available ([#21106](https://github.com/Lightning-AI/pytorch-lightning/pull/21106)) +- Fixed missing `device_id` when initializing the distributed process group with PyTorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105)) + --- From 00608a3818370391bd585960fa6cce18d21617b2 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Mon, 1 Sep 2025 13:10:24 +0200 Subject: [PATCH 11/12] Apply suggestions from code review --- src/lightning/fabric/strategies/ddp.py | 2 +- src/lightning/fabric/strategies/fsdp.py | 2 +- src/lightning/fabric/strategies/model_parallel.py | 2 +- src/lightning/pytorch/strategies/ddp.py | 2 +- src/lightning/pytorch/strategies/fsdp.py | 2 +- src/lightning/pytorch/strategies/model_parallel.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index b1b7862e6e1e0..330dc5c6795e1 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -219,7 +219,7 @@ def _setup_distributed(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index f4ac0ee8f3338..5904f7634a360 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -669,7 +669,7 @@ def _setup_distributed(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index 906fbeff1c7dc..f67661acff2b1 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -308,7 +308,7 @@ def _setup_distributed(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) def 
_get_process_group_backend(self) -> str: diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index 246cf0f3f8795..e7630f9662735 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -206,7 +206,7 @@ def setup_distributed(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) def _get_process_group_backend(self) -> str: diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index a2fab2520e8ac..a4e5b6e52c949 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -266,7 +266,7 @@ def setup_environment(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) # if 'device_mesh' in the `kwargs` is provided as a tuple, update it into the `DeviceMesh` object here diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index bbfb7c1e1cbd1..aa21f7be2963c 100644 --- a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -356,7 +356,7 @@ def _setup_distributed(self) -> None: _init_dist_connection( self.cluster_environment, self._process_group_backend, - **kwargs, + **kwargs ) def _get_process_group_backend(self) -> str: From a8b89487a5095c152dfc72239b212127d36aa620 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 Sep 2025 11:11:57 +0000 Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/strategies/ddp.py | 6 +----- src/lightning/fabric/strategies/fsdp.py | 6 +----- src/lightning/fabric/strategies/model_parallel.py | 6 +----- src/lightning/pytorch/strategies/ddp.py | 6 +----- src/lightning/pytorch/strategies/fsdp.py | 6 +----- src/lightning/pytorch/strategies/model_parallel.py | 6 +----- 6 files changed, 6 insertions(+), 30 deletions(-) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index 330dc5c6795e1..e826b910c16d3 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -216,11 +216,7 @@ def _setup_distributed(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 5904f7634a360..baaee74af0ec9 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -666,11 +666,7 @@ def _setup_distributed(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, 
self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index f67661acff2b1..0d49ddf91a0bc 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -305,11 +305,7 @@ def _setup_distributed(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index e7630f9662735..92206e1accc31 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -203,11 +203,7 @@ def setup_distributed(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index a4e5b6e52c949..3fbd0f9cd5f0a 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -263,11 +263,7 @@ def setup_environment(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) # if 'device_mesh' in the `kwargs` is provided as a tuple, update it into the `DeviceMesh` object here if isinstance(self.kwargs.get("device_mesh"), tuple): diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index aa21f7be2963c..e0286dbe2e0e6 100644 --- a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -353,11 +353,7 @@ def _setup_distributed(self) -> None: kwargs: dict[str, Any] = {"timeout": self._timeout} if _TORCH_GREATER_EQUAL_2_3: kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None - _init_dist_connection( - self.cluster_environment, - self._process_group_backend, - **kwargs - ) + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device)
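
For readers reviewing the series, the gating that every strategy now applies can be reproduced outside of Lightning in a few lines. The sketch below is illustrative only and is not part of the patches: it assumes a single-process env:// rendezvous on localhost and uses `packaging.version` in place of Lightning's `_TORCH_GREATER_EQUAL_2_3` flag.

    # Standalone sketch of the version-gated device_id pattern (assumptions noted above).
    import os
    from typing import Any

    import torch
    import torch.distributed as dist
    from packaging.version import Version


    def init_distributed(root_device: torch.device) -> None:
        # env:// rendezvous needs an address and port even for a single process.
        os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
        os.environ.setdefault("MASTER_PORT", "29500")

        backend = "nccl" if root_device.type == "cuda" else "gloo"
        kwargs: dict[str, Any] = {}
        # device_id is only accepted by newer torch releases, and it must stay unset
        # for CPU process groups -- the same two checks the strategies make above.
        if Version(torch.__version__).release >= (2, 3) and root_device.type != "cpu":
            kwargs["device_id"] = root_device

        dist.init_process_group(backend, rank=0, world_size=1, **kwargs)


    if __name__ == "__main__":
        device = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")
        init_distributed(device)
        dist.destroy_process_group()

The design point mirrored from the patches: `device_id` is forwarded only when the installed torch accepts the argument and the root device is an accelerator, so CPU runs and older torch versions keep the previous `init_process_group` call signature.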