
Commit 5be1825

Merge branch 'master' into ioannis@18861-CSVLogger-fails-on-remote-fs
2 parents 509e639 + 3c81316

File tree

6 files changed: +42, -5 lines

src/lightning/fabric/CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -22,7 +22,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 
--
+- let `_get_default_process_group_backend_for_device` support more hardware platforms (
+[#21057](https://github.com/Lightning-AI/pytorch-lightning/pull/21057), [#21093](https://github.com/Lightning-AI/pytorch-lightning/pull/21093))
 
 
 ### Fixed

src/lightning/fabric/strategies/ddp.py

Lines changed: 11 additions & 1 deletion
@@ -160,7 +160,17 @@ def barrier(self, *args: Any, **kwargs: Any) -> None:
         if torch.distributed.get_backend() == "nccl":
             torch.distributed.barrier(device_ids=self._determine_ddp_device_ids())
         else:
-            torch.distributed.barrier()
+            # Handle PyTorch bug where barrier() fails on CPU with "PrivateUse1HooksInterface" error
+            try:
+                torch.distributed.barrier()
+            except RuntimeError as e:
+                if "PrivateUse1HooksInterface" in str(e):
+                    # Fallback: Use all_reduce as barrier - all processes must participate
+                    # This achieves the same synchronization effect as barrier()
+                    dummy_tensor = torch.tensor(0.0, device=self.root_device)
+                    torch.distributed.all_reduce(dummy_tensor)
+                else:
+                    raise
 
     @override
     def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
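For reference, the fallback used above can be sketched in isolation. The helper below is a minimal, hypothetical example (the name safe_cpu_barrier and the assumption of an already-initialized process group are not part of this commit); an all_reduce on a dummy tensor blocks every rank until all ranks have contributed, which gives the same synchronization as barrier().

    import torch
    import torch.distributed as dist

    def safe_cpu_barrier(device: torch.device) -> None:
        # Try the regular barrier first; fall back to an all_reduce-based barrier
        # only for the specific "PrivateUse1HooksInterface" failure mode.
        try:
            dist.barrier()
        except RuntimeError as err:
            if "PrivateUse1HooksInterface" in str(err):
                dummy = torch.tensor(0.0, device=device)  # stand-in for self.root_device in the strategy
                dist.all_reduce(dummy)  # returns only after every rank has participated
            else:
                raise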

src/lightning/fabric/utilities/distributed.py

Lines changed: 5 additions & 1 deletion
@@ -319,7 +319,11 @@ def _destroy_dist_connection() -> None:
 
 
 def _get_default_process_group_backend_for_device(device: torch.device) -> str:
-    return "nccl" if device.type == "cuda" else "gloo"
+    """Return corresponding distributed backend for a given device."""
+    device_backend_map = torch.distributed.Backend.default_device_backend_map
+    if device.type in device_backend_map:
+        return device_backend_map[device.type]
+    return "gloo"
 
 
 class _DatasetSamplerWrapper(Dataset):
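The new implementation relies on torch.distributed.Backend.default_device_backend_map, a dictionary mapping device types to their default process-group backends (typically including "cpu" -> "gloo" and "cuda" -> "nccl"; the exact contents depend on the PyTorch build and any registered third-party backends). A minimal sketch of the same lookup, with default_backend_for as a hypothetical name:

    import torch
    import torch.distributed as dist

    def default_backend_for(device: torch.device) -> str:
        # Look up the device type in PyTorch's default backend map and fall back
        # to "gloo" for device types without a registered default backend.
        return dist.Backend.default_device_backend_map.get(device.type, "gloo")

    print(default_backend_for(torch.device("cpu")))   # "gloo" on builds with gloo support
    print(default_backend_for(torch.device("cuda")))  # "nccl" on builds with NCCL support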

tests/tests_fabric/utilities/test_distributed.py

Lines changed: 22 additions & 0 deletions
@@ -17,6 +17,7 @@
 from lightning.fabric.utilities.distributed import (
     _destroy_dist_connection,
     _gather_all_tensors,
+    _get_default_process_group_backend_for_device,
     _InfiniteBarrier,
     _init_dist_connection,
     _is_dtensor,
@@ -243,6 +244,27 @@ def test_init_dist_connection_registers_destruction_handler(_, atexit_mock):
     atexit_mock.register.assert_not_called()
 
 
+def test_get_default_process_group_backend_for_device():
+    """Test that each device type maps to its correct default process group backend."""
+    # register a custom backend for test
+    torch.utils.rename_privateuse1_backend("pcu")
+
+    def mock_backend(store, group_rank, group_size, timeout):
+        pass
+
+    torch.distributed.Backend.register_backend(
+        "pccl",
+        lambda store, group_rank, group_size, timeout: mock_backend(store, group_rank, group_size, timeout),
+        devices=["pcu"],
+    )
+
+    # test that the default backend is correctly set for each device
+    devices = [torch.device("cpu"), torch.device("cuda:0"), torch.device("pcu:0")]
+    backends = ["gloo", "nccl", "pccl"]
+    for device, backend in zip(devices, backends):
+        assert _get_default_process_group_backend_for_device(device) == backend
+
+
 @RunIf(min_torch="2.4")
 def test_is_dtensor(monkeypatch):
     from torch.distributed._tensor import DTensor
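The test works because torch.utils.rename_privateuse1_backend exposes the PrivateUse1 placeholder device under a custom name, and torch.distributed.Backend.register_backend(..., devices=[...]) records the new backend as the default for those device types, which is what default_device_backend_map then reports. A standalone sketch of that mechanism, reusing the test's illustrative names "pcu" and "pccl" with a no-op constructor as a stand-in:

    import torch
    import torch.distributed as dist

    # Make torch.device("pcu") a recognized device type (renames the PrivateUse1 placeholder).
    torch.utils.rename_privateuse1_backend("pcu")

    # Register a dummy process-group constructor and declare it the default for "pcu" devices.
    dist.Backend.register_backend(
        "pccl",
        lambda store, group_rank, group_size, timeout: None,
        devices=["pcu"],
    )

    print(dist.Backend.default_device_backend_map.get("pcu"))  # expected: "pccl"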

tests/tests_fabric/utilities/test_spike.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ def spike_detection_test(fabric, global_rank_spike, spike_value, should_raise):
     )
 
 
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=3)
 @pytest.mark.parametrize(
     ("global_rank_spike", "num_devices", "spike_value", "finite_only"),
     # NOTE FOR ALL FOLLOWING TESTS:

tests/tests_pytorch/callbacks/test_spike.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
         super().on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx)
 
 
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=3)
 @pytest.mark.parametrize(
     ("global_rank_spike", "num_devices", "spike_value", "finite_only"),
     # NOTE FOR ALL FOLLOWING TESTS:
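Both spike tests change the marker argument from max_runs to reruns: reruns is the keyword used by the pytest-rerunfailures plugin, whereas max_runs is the separate flaky package's spelling. A minimal usage sketch with an illustrative test name:

    import pytest

    @pytest.mark.flaky(reruns=3)  # pytest-rerunfailures retries a failing test up to 3 times
    def test_occasionally_flaky():
        assert True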

0 commit comments