Skip to content

Commit 3bdd540

Browse files
committed
PrivateUse1HooksInterface
1 parent fb16216 commit 3bdd540

File tree

1 file changed

+17
-2
lines changed
  • src/lightning/fabric/strategies

1 file changed

+17
-2
lines changed

src/lightning/fabric/strategies/ddp.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from datetime import timedelta
1616
from typing import Any, Literal, Optional, Union
1717

18+
import inspect
1819
import torch
1920
import torch.distributed
2021
from lightning_utilities.core.rank_zero import rank_zero_only as utils_rank_zero_only
@@ -156,10 +157,24 @@ def all_reduce(
156157
def barrier(self, *args: Any, **kwargs: Any) -> None:
    """Block until every process in the group reaches this call.

    No-op when the default process group has not been initialized. The NCCL
    backend is given explicit device ids; CPU backends get a workaround for a
    PyTorch accelerator-resolution crash on CPU-only runs.
    """
    if not _distributed_is_initialized():
        return

    if torch.distributed.get_backend() == "nccl":
        torch.distributed.barrier(device_ids=self._determine_ddp_device_ids())
        return

    # CPU backends (e.g., gloo): recent PyTorch may attempt to resolve an
    # accelerator and crash when none is present.
    try:
        torch.distributed.barrier()
    except RuntimeError as err:
        # Only handle: "Please register PrivateUse1HooksInterface by
        # `RegisterPrivateUse1HooksInterface` first."; anything else propagates.
        if "PrivateUse1HooksInterface" not in str(err):
            raise
        if "device" not in inspect.signature(torch.distributed.barrier).parameters:
            # Older versions shouldn't trigger this path; re-raise to avoid
            # masking other issues.
            raise
        # Retry with an explicit CPU device, supported by this PyTorch version.
        torch.distributed.barrier(device=torch.device("cpu"))

164179
@override
165180
def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:

0 commit comments

Comments
 (0)