Diff summary: src/lightning/fabric/strategies — 1 file changed, +17 / −2
lines changed Original file line number Diff line number Diff line change 1515from datetime import timedelta
1616from typing import Any , Literal , Optional , Union
1717
18+ import inspect
1819import torch
1920import torch .distributed
2021from lightning_utilities .core .rank_zero import rank_zero_only as utils_rank_zero_only
@@ -156,10 +157,24 @@ def all_reduce(
156157 def barrier (self , * args : Any , ** kwargs : Any ) -> None :
157158 if not _distributed_is_initialized ():
158159 return
159- if torch .distributed .get_backend () == "nccl" :
160+ backend = torch .distributed .get_backend ()
161+ if backend == "nccl" :
160162 torch .distributed .barrier (device_ids = self ._determine_ddp_device_ids ())
161- else :
163+ return
164+ # For CPU backends (e.g., gloo), recent PyTorch may attempt to resolve an accelerator and crash on CPU-only runs.
165+ try :
162166 torch .distributed .barrier ()
167+ except RuntimeError as e :
168+ # Handle: "Please register PrivateUse1HooksInterface by `RegisterPrivateUse1HooksInterface` first."
169+ if "PrivateUse1HooksInterface" in str (e ):
170+ # Use explicit CPU device if supported in this PyTorch version
171+ if "device" in inspect .signature (torch .distributed .barrier ).parameters :
172+ torch .distributed .barrier (device = torch .device ("cpu" ))
173+ else :
174+ # Older versions shouldn't trigger this path; re-raise to avoid masking other issues
175+ raise
176+ else :
177+ raise
163178
164179 @override
165180 def broadcast (self , obj : TBroadcast , src : int = 0 ) -> TBroadcast :
You can’t perform that action at this time.
0 commit comments