@@ -77,6 +77,9 @@ class Cluster:
7777 ClusterEnvVar .COMM_NET_DEVICES : None ,
7878 }
7979
80+ class NamespaceConflictError (Exception ):
81+ """Raised by `_init_and_launch_managers` when launching the managers fails with a `ValueError`; `__init__` catches it and retries under a new Ray namespace."""
82+
8083 @classmethod
8184 def find_free_port (cls ):
8285 """Find a free port on the node."""
@@ -109,16 +112,26 @@ def __init__(
109112 """
110113 if self ._has_initialized :
111114 return
115+ self ._setup_logger ()
112116 if num_nodes is not None or cluster_cfg is not None :
113117 self ._ray_instance_count = 0
114- self ._init_and_launch_managers (num_nodes , cluster_cfg )
118+ while True :
119+ try :
120+ self ._init_and_launch_managers (num_nodes , cluster_cfg )
121+ break
122+ except Cluster .NamespaceConflictError :
123+ # Switch to a new namespace when multiple Ray instances are created on the same node
124+ self ._ray_instance_count += 1
125+ self ._logger .info (
126+ f"Ray namespace conflict detected. Retrying to initialize Cluster with a new namespace (attempt { self ._ray_instance_count } )."
127+ )
128+ Cluster .NAMESPACE = f"{ Cluster .SYS_NAME } _{ self ._ray_instance_count } "
129+ continue
115130 else :
116131 self ._init_from_existing_managers ()
117132 self ._has_initialized = True
118133
119- def _init_and_launch_managers (
120- self , num_nodes : int , cluster_cfg : Optional [DictConfig ]
121- ):
134+ def _setup_logger (self ):
122135 # Add logger
123136 self ._logger = logging .getLogger (Cluster .SYS_NAME )
124137 self ._logger .setLevel (Cluster .LOGGING_LEVEL )
@@ -133,6 +146,9 @@ def _init_and_launch_managers(
133146 handler .setFormatter (formatter )
134147 self ._logger .addHandler (handler )
135148
149+ def _init_and_launch_managers (
150+ self , num_nodes : int , cluster_cfg : Optional [DictConfig ]
151+ ):
136152 if ray .is_initialized ():
137153 if self ._ray_instance_count > 0 :
138154 # For reinit Ray to switch namespace
@@ -235,10 +251,7 @@ def _init_and_launch_managers(
235251 .remote ()
236252 )
237253 except ValueError :
238- # If the WorkerManager is already running, we need to switch the namespace
239- self ._ray_instance_count += 1
240- Cluster .NAMESPACE = f"RLinf_{ self ._ray_instance_count } "
241- return self ._init_and_launch_managers (num_nodes )
254+ raise Cluster .NamespaceConflictError
242255
243256 def signal_handler (sig , frame ):
244257 # Exit the main process if SIGUSR1 is received, which is sent by the worker group when an exception occurs.
0 commit comments