@@ -90,8 +90,8 @@ def main() -> int:
9090
9191 # 2. Wait for gang formation and validate
9292 gang_config = _wait_for_gang (cfg )
93- if gang_config is None :
94- return NCCLError . GANG_CONFIG_ERROR . value . exit_code
93+ if isinstance ( gang_config , int ) :
94+ return gang_config
9595
9696 # 3. Launch torchrun (replaces this process)
9797 _launch_torchrun (gang_config , cfg .nprocs_per_node )
@@ -143,11 +143,11 @@ def _load_config() -> _EntrypointConfig | None:
143143 )
144144
145145
146- def _wait_for_gang (cfg : _EntrypointConfig ) -> GangConfig | None :
146+ def _wait_for_gang (cfg : _EntrypointConfig ) -> GangConfig | int :
147147 """Wait for gang formation and validate the resulting configuration.
148148
149149 Returns:
150- The gang configuration on success, or None on failure (error already reported) .
150+ The gang configuration on success, or an NCCLError exit code on failure (error already reported) .
151151 """
152152 waiter = GangWaiter (cfg .gang_config_dir )
153153
@@ -156,23 +156,23 @@ def _wait_for_gang(cfg: _EntrypointConfig) -> GangConfig | None:
156156 except TimeoutError as err :
157157 log .error ("Gang formation timeout" , extra = {"error" : str (err )})
158158 _report_error (NCCLError .GANG_TIMEOUT , str (err ))
159- return None
159+ return NCCLError . GANG_TIMEOUT . value . exit_code
160160 except ValueError as err :
161161 log .error ("Invalid gang configuration" , extra = {"error" : str (err )})
162162 _report_error (NCCLError .GANG_CONFIG_ERROR , str (err ))
163- return None
163+ return NCCLError . GANG_CONFIG_ERROR . value . exit_code
164164
165165 if gang_config .my_rank < 0 :
166166 error_msg = f"Pod { cfg .pod_name } not found in peers list"
167167 log .error (error_msg )
168168 _report_error (NCCLError .GANG_CONFIG_ERROR , error_msg )
169- return None
169+ return NCCLError . GANG_CONFIG_ERROR . value . exit_code
170170
171171 if not gang_config .master_addr :
172172 error_msg = "Master address not set in ConfigMap"
173173 log .error (error_msg )
174174 _report_error (NCCLError .GANG_CONFIG_ERROR , error_msg )
175- return None
175+ return NCCLError . GANG_CONFIG_ERROR . value . exit_code
176176
177177 return gang_config
178178
@@ -202,7 +202,12 @@ def _launch_torchrun(gang_config: GangConfig, nprocs_per_node: int) -> None:
202202
203203 os .environ ["NPROCS_PER_NODE" ] = str (nprocs_per_node )
204204
205- os .execvp (cmd [0 ], cmd )
205+ try :
206+ os .execvp (cmd [0 ], cmd )
207+ except OSError as err :
208+ log .error ("Failed to exec torchrun" , extra = {"command" : cmd [0 ], "error" : str (err )})
209+ _report_error (NCCLError .GANG_CONFIG_ERROR , f"Failed to exec { cmd [0 ]} : { err } " )
210+ sys .exit (NCCLError .GANG_CONFIG_ERROR .value .exit_code )
206211
207212
208213def _detect_gpu_count () -> int :
0 commit comments