File tree · Expand file tree · Collapse file tree · 1 file changed (+29 −2 lines changed)
Expand file tree · Collapse file tree · 1 file changed (+29 −2 lines changed)
Original file line number | Diff line number | Diff line change
@@ -281,9 +281,36 @@ def cancel_deploy_task(self, task_id):
281281 resp .raise_for_status ()
282282
283283 def _wait_for_ready (self ):
284- """ Attempt to connect to the machine via SSH """
285- with safe_while (sleep = 6 , timeout = config .fog_wait_for_ssh_timeout ) as proceed :
284+ """
285+ Attempt to connect to the machine via SSH (power cycle once at 50% of timeout).
286+ """
287+
288+ total_timeout = config .fog_wait_for_ssh_timeout
289+ ipmi_cycle_after_seconds = total_timeout * 0.5
290+
291+ start = datetime .datetime .now (datetime .timezone .utc )
292+ ipmi_cycle_sent = False
293+
294+ with safe_while (sleep = 6 , timeout = total_timeout ) as proceed :
286295 while proceed ():
296+ now = datetime .datetime .now (datetime .timezone .utc )
297+ elapsed = (now - start ).total_seconds ()
298+
299+ # ipmitool power cycle once at 50% of timeout
300+ if not ipmi_cycle_sent and elapsed >= ipmi_cycle_after_seconds :
301+ ipmi_cycle_sent = True
302+ self .log .warning (
303+ f"{ self .shortname } : SSH not up after { int (elapsed )} s "
304+ f"(~50% of timeout); power cycling and continuing to wait"
305+ )
306+ try :
307+ self .remote .console .power_off ()
308+ self .remote .console .power_on ()
309+ except Exception as e :
310+ self .log .error (
311+ f"{ self .shortname } : power cycle failed but continuing: { e } "
312+ )
313+
287314 try :
288315 self .remote .connect ()
289316 break
You can’t perform that action at this time.
0 commit comments