Skip to content

Commit fa17720

Browse files
authored
Merge pull request #2146 from ceph/reboot-7min
fog: Try ipmi power-cycle if stuck in a reimage reboot hang
2 parents 7fc6083 + 61b34a6 commit fa17720

File tree

1 file changed

+29
-2
lines changed

1 file changed

+29
-2
lines changed

teuthology/provision/fog.py

Lines changed: 29 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -281,9 +281,36 @@ def cancel_deploy_task(self, task_id):
281281
resp.raise_for_status()
282282

283283
def _wait_for_ready(self):
284-
""" Attempt to connect to the machine via SSH """
285-
with safe_while(sleep=6, timeout=config.fog_wait_for_ssh_timeout) as proceed:
284+
"""
285+
Attempt to connect to the machine via SSH (power cycle once at 50% of timeout).
286+
"""
287+
288+
total_timeout = config.fog_wait_for_ssh_timeout
289+
ipmi_cycle_after_seconds = total_timeout * 0.5
290+
291+
start = datetime.datetime.now(datetime.timezone.utc)
292+
ipmi_cycle_sent = False
293+
294+
with safe_while(sleep=6, timeout=total_timeout) as proceed:
286295
while proceed():
296+
now = datetime.datetime.now(datetime.timezone.utc)
297+
elapsed = (now - start).total_seconds()
298+
299+
# ipmitool power cycle once at 50% of timeout
300+
if not ipmi_cycle_sent and elapsed >= ipmi_cycle_after_seconds:
301+
ipmi_cycle_sent = True
302+
self.log.warning(
303+
f"{self.shortname}: SSH not up after {int(elapsed)}s "
304+
f"(~50% of timeout); power cycling and continuing to wait"
305+
)
306+
try:
307+
self.remote.console.power_off()
308+
self.remote.console.power_on()
309+
except Exception as e:
310+
self.log.error(
311+
f"{self.shortname}: power cycle failed but continuing: {e}"
312+
)
313+
287314
try:
288315
self.remote.connect()
289316
break

0 commit comments

Comments
 (0)