Skip to content

Commit 5f66ecf

Browse files
committed
provision.fog: Mark nodes down in some cases
Specifically, if we time out waiting for the SSH port to open after a reimage, we want to mark the node down immediately so that we can investigate.

Signed-off-by: Zack Cerza <zack@cerza.org>
1 parent 6eec69a commit 5f66ecf

File tree

3 files changed

+65
-14
lines changed

3 files changed

+65
-14
lines changed

teuthology/dispatcher/supervisor.py

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from teuthology import exporter, dispatcher, kill, report, safepath
1212
from teuthology.config import config as teuth_config
13-
from teuthology.exceptions import SkipJob, MaxWhileTries
13+
from teuthology.exceptions import SkipJob, MaxWhileTries, ReimageFailureNeedsInvestigation
1414
from teuthology import setup_log_file, install_except_hook
1515
from teuthology.misc import get_user, archive_logs, compress_logs
1616
from teuthology.config import FakeNamespace
@@ -175,8 +175,14 @@ def run_job(job_config, teuth_bin_path, archive_dir, verbose):
175175
log.error('Child exited with code %d', p.returncode)
176176
else:
177177
log.info('Success!')
178-
if 'targets' in job_config:
179-
unlock_targets(job_config)
178+
if 'targets' in job_config and job_config.get("unlock_on_failure", True):
179+
unlock_targets(
180+
job_config['targets'],
181+
job_config['owner'],
182+
job_config['name'],
183+
job_config['job_id'],
184+
job_config['archive_path'],
185+
)
180186
return p.returncode
181187

182188
def failure_is_reimage(failure_reason):
@@ -232,8 +238,28 @@ def reimage(job_config):
232238
try:
233239
reimaged = lock_ops.reimage_machines(ctx, targets, job_config['machine_type'])
234240
except Exception as e:
235-
log.exception('Reimaging error. Unlocking machines...')
236-
unlock_targets(job_config)
241+
targets = job_config['targets'].copy()
242+
log.exception('Reimaging error')
243+
if isinstance(e, ReimageFailureNeedsInvestigation):
244+
# This error requires further investigation. Mark the affected node
245+
# down and leave it locked.
246+
log.info(f"Marking {e.node_name} down for investigation")
247+
lock_ops.update_lock(
248+
e.node_name,
249+
description=str(e.inner),
250+
status='down',
251+
)
252+
targets = job_config['targets'].copy()
253+
targets.pop(e.node_name)
254+
if job_config.get("unlock_on_failure", True):
255+
log.info('Unlocking machines...')
256+
unlock_targets(
257+
targets,
258+
job_config['owner'],
259+
job_config['name'],
260+
job_config['job_id'],
261+
job_config['archive_path'],
262+
)
237263
# Reimage failures should map to the 'dead' status instead of 'fail'
238264
report.try_push_job_info(
239265
ctx.config,
@@ -252,20 +278,20 @@ def reimage(job_config):
252278
report.try_push_job_info(ctx.config, dict(status='running'))
253279

254280

255-
def unlock_targets(job_config):
281+
def unlock_targets(targets: dict, owner: str, run_name: str, job_id: str, archive_path: str):
256282
"""
257283
Unlock machines only if locked and description matches.
258284
259285
:param targets: dict of target machines (name -> ssh pubkey)
:param owner: owner of the locks
:param run_name: name of the run
:param job_id: ID of the job
:param archive_path: path to the job's archive; must match the lock description
260286
"""
261-
machine_statuses = query.get_statuses(job_config['targets'].keys())
287+
machine_statuses = query.get_statuses(targets.keys())
262288
locked = []
263289
for status in machine_statuses:
264290
name = shortname(status['name'])
265291
description = status['description']
266292
if not status['locked']:
267293
continue
268-
if description != job_config['archive_path']:
294+
if description != archive_path:
269295
log.warning(
270296
"Was going to unlock %s but it was locked by another job: %s",
271297
name, description
@@ -274,9 +300,8 @@ def unlock_targets(job_config):
274300
locked.append(name)
275301
if not locked:
276302
return
277-
if job_config.get("unlock_on_failure", True):
278-
log.info('Unlocking machines...')
279-
lock_ops.unlock_safe(locked, job_config["owner"], job_config["name"], job_config["job_id"])
303+
log.info('Unlocking machines...')
304+
lock_ops.unlock_safe(locked, owner, run_name, job_id)
280305

281306

282307
def run_with_watchdog(process, job_config):

teuthology/exceptions.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,3 +245,15 @@ def __str__(self):
245245
prefix=prefix,
246246
message=self.message,
247247
)
248+
249+
class ReimageFailure(Exception):
    """Raised when reimaging a test node fails.

    :param node_name: name of the node whose reimage failed
    :param message:   human-readable description of the failure
    :param inner:     the underlying exception that caused the failure, if any
    """

    def __init__(self, node_name: str, message: str, inner: Optional[Exception] = None):
        self.node_name: str = node_name
        self.message: str = message
        # `inner` legitimately defaults to None, so the attribute must be
        # annotated Optional[Exception], not Exception.
        self.inner: Optional[Exception] = inner

    def __str__(self):
        return f"Reimage of {self.node_name} failed with message: '{self.message}' (Inner exception: {self.inner})"


class ReimageFailureNeedsInvestigation(ReimageFailure):
    """A reimage failure that requires manual investigation.

    Consumers (e.g. the dispatcher supervisor) are expected to mark the
    affected node down and leave it locked rather than unlocking it.
    """

teuthology/provision/fog.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from teuthology.config import config
1414
from teuthology.contextutil import safe_while
15-
from teuthology.exceptions import MaxWhileTries
15+
from teuthology.exceptions import MaxWhileTries, ReimageFailureNeedsInvestigation
1616
from teuthology.orchestra.opsys import OS
1717
from teuthology import misc
1818

@@ -88,7 +88,19 @@ def create(self):
8888
except Exception:
8989
self.cancel_deploy_task(task_id)
9090
raise
91-
self._wait_for_ready()
91+
try:
92+
self._wait_for_ready()
93+
except MaxWhileTries as e:
94+
# If the SSH port never opened, this requires investigation
95+
if isinstance(e.last_exception, NoValidConnectionsError):
96+
log.exception("Reimage failure")
97+
raise ReimageFailureNeedsInvestigation(
98+
node_name=self.name,
99+
message=f"Reimage failed: {e.last_exception}",
100+
inner=e.last_exception,
101+
)
102+
else:
103+
raise
92104
self._fix_hostname()
93105
self._verify_installed_os()
94106
self.log.info("Deploy complete!")
@@ -283,7 +295,8 @@ def cancel_deploy_task(self, task_id):
283295
def _wait_for_ready(self):
284296
""" Attempt to connect to the machine via SSH """
285297
with safe_while(sleep=6, timeout=config.fog_wait_for_ssh_timeout) as proceed:
286-
while proceed():
298+
last_exception = None
299+
while proceed(last_exception):
287300
try:
288301
self.remote.connect()
289302
break
@@ -299,6 +312,7 @@ def _wait_for_ready(self):
299312
# a mismatched host key in ~/.ssh/known_hosts, or
300313
# something)
301314
self.log.warning(e)
315+
last_exception = e
302316
sentinel_file = config.fog.get('sentinel_file', None)
303317
if sentinel_file:
304318
cmd = "while [ ! -e '%s' ]; do sleep 5; done" % sentinel_file

0 commit comments

Comments
 (0)