1010
1111from teuthology import exporter , dispatcher , kill , report , safepath
1212from teuthology .config import config as teuth_config
13- from teuthology .exceptions import SkipJob , MaxWhileTries
13+ from teuthology .exceptions import SkipJob , MaxWhileTries , ReimageFailureNeedsInvestigation
1414from teuthology import setup_log_file , install_except_hook
1515from teuthology .misc import get_user , archive_logs , compress_logs
1616from teuthology .config import FakeNamespace
@@ -175,8 +175,14 @@ def run_job(job_config, teuth_bin_path, archive_dir, verbose):
175175 log .error ('Child exited with code %d' , p .returncode )
176176 else :
177177 log .info ('Success!' )
178- if 'targets' in job_config :
179- unlock_targets (job_config )
178+ if 'targets' in job_config and job_config .get ("unlock_on_failure" , True ):
179+ unlock_targets (
180+ job_config ['targets' ],
181+ job_config ['owner' ],
182+ job_config ['name' ],
183+ job_config ['job_id' ],
184+ job_config ['archive_path' ],
185+ )
180186 return p .returncode
181187
182188def failure_is_reimage (failure_reason ):
@@ -232,8 +238,28 @@ def reimage(job_config):
232238 try :
233239 reimaged = lock_ops .reimage_machines (ctx , targets , job_config ['machine_type' ])
234240 except Exception as e :
235- log .exception ('Reimaging error. Unlocking machines...' )
236- unlock_targets (job_config )
241+ targets = job_config ['targets' ].copy ()
242+ log .exception ('Reimaging error' )
243+ if isinstance (e , ReimageFailureNeedsInvestigation ):
244+ # This error requires further investigation. Mark the affected node
245+ # down and leave it locked.
246+ log .info (f"Marking { e .node_name } down for investigation" )
247+ lock_ops .update_lock (
248+ e .node_name ,
249+ description = str (e .inner ),
250+ status = 'down' ,
251+ )
252+ targets = job_config ['targets' ].copy ()
253+ targets .pop (e .node_name )
254+ if job_config .get ("unlock_on_failure" , True ):
255+ log .info ('Unlocking machines...' )
256+ unlock_targets (
257+ targets ,
258+ job_config ['owner' ],
259+ job_config ['name' ],
260+ job_config ['job_id' ],
261+ job_config ['archive_path' ],
262+ )
237263 # Reimage failures should map to the 'dead' status instead of 'fail'
238264 report .try_push_job_info (
239265 ctx .config ,
@@ -252,20 +278,20 @@ def reimage(job_config):
252278 report .try_push_job_info (ctx .config , dict (status = 'running' ))
253279
254280
255- def unlock_targets (job_config ):
281+ def unlock_targets (targets : dict , owner : str , run_name : str , job_id : str , archive_path : str ):
256282 """
257283 Unlock machines only if locked and description matches.
258284
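259285 :param targets: dict whose keys are the machines to check; only those locked with a description equal to archive_path are unlocked, on behalf of owner, run_name and job_id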
260286 """
261- machine_statuses = query .get_statuses (job_config [ ' targets' ] .keys ())
287+ machine_statuses = query .get_statuses (targets .keys ())
262288 locked = []
263289 for status in machine_statuses :
264290 name = shortname (status ['name' ])
265291 description = status ['description' ]
266292 if not status ['locked' ]:
267293 continue
268- if description != job_config [ ' archive_path' ] :
294+ if description != archive_path :
269295 log .warning (
270296 "Was going to unlock %s but it was locked by another job: %s" ,
271297 name , description
@@ -274,9 +300,8 @@ def unlock_targets(job_config):
274300 locked .append (name )
275301 if not locked :
276302 return
277- if job_config .get ("unlock_on_failure" , True ):
278- log .info ('Unlocking machines...' )
279- lock_ops .unlock_safe (locked , job_config ["owner" ], job_config ["name" ], job_config ["job_id" ])
303+ log .info ('Unlocking machines...' )
304+ lock_ops .unlock_safe (locked , owner , run_name , job_id )
280305
281306
282307def run_with_watchdog (process , job_config ):
0 commit comments