77import time
88import webbrowser
99from dataclasses import replace
10- from datetime import datetime
10+ from datetime import datetime , timedelta
1111from pathlib import Path
1212from typing import Any
1313
4242 Unauthenticated ,
4343 Unknown ,
4444)
45+ from databricks .sdk .retries import retried
4546from databricks .sdk .service import compute , jobs
4647from databricks .sdk .service .jobs import RunLifeCycleState , RunResultState
4748from databricks .sdk .service .sql import EndpointInfoWarehouseType , SpotInstancePolicy
@@ -171,6 +172,7 @@ def __init__(
171172 promtps : Prompts | None = None ,
172173 wheels : Wheels | None = None ,
173174 sql_backend : SqlBackend | None = None ,
175+ verify_timeout : timedelta | None = None ,
174176 ):
175177 if "DATABRICKS_RUNTIME_VERSION" in os .environ :
176178 msg = "WorkspaceInstaller is not supposed to be executed in Databricks Runtime"
@@ -189,6 +191,9 @@ def __init__(
189191 self ._this_file = Path (__file__ )
190192 self ._dashboards : dict [str , str ] = {}
191193 self ._install_override_clusters = None
194+ if verify_timeout is None :
195+ verify_timeout = timedelta (minutes = 2 )
196+ self ._verify_timeout = verify_timeout
192197
193198 def run (self ):
194199 logger .info (f"Installing UCX v{ self ._product_info .version ()} " )
@@ -906,6 +911,14 @@ def latest_job_status(self) -> list[dict]:
906911 continue
907912 return latest_status
908913
914+ def _get_result_state (self , job_id ):
915+ job_runs = list (self ._ws .jobs .list_runs (job_id = job_id , limit = 1 ))
916+ latest_job_run = job_runs [0 ]
917+ if not latest_job_run .state .result_state :
918+ raise AttributeError ("no result state in job run" )
919+ job_state = latest_job_run .state .result_state .value
920+ return job_state
921+
909922 def repair_run (self , workflow ):
910923 try :
911924 job_id = self ._state .jobs .get (workflow )
@@ -917,17 +930,26 @@ def repair_run(self, workflow):
917930 logger .warning (f"{ workflow } job is not initialized yet. Can't trigger repair run now" )
918931 return
919932 latest_job_run = job_runs [0 ]
920- state = latest_job_run .state
921- if state .result_state .value != "FAILED" :
933+ retry_on_attribute_error = retried (on = [AttributeError ], timeout = self ._verify_timeout )
934+ retried_check = retry_on_attribute_error (self ._get_result_state )
935+ state_value = retried_check (job_id )
936+
937+ logger .info (f"The status for the latest run is { state_value } " )
938+
939+ if state_value != "FAILED" :
922940 logger .warning (f"{ workflow } job is not in FAILED state hence skipping Repair Run" )
923941 return
924942 run_id = latest_job_run .run_id
943+ run_details = self ._ws .jobs .get_run (run_id = run_id , include_history = True )
944+ latest_repair_run_id = run_details .repair_history [- 1 ].id
925945 job_url = f"{ self ._ws .config .host } #job/{ job_id } /run/{ run_id } "
926946 logger .debug (f"Repair Running { workflow } job: { job_url } " )
927- self ._ws .jobs .repair_run (run_id = run_id , rerun_all_failed_tasks = True )
947+ self ._ws .jobs .repair_run (run_id = run_id , rerun_all_failed_tasks = True , latest_repair_id = latest_repair_run_id )
928948 webbrowser .open (job_url )
929949 except InvalidParameterValue as e :
930950 logger .warning (f"skipping { workflow } : { e } " )
951+ except TimeoutError :
952+ logger .warning (f"Skipping the { workflow } due to time out. Please try after sometime" )
931953
932954 def uninstall (self ):
933955 if self ._prompts and not self ._prompts .confirm (
0 commit comments