Skip to content

Commit b45fa41

Browse files
authored
Fixed databricks labs ucx repair-run command to execute correctly (#801)
1 parent b32d885 commit b45fa41

File tree

2 files changed

+47
-4
lines changed

2 files changed

+47
-4
lines changed

src/databricks/labs/ucx/install.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import time
88
import webbrowser
99
from dataclasses import replace
10-
from datetime import datetime
10+
from datetime import datetime, timedelta
1111
from pathlib import Path
1212
from typing import Any
1313

@@ -42,6 +42,7 @@
4242
Unauthenticated,
4343
Unknown,
4444
)
45+
from databricks.sdk.retries import retried
4546
from databricks.sdk.service import compute, jobs
4647
from databricks.sdk.service.jobs import RunLifeCycleState, RunResultState
4748
from databricks.sdk.service.sql import EndpointInfoWarehouseType, SpotInstancePolicy
@@ -171,6 +172,7 @@ def __init__(
171172
promtps: Prompts | None = None,
172173
wheels: Wheels | None = None,
173174
sql_backend: SqlBackend | None = None,
175+
verify_timeout: timedelta | None = None,
174176
):
175177
if "DATABRICKS_RUNTIME_VERSION" in os.environ:
176178
msg = "WorkspaceInstaller is not supposed to be executed in Databricks Runtime"
@@ -189,6 +191,9 @@ def __init__(
189191
self._this_file = Path(__file__)
190192
self._dashboards: dict[str, str] = {}
191193
self._install_override_clusters = None
194+
if verify_timeout is None:
195+
verify_timeout = timedelta(minutes=2)
196+
self._verify_timeout = verify_timeout
192197

193198
def run(self):
194199
logger.info(f"Installing UCX v{self._product_info.version()}")
@@ -906,6 +911,14 @@ def latest_job_status(self) -> list[dict]:
906911
continue
907912
return latest_status
908913

914+
def _get_result_state(self, job_id):
    """Return the result-state value of the first run listed for a job.

    The first run yielded by ``jobs.list_runs`` is treated as the latest
    run. Only that single item is pulled from the listing, so the rest of
    the run history is never fetched.

    Args:
        job_id: Identifier of the job whose runs are listed.

    Returns:
        The ``value`` of the run's ``result_state`` (compared against
        ``"FAILED"`` by ``repair_run``).

    Raises:
        IndexError: If the job has no runs at all.
        AttributeError: If the run has no result state yet. ``repair_run``
            retries on this exception until its verify timeout expires.
    """
    # list_runs is an iterator; wrapping it in list() would walk every page
    # of the history (one run per request with limit=1) just to read item 0.
    latest_job_run = next(iter(self._ws.jobs.list_runs(job_id=job_id, limit=1)), None)
    if latest_job_run is None:
        raise IndexError(f"no runs found for job {job_id}")
    state = latest_job_run.state
    # A missing state and a missing result_state both mean "not finished yet".
    if state is None or not state.result_state:
        raise AttributeError("no result state in job run")
    return state.result_state.value
921+
909922
def repair_run(self, workflow):
910923
try:
911924
job_id = self._state.jobs.get(workflow)
@@ -917,17 +930,26 @@ def repair_run(self, workflow):
917930
logger.warning(f"{workflow} job is not initialized yet. Can't trigger repair run now")
918931
return
919932
latest_job_run = job_runs[0]
920-
state = latest_job_run.state
921-
if state.result_state.value != "FAILED":
933+
retry_on_attribute_error = retried(on=[AttributeError], timeout=self._verify_timeout)
934+
retried_check = retry_on_attribute_error(self._get_result_state)
935+
state_value = retried_check(job_id)
936+
937+
logger.info(f"The status for the latest run is {state_value}")
938+
939+
if state_value != "FAILED":
922940
logger.warning(f"{workflow} job is not in FAILED state hence skipping Repair Run")
923941
return
924942
run_id = latest_job_run.run_id
943+
run_details = self._ws.jobs.get_run(run_id=run_id, include_history=True)
944+
latest_repair_run_id = run_details.repair_history[-1].id
925945
job_url = f"{self._ws.config.host}#job/{job_id}/run/{run_id}"
926946
logger.debug(f"Repair Running {workflow} job: {job_url}")
927-
self._ws.jobs.repair_run(run_id=run_id, rerun_all_failed_tasks=True)
947+
self._ws.jobs.repair_run(run_id=run_id, rerun_all_failed_tasks=True, latest_repair_id=latest_repair_run_id)
928948
webbrowser.open(job_url)
929949
except InvalidParameterValue as e:
930950
logger.warning(f"skipping {workflow}: {e}")
951+
except TimeoutError:
952+
logger.warning(f"Skipping the {workflow} due to time out. Please try after sometime")
931953

932954
def uninstall(self):
933955
if self._prompts and not self._prompts.confirm(

tests/unit/test_install.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import io
2+
from datetime import timedelta
23
from pathlib import Path
34
from typing import Any
45
from unittest.mock import MagicMock, create_autospec, patch
@@ -1179,3 +1180,23 @@ def test_repair_run_exception(ws):
11791180
install._state.jobs = {"assessment": "123"}
11801181
ws.jobs.list_runs.side_effect = InvalidParameterValue("Workflow does not exists")
11811182
install.repair_run("assessment")
1183+
1184+
1185+
def test_repair_run_result_state(ws, caplog):
    """Check that repair_run logs the retry-later warning for a run without a result state.

    The only listed run carries ``result_state=None``; with a 5 second
    verify timeout the call is expected to end with the
    "Please try after sometime" message in the captured log.
    """
    pending_run = BaseRun(
        job_clusters=None,
        job_id=677268692725050,
        job_parameters=None,
        number_in_job=725118654200173,
        run_id=725118654200173,
        run_name="[UCX] assessment",
        state=RunState(result_state=None),
    )
    short_timeout = timedelta(seconds=5)
    installer = WorkspaceInstaller(ws, verify_timeout=short_timeout)
    installer._state.jobs = {"assessment": "123"}
    # Every list_runs call returns the same single run that never finished.
    ws.jobs.list_runs.return_value = [pending_run]
    ws.jobs.list_runs.repair_run = None
    installer.repair_run("assessment")
    expected_message = "Please try after sometime"
    assert expected_message in caplog.text

0 commit comments

Comments
 (0)