Added `databricks labs ucx repair-run --step ...` CLI command to repair the run of any failed workflow, such as assessment, migrate-groups, etc. (#724)

prajin-29 · web-flow · commit 5ff8bc149b96 · 2023-12-28T17:02:02.000+01:00
diff --git a/labs.yml b/labs.yml
@@ -50,6 +50,12 @@ commands:
   - name: validate-external-locations
     description: validates and provides mapping to external table to external location and shared generation tf scripts
 
+  - name: repair-run
+    description: Repair Run the Failed Job
+    flags:
+      - name: step
+        description: name of the step
+
   - name: revert-migrated-tables
     description: remove notation on a migrated table for re-migration
     flags:
diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py
@@ -114,6 +114,15 @@ def ensure_assessment_run():
         workspace_installer.validate_and_run("assessment")
 
 
+def repair_run(step):
+    """Trigger a repair run for the workflow named by the ``--step`` flag.
+
+    Builds a `WorkspaceInstaller` around a new `WorkspaceClient` and hands the
+    step name over to its `repair_run` method.
+
+    :param step: name of the workflow step to repair.
+    :raises KeyError: when `step` is empty or missing.
+    """
+    if not step:
+        raise KeyError("You did not specify --step")
+    workspace_installer = WorkspaceInstaller(WorkspaceClient())
+    logger.info(f"Repair Running {step} Job")
+    workspace_installer.repair_run(step)
+
+
 def revert_migrated_tables(schema: str, table: str, *, delete_managed: bool = False):
     ws = WorkspaceClient()
     prompts = Prompts()
@@ -149,6 +158,7 @@ def revert_migrated_tables(schema: str, table: str, *, delete_managed: bool = Fa
     "validate-external-locations": validate_external_locations,
     "ensure-assessment-run": ensure_assessment_run,
     "skip": skip,
+    "repair-run": repair_run,
     "revert-migrated-tables": revert_migrated_tables,
 }
 
diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py
@@ -813,6 +813,29 @@ def latest_job_status(self) -> list[dict]:
                 continue
         return latest_status
 
+    def repair_run(self, workflow):
+        """Re-run the failed tasks of the most recent run of `workflow`.
+
+        Looks up the job id recorded for `workflow` in the installation state
+        and fetches a single run of that job (treated as the latest one). Only
+        when that run ended as FAILED does it ask the Jobs API to rerun all
+        failed tasks and open the run page in the browser. In every other case
+        (no job recorded, no run yet, run not finished or not failed) a warning
+        is logged and nothing is repaired. `InvalidParameterValue` errors are
+        logged as warnings and not propagated.
+
+        :param workflow: workflow (step) name, used as the key into the recorded jobs.
+        """
+        try:
+            job_id = self._state.jobs.get(workflow)
+            if not job_id:
+                logger.warning(f"{workflow} job does not exists hence skipping Repair Run")
+                return
+            job_runs = list(self._ws.jobs.list_runs(job_id=job_id, limit=1))
+            if not job_runs:
+                logger.warning(f"{workflow} job is not initialized yet. Can't trigger repair run now")
+                return
+            latest_job_run = job_runs[0]
+            state = latest_job_run.state
+            # `result_state` stays unset until the run reaches a terminal state. Such a run
+            # cannot be repaired, so route it to the same warning as any other non-FAILED
+            # run instead of crashing with AttributeError on `.value`.
+            result_state = state.result_state if state else None
+            if result_state is None or result_state.value != "FAILED":
+                logger.warning(f"{workflow} job is not in FAILED state hence skipping Repair Run")
+                return
+            run_id = latest_job_run.run_id
+            job_url = f"{self._ws.config.host}#job/{job_id}/run/{run_id}"
+            logger.debug(f"Repair Running {workflow} job: {job_url}")
+            self._ws.jobs.repair_run(run_id=run_id, rerun_all_failed_tasks=True)
+            webbrowser.open(job_url)
+        except InvalidParameterValue as e:
+            logger.warning(f"skipping {workflow}: {e}")
+
     def uninstall(self):
         if self._prompts and not self._prompts.confirm(
             "Do you want to uninstall ucx from the workspace too, this would "
diff --git a/tests/integration/test_installation.py b/tests/integration/test_installation.py
@@ -149,6 +149,25 @@ def test_running_real_remove_backup_groups_job(ws, sql_backend, new_installation
         ws.groups.get(ws_group_a.id)
 
 
+@retried(on=[NotFound, InvalidParameterValue, OperationFailed], timeout=timedelta(minutes=10))
+def test_repair_run_workflow_job(ws, mocker, new_installation, sql_backend):
+    """Make a workflow run fail, repair it, and expect the repaired run to end in SUCCESS."""
+    import time  # local import: only the polling loop below needs it
+
+    install = new_installation()
+    mocker.patch("webbrowser.open")
+    # With the inventory schema dropped, the workflow run below is expected to fail.
+    sql_backend.execute(f"DROP SCHEMA {install.current_config.inventory_database} CASCADE")
+    with pytest.raises(OperationFailed):
+        install.run_workflow("099-destroy-schema")
+
+    # Restore the schema so the repaired run has a chance to succeed.
+    sql_backend.execute(f"CREATE SCHEMA IF NOT EXISTS {install.current_config.inventory_database}")
+
+    install.repair_run("099-destroy-schema")
+    workflow_job_id = install._state.jobs["099-destroy-schema"]
+    # `result_state` is None until the run finishes: poll with a pause and an upper
+    # bound instead of spinning on the Jobs API without limit.
+    deadline = time.monotonic() + 600
+    run_status = None
+    while run_status is None:
+        if time.monotonic() > deadline:
+            raise TimeoutError("repaired run did not reach a terminal state within 10 minutes")
+        job_runs = list(install._ws.jobs.list_runs(job_id=workflow_job_id, limit=1))
+        run_status = job_runs[0].state.result_state
+        if run_status is None:
+            time.sleep(5)
+    assert run_status.value == "SUCCESS"
+
+
 @retried(on=[NotFound], timeout=timedelta(minutes=5))
 def test_uninstallation(ws, sql_backend, new_installation):
     install = new_installation()
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
@@ -5,7 +5,7 @@
 from databricks.sdk.service import iam
 from databricks.sdk.service.iam import ComplexValue, User
 
-from databricks.labs.ucx.cli import skip
+from databricks.labs.ucx.cli import repair_run, skip
 
 
 @pytest.fixture
@@ -33,3 +33,21 @@ def test_skip_no_ucx(caplog, mocker):
     mocker.patch("databricks.labs.ucx.installer.InstallationManager.for_user", return_value=None)
     skip(schema="schema", table="table")
     assert [rec.message for rec in caplog.records if "UCX configuration" in rec.message]
+
+
+def test_repair_run(mocker, caplog):
+    """`repair_run` forwards the step name to `WorkspaceInstaller.repair_run` and captures no log messages."""
+    mocker.patch("databricks.sdk.WorkspaceClient.__init__", return_value=None)
+    mocker.patch("databricks.labs.ucx.install.WorkspaceInstaller.__init__", return_value=None)
+    delegate = mocker.patch("databricks.labs.ucx.install.WorkspaceInstaller.repair_run", return_value=None)
+    repair_run("assessment")
+    # The patch installs a plain mock on the class, so the call is recorded without `self`.
+    delegate.assert_called_once_with("assessment")
+    assert caplog.messages == []
+
+
+def test_no_step_in_repair_run(mocker, caplog):
+    """An empty step is rejected with a KeyError that carries the usage hint."""
+    mocker.patch("databricks.sdk.WorkspaceClient.__init__", return_value=None)
+    mocker.patch("databricks.labs.ucx.install.WorkspaceInstaller.__init__", return_value=None)
+    mocker.patch("databricks.labs.ucx.install.WorkspaceInstaller.repair_run", return_value=None)
+    # pytest.raises fails the test when nothing is raised; a bare try/except would pass silently.
+    with pytest.raises(KeyError) as raised:
+        repair_run("")
+    assert raised.value.args[0] == "You did not specify --step"
diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py
@@ -17,6 +17,7 @@
     GlobalInitScriptDetailsWithContent,
     Policy,
 )
+from databricks.sdk.service.jobs import BaseRun, RunResultState, RunState
 from databricks.sdk.service.sql import (
     Dashboard,
     DataSource,
@@ -1002,3 +1003,77 @@ def test_uninstall_no_config_file(ws, mocker):
     ws.workspace.download = lambda _: io.BytesIO(config_bytes)
     ws.workspace.get_status.side_effect = NotFound(...)
     install.uninstall()
+
+
+def test_repair_run(ws, mocker):
+    """A latest run in FAILED state is repaired by rerunning all of its failed tasks."""
+    base = [
+        BaseRun(
+            job_clusters=None,
+            job_id=677268692725050,
+            job_parameters=None,
+            number_in_job=725118654200173,
+            run_id=725118654200173,
+            run_name="[UCX] assessment",
+            state=RunState(result_state=RunResultState.FAILED),
+        )
+    ]
+    install = WorkspaceInstaller(ws, promtps=MockPrompts({".*": ""}))
+    mocker.patch("webbrowser.open")
+    install._state.jobs = {"assessment": "123"}
+    ws.jobs.list_runs.return_value = base
+    install.repair_run("assessment")
+    # Assumes the `ws` fixture is a mock, as the `return_value` stubbing above implies.
+    ws.jobs.repair_run.assert_called_once_with(run_id=725118654200173, rerun_all_failed_tasks=True)
+
+
+def test_repair_run_success(ws, caplog):
+    """A latest run that ended in SUCCESS produces the 'not in FAILED state' warning."""
+    succeeded_run = BaseRun(
+        job_clusters=None,
+        job_id=677268692725050,
+        job_parameters=None,
+        number_in_job=725118654200173,
+        run_id=725118654200173,
+        run_name="[UCX] assessment",
+        state=RunState(result_state=RunResultState.SUCCESS),
+    )
+    installer = WorkspaceInstaller(ws)
+    installer._state.jobs = {"assessment": "123"}
+    ws.jobs.list_runs.return_value = [succeeded_run]
+    ws.jobs.list_runs.repair_run = None
+    installer.repair_run("assessment")
+    captured_log = caplog.text
+    assert "job is not in FAILED state" in captured_log
+
+
+def test_repair_run_no_job_id(ws):
+    """Repairing a workflow with no recorded job id is skipped without calling the Jobs API."""
+    install = WorkspaceInstaller(ws)
+    install._state.jobs = {"assessment": ""}
+    # "workflow" is not among the recorded jobs, so no run listing is needed or stubbed.
+    install.repair_run("workflow")
+    # Assumes the `ws` fixture is a mock, like in the sibling tests that stub `return_value`.
+    ws.jobs.list_runs.assert_not_called()
+    ws.jobs.repair_run.assert_not_called()
+
+
+def test_repair_run_no_job_run(ws):
+    """A recorded job that has no runs yet does not trigger a repair request."""
+    install = WorkspaceInstaller(ws)
+    install._state.jobs = {"assessment": "677268692725050"}
+    # An empty list mirrors "no runs" more faithfully than an empty string.
+    ws.jobs.list_runs.return_value = []
+    install.repair_run("assessment")
+    ws.jobs.repair_run.assert_not_called()
+
+
+def test_repair_run_exception(ws):
+    """An InvalidParameterValue raised while listing runs is swallowed and no repair is requested."""
+    install = WorkspaceInstaller(ws)
+    install._state.jobs = {"assessment": "123"}
+    ws.jobs.list_runs.side_effect = InvalidParameterValue("Workflow does not exists")
+    # Must not raise: the installer only logs a warning for this error.
+    install.repair_run("assessment")
+    ws.jobs.repair_run.assert_not_called()