Skip to content

Commit 9d4724f

Browse files
authored
Fixed handling of potentially corrupt state.json of UCX workflows (#2673)
Fix #2667
1 parent 0b474d5 commit 9d4724f

File tree

5 files changed

+73
-108
lines changed

5 files changed

+73
-108
lines changed

src/databricks/labs/ucx/install.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from databricks.labs.lsql.dashboards import DashboardMetadata, Dashboards
3030
from databricks.labs.lsql.deployment import SchemaDeployer
3131
from databricks.sdk import WorkspaceClient, AccountClient
32-
from databricks.sdk.core import with_user_agent_extra
32+
from databricks.sdk.useragent import with_extra
3333
from databricks.sdk.errors import (
3434
AlreadyExists,
3535
BadRequest,
@@ -87,7 +87,6 @@
8787
NUM_USER_ATTEMPTS = 10 # number of attempts user gets at answering a question
8888

8989
logger = logging.getLogger(__name__)
90-
with_user_agent_extra("cmd", "install")
9190

9291

9392
def deploy_schema(sql_backend: SqlBackend, inventory_schema: str):
@@ -679,17 +678,7 @@ def _remove_secret_scope(self):
679678
logger.error("Secret scope already deleted")
680679

681680
def _remove_jobs(self):
682-
logger.info("Deleting jobs")
683-
if not self._install_state.jobs:
684-
logger.error("No jobs present or jobs already deleted")
685-
return
686-
for step_name, job_id in self._install_state.jobs.items():
687-
try:
688-
logger.info(f"Deleting {step_name} job_id={job_id}.")
689-
self._ws.jobs.delete(job_id)
690-
except InvalidParameterValue:
691-
logger.error(f"Already deleted: {step_name} job_id={job_id}.")
692-
continue
681+
self._workflows_installer.remove_jobs()
693682

694683
def _remove_warehouse(self):
695684
try:
@@ -894,6 +883,7 @@ def _get_collection_workspace(
894883

895884

896885
if __name__ == "__main__":
886+
with_extra("cmd", "install")
897887
logger = get_logger(__file__)
898888
if is_in_debug():
899889
logging.getLogger('databricks').setLevel(logging.DEBUG)

src/databricks/labs/ucx/installer/workflows.py

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -478,19 +478,46 @@ def create_jobs(self) -> None:
478478
)
479479
self._deploy_workflow(workflow_name, settings)
480480

481-
for workflow_name, job_id in self._install_state.jobs.items():
482-
if workflow_name not in desired_workflows:
483-
try:
484-
logger.info(f"Removing job_id={job_id}, as it is no longer needed")
485-
self._ws.jobs.delete(job_id)
486-
except InvalidParameterValue:
487-
logger.warning(f"step={workflow_name} does not exist anymore for some reason")
488-
continue
489-
481+
self.remove_jobs(keep=desired_workflows)
490482
self._install_state.save()
491483
self._create_debug(remote_wheels)
492484
self._create_readme()
493485

486+
def remove_jobs(self, *, keep: set[str] | None = None) -> None:
487+
for workflow_name, job_id in self._install_state.jobs.items():
488+
if keep and workflow_name in keep:
489+
continue
490+
try:
491+
if not self._is_managed_job_failsafe(int(job_id)):
492+
logger.warning(f"Corrupt installation state. Skipping job_id={job_id} as it is not managed by UCX")
493+
continue
494+
logger.info(f"Removing job_id={job_id}, as it is no longer needed")
495+
self._ws.jobs.delete(job_id)
496+
except InvalidParameterValue:
497+
logger.warning(f"step={workflow_name} does not exist anymore for some reason")
498+
continue
499+
500+
# see https://github.com/databrickslabs/ucx/issues/2667
501+
def _is_managed_job_failsafe(self, job_id: int) -> bool:
502+
install_folder = self._installation.install_folder()
503+
try:
504+
return self._is_managed_job(job_id, install_folder)
505+
except ResourceDoesNotExist:
506+
return False
507+
except InvalidParameterValue:
508+
return False
509+
510+
def _is_managed_job(self, job_id: int, install_folder: str) -> bool:
511+
job = self._ws.jobs.get(job_id)
512+
if not job.settings or not job.settings.tasks:
513+
return False
514+
for task in job.settings.tasks:
515+
if task.notebook_task and task.notebook_task.notebook_path.startswith(install_folder):
516+
return True
517+
if task.python_wheel_task and task.python_wheel_task.package_name == "databricks_labs_ucx":
518+
return True
519+
return False
520+
494521
@property
495522
def _config_file(self):
496523
return f"{self._installation.install_folder()}/config.yml"

src/databricks/labs/ucx/source_code/known.json

Lines changed: 1 addition & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1824,86 +1824,7 @@
18241824
"databricks.labs.pylint": []
18251825
},
18261826
"databricks-labs-pytester": {
1827-
"databricks": [],
1828-
"databricks.labs": [],
18291827
"databricks.labs.pytester": [],
1830-
"databricks.labs.pytester.fixtures": [],
1831-
"databricks.labs.pytester.fixtures.baseline": [
1832-
{
1833-
"code": "direct-filesystem-access",
1834-
"message": "The use of direct filesystem references is deprecated: /"
1835-
}
1836-
],
1837-
"databricks.labs.pytester.fixtures.catalog": [
1838-
{
1839-
"code": "direct-filesystem-access",
1840-
"message": "The use of direct filesystem references is deprecated: /"
1841-
},
1842-
{
1843-
"code": "direct-filesystem-access",
1844-
"message": "The use of direct filesystem references is deprecated: dbfs:/tmp/ucx_test_"
1845-
},
1846-
{
1847-
"code": "direct-filesystem-access",
1848-
"message": "The use of direct filesystem references is deprecated: dbfs:/user/hive/warehouse/"
1849-
},
1850-
{
1851-
"code": "direct-filesystem-access",
1852-
"message": "The use of direct filesystem references is deprecated: s3a://databricks-datasets-oregon/delta-sharing/share/open-datasets.share"
1853-
}
1854-
],
1855-
"databricks.labs.pytester.fixtures.compute": [],
1856-
"databricks.labs.pytester.fixtures.connect": [],
1857-
"databricks.labs.pytester.fixtures.environment": [],
1858-
"databricks.labs.pytester.fixtures.iam": [
1859-
{
1860-
"code": "direct-filesystem-access",
1861-
"message": "The use of direct filesystem references is deprecated: /members"
1862-
},
1863-
{
1864-
"code": "direct-filesystem-access",
1865-
"message": "The use of direct filesystem references is deprecated: /users/groups/"
1866-
}
1867-
],
1868-
"databricks.labs.pytester.fixtures.ml": [
1869-
{
1870-
"code": "direct-filesystem-access",
1871-
"message": "The use of direct filesystem references is deprecated: /"
1872-
},
1873-
{
1874-
"code": "direct-filesystem-access",
1875-
"message": "The use of direct filesystem references is deprecated: /api/2.0/feature-store/feature-tables/create"
1876-
},
1877-
{
1878-
"code": "direct-filesystem-access",
1879-
"message": "The use of direct filesystem references is deprecated: /api/2.0/feature-store/feature-tables/delete"
1880-
}
1881-
],
1882-
"databricks.labs.pytester.fixtures.notebooks": [
1883-
{
1884-
"code": "direct-filesystem-access",
1885-
"message": "The use of direct filesystem references is deprecated: /Repos/"
1886-
},
1887-
{
1888-
"code": "direct-filesystem-access",
1889-
"message": "The use of direct filesystem references is deprecated: /Users/"
1890-
},
1891-
{
1892-
"code": "direct-filesystem-access",
1893-
"message": "The use of direct filesystem references is deprecated: /dummy-"
1894-
},
1895-
{
1896-
"code": "direct-filesystem-access",
1897-
"message": "The use of direct filesystem references is deprecated: /sdk-"
1898-
}
1899-
],
1900-
"databricks.labs.pytester.fixtures.permissions": [],
1901-
"databricks.labs.pytester.fixtures.plugin": [],
1902-
"databricks.labs.pytester.fixtures.redash": [],
1903-
"databricks.labs.pytester.fixtures.secrets": [],
1904-
"databricks.labs.pytester.fixtures.sql": [],
1905-
"databricks.labs.pytester.fixtures.unwrap": [],
1906-
"databricks.labs.pytester.fixtures.watchdog": [],
19071828
"tests.resources.hatchling-whl.src.hatchling_whl": []
19081829
},
19091830
"databricks-labs-ucx": {
@@ -30000,4 +29921,4 @@
3000029921
"zipp.compat.py310": [],
3000129922
"zipp.glob": []
3000229923
}
30003-
}
29924+
}

src/databricks/labs/ucx/uninstall.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
import logging
22

33
from databricks.sdk import WorkspaceClient
4+
from databricks.sdk.useragent import with_extra
45

56
from databricks.labs.ucx.__about__ import __version__
67
from databricks.labs.ucx.install import WorkspaceInstallation
78

89
logger = logging.getLogger("databricks.labs.ucx.install")
910

1011
if __name__ == "__main__":
12+
with_extra("cmd", "uninstall")
1113
logger.setLevel("INFO")
1214
ws = WorkspaceClient(product="ucx", product_version=__version__)
1315
installer = WorkspaceInstallation.current(ws)

tests/unit/install/test_install.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import databricks.labs.ucx.installer.mixins
3535
import databricks.labs.ucx.uninstall # noqa
3636
from databricks.labs.ucx.config import WorkspaceConfig
37+
from databricks.labs.ucx.framework.tasks import Task
3738
from databricks.labs.ucx.install import AccountInstaller, WorkspaceInstallation, WorkspaceInstaller, extract_major_minor
3839
from databricks.labs.ucx.installer.workflows import DeployedWorkflows, WorkflowsDeployment
3940
from databricks.labs.ucx.runtime import Workflows
@@ -86,7 +87,7 @@ def mock_installation_extra_jobs():
8687
{
8788
'state.json': {
8889
'resources': {
89-
'jobs': {"assessment": "123", "extra_job": "123"},
90+
'jobs': {"assessment": "123", "extra_job": "124", "other_job": "125"},
9091
'dashboards': {'assessment_main': 'abc', 'assessment_estimates': 'def'},
9192
}
9293
}
@@ -615,9 +616,10 @@ def test_remove_jobs_with_state_missing_job(ws, caplog, mock_installation_with_j
615616
config, mock_installation_with_jobs, install_state, sql_backend, ws, workflows_installer, prompts, PRODUCT_INFO
616617
)
617618

618-
with caplog.at_level('ERROR'):
619+
with caplog.at_level('WARNING'):
619620
workspace_installation.uninstall()
620-
assert 'Already deleted: assessment job_id=123.' in caplog.messages
621+
failure = 'Corrupt installation state. Skipping job_id=123 as it is not managed by UCX'
622+
assert failure in caplog.messages
621623

622624
mock_installation_with_jobs.assert_removed()
623625
wheels.upload_to_wsfs.assert_not_called()
@@ -1282,14 +1284,18 @@ def test_remove_jobs(ws, caplog, mock_installation_extra_jobs, any_prompt):
12821284
sql_backend = MockBackend()
12831285
install_state = InstallState.from_installation(mock_installation_extra_jobs)
12841286
wheels = create_autospec(WheelsV2)
1287+
1288+
def dummy_task(*_):
1289+
pass
1290+
12851291
workflows_installation = WorkflowsDeployment(
12861292
WorkspaceConfig(inventory_database="...", policy_id='123'),
12871293
mock_installation_extra_jobs,
12881294
install_state,
12891295
ws,
12901296
wheels,
12911297
PRODUCT_INFO,
1292-
[],
1298+
[Task('assessment', 'some', '...', dummy_task)],
12931299
)
12941300

12951301
workspace_installation = WorkspaceInstallation(
@@ -1303,9 +1309,28 @@ def test_remove_jobs(ws, caplog, mock_installation_extra_jobs, any_prompt):
13031309
PRODUCT_INFO,
13041310
)
13051311

1306-
workspace_installation.run()
1307-
ws.jobs.delete.assert_called_with("123")
1312+
def job_side_effect(job_id):
1313+
tasks = {
1314+
123: [jobs.Task('x', notebook_task=jobs.NotebookTask(notebook_path='~/mock/assessment'))],
1315+
124: [jobs.Task('y', python_wheel_task=jobs.PythonWheelTask('databricks_labs_ucx', 'runtime'))],
1316+
125: [jobs.Task('z', notebook_task=jobs.NotebookTask(notebook_path='outside-of-ucx'))],
1317+
}
1318+
return jobs.Job(
1319+
settings=jobs.JobSettings(
1320+
tasks=tasks[job_id],
1321+
),
1322+
)
1323+
1324+
ws.jobs.get.side_effect = job_side_effect
1325+
1326+
with caplog.at_level('WARNING'):
1327+
workspace_installation.run()
1328+
1329+
job_deletes = {_.args[0] for _ in ws.jobs.delete.mock_calls}
1330+
assert len(job_deletes) == 1
1331+
assert '124' in job_deletes
13081332
wheels.upload_to_wsfs.assert_called()
1333+
assert 'Corrupt installation state. Skipping job_id=125 as it is not managed by UCX' in caplog.messages
13091334

13101335

13111336
def test_remove_jobs_already_deleted(ws, caplog, mock_installation_extra_jobs, any_prompt):

0 commit comments

Comments
 (0)