Skip to content

Commit c849cdf

Browse files
authored
Improved installation integration test flakiness (#998)
- improved `_infer_error_from_job_run` and `_infer_error_from_task_run` to also catch `KeyError` and `ValueError`
- removed retries for `Unknown` errors for installation tests
1 parent 5773358 commit c849cdf

File tree

3 files changed

+42
-27
lines changed

3 files changed

+42
-27
lines changed

src/databricks/labs/ucx/install.py

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
AlreadyExists,
2424
BadRequest,
2525
Cancelled,
26-
DatabricksError,
2726
DataLoss,
2827
DeadlineExceeded,
2928
InternalError,
@@ -436,28 +435,20 @@ def run_workflow(self, step: str):
436435
except OperationFailed as err:
437436
# currently we don't have any good message from API, so we have to work around it.
438437
job_run = self._ws.jobs.get_run(job_run_waiter.run_id)
439-
raise self._infer_nested_error(job_run) from err
438+
raise self._infer_error_from_job_run(job_run) from err
440439

441-
def _infer_nested_error(self, job_run) -> Exception:
442-
errors: list[DatabricksError] = []
440+
def _infer_error_from_job_run(self, job_run) -> Exception:
441+
errors: list[Exception] = []
443442
timeouts: list[DeadlineExceeded] = []
444443
assert job_run.tasks is not None
445444
for run_task in job_run.tasks:
446-
if not run_task.state:
445+
error = self._infer_error_from_task_run(run_task)
446+
if not error:
447447
continue
448-
if run_task.state.result_state == jobs.RunResultState.TIMEDOUT:
449-
msg = f"{run_task.task_key}: The run was stopped after reaching the timeout"
450-
timeouts.append(DeadlineExceeded(msg))
448+
if isinstance(error, DeadlineExceeded):
449+
timeouts.append(error)
451450
continue
452-
if run_task.state.result_state != jobs.RunResultState.FAILED:
453-
continue
454-
assert run_task.run_id is not None
455-
run_output = self._ws.jobs.get_run_output(run_task.run_id)
456-
if logger.isEnabledFor(logging.DEBUG):
457-
if run_output and run_output.error_trace:
458-
sys.stderr.write(run_output.error_trace)
459-
if run_output and run_output.error:
460-
errors.append(self._infer_task_exception(f"{run_task.task_key}: {run_output.error}"))
451+
errors.append(error)
461452
assert job_run.state is not None
462453
assert job_run.state.state_message is not None
463454
if len(errors) == 1:
@@ -467,8 +458,29 @@ def _infer_nested_error(self, job_run) -> Exception:
467458
return Unknown(job_run.state.state_message)
468459
return ManyError(all_errors)
469460

461+
def _infer_error_from_task_run(self, run_task: jobs.RunTask) -> Exception | None:
462+
if not run_task.state:
463+
return None
464+
if run_task.state.result_state == jobs.RunResultState.TIMEDOUT:
465+
msg = f"{run_task.task_key}: The run was stopped after reaching the timeout"
466+
return DeadlineExceeded(msg)
467+
if run_task.state.result_state != jobs.RunResultState.FAILED:
468+
return None
469+
assert run_task.run_id is not None
470+
run_output = self._ws.jobs.get_run_output(run_task.run_id)
471+
if not run_output:
472+
msg = f'No run output. {run_task.state.state_message}'
473+
return InternalError(msg)
474+
if logger.isEnabledFor(logging.DEBUG):
475+
if run_output.error_trace:
476+
sys.stderr.write(run_output.error_trace)
477+
if not run_output.error:
478+
msg = f'No error in run output. {run_task.state.state_message}'
479+
return InternalError(msg)
480+
return self._infer_task_exception(f"{run_task.task_key}: {run_output.error}")
481+
470482
@staticmethod
471-
def _infer_task_exception(haystack: str) -> DatabricksError:
483+
def _infer_task_exception(haystack: str) -> Exception:
472484
needles = [
473485
BadRequest,
474486
Unauthenticated,
@@ -490,8 +502,10 @@ def _infer_task_exception(haystack: str) -> DatabricksError:
490502
RequestLimitExceeded,
491503
Unknown,
492504
DataLoss,
505+
ValueError,
506+
KeyError,
493507
]
494-
constructors: dict[re.Pattern, type[DatabricksError]] = {
508+
constructors: dict[re.Pattern, type[Exception]] = {
495509
re.compile(r".*\[TABLE_OR_VIEW_NOT_FOUND] (.*)"): NotFound,
496510
re.compile(r".*\[SCHEMA_NOT_FOUND] (.*)"): NotFound,
497511
}

src/databricks/labs/ucx/workspace_access/generic.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,7 @@ def inner() -> list[GenericPermissionsInfo]:
418418
result = ws.api_client.do(
419419
"GET", "/api/2.0/feature-store/feature-tables/search", query={"page_token": token, "max_results": 200}
420420
)
421+
assert isinstance(result, dict)
421422
for table in result.get("feature_tables", []):
422423
feature_tables.append(GenericPermissionsInfo(table["id"], "feature-tables"))
423424

tests/integration/test_installation.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from databricks.labs.blueprint.parallel import Threads
1313
from databricks.labs.blueprint.tui import MockPrompts
1414
from databricks.labs.blueprint.wheels import WheelsV2
15-
from databricks.sdk.errors import InvalidParameterValue, NotFound, Unknown
15+
from databricks.sdk.errors import InvalidParameterValue, NotFound
1616
from databricks.sdk.retries import retried
1717
from databricks.sdk.service import compute, sql
1818
from databricks.sdk.service.iam import PermissionLevel
@@ -36,7 +36,7 @@
3636

3737

3838
@pytest.fixture
39-
def new_installation(ws, sql_backend, env_or_skip, inventory_schema, make_random, make_cluster_policy):
39+
def new_installation(ws, sql_backend, env_or_skip, inventory_schema, make_random):
4040
cleanup = []
4141

4242
def factory(config_transform: Callable[[WorkspaceConfig], WorkspaceConfig] | None = None):
@@ -98,7 +98,7 @@ def factory(config_transform: Callable[[WorkspaceConfig], WorkspaceConfig] | Non
9898
pending.uninstall()
9999

100100

101-
@retried(on=[NotFound, Unknown, TimeoutError], timeout=timedelta(minutes=5))
101+
@retried(on=[NotFound, TimeoutError], timeout=timedelta(minutes=5))
102102
def test_job_failure_propagates_correct_error_message_and_logs(ws, sql_backend, new_installation):
103103
install = new_installation()
104104

@@ -113,7 +113,7 @@ def test_job_failure_propagates_correct_error_message_and_logs(ws, sql_backend,
113113
assert len(workflow_run_logs) == 1
114114

115115

116-
@retried(on=[NotFound, Unknown, InvalidParameterValue], timeout=timedelta(minutes=3))
116+
@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=3))
117117
def test_job_cluster_policy(ws, new_installation):
118118
install = new_installation(lambda wc: replace(wc, override_clusters=None))
119119
user_name = ws.current_user.me().user_name
@@ -154,7 +154,7 @@ def test_new_job_cluster_with_policy_assessment(
154154
assert before[ws_group_a.display_name] == PermissionLevel.CAN_USE
155155

156156

157-
@retried(on=[NotFound, Unknown, InvalidParameterValue], timeout=timedelta(minutes=10))
157+
@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=10))
158158
def test_running_real_assessment_job(
159159
ws, new_installation, make_ucx_group, make_cluster_policy, make_cluster_policy_permissions
160160
):
@@ -175,7 +175,7 @@ def test_running_real_assessment_job(
175175
assert before[ws_group_a.display_name] == PermissionLevel.CAN_USE
176176

177177

178-
@retried(on=[NotFound, Unknown, InvalidParameterValue], timeout=timedelta(minutes=5))
178+
@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=5))
179179
def test_running_real_migrate_groups_job(
180180
ws, sql_backend, new_installation, make_ucx_group, make_cluster_policy, make_cluster_policy_permissions
181181
):
@@ -208,7 +208,7 @@ def test_running_real_migrate_groups_job(
208208
assert found[f"{install.config.renamed_group_prefix}{ws_group_a.display_name}"] == PermissionLevel.CAN_USE
209209

210210

211-
@retried(on=[NotFound, Unknown, InvalidParameterValue], timeout=timedelta(minutes=5))
211+
@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=5))
212212
def test_running_real_validate_groups_permissions_job(
213213
ws, sql_backend, new_installation, make_group, make_query, make_query_permissions
214214
):
@@ -264,7 +264,7 @@ def test_running_real_validate_groups_permissions_job_fails(
264264
request_object_type="cluster-policies", request_object_id=cluster_policy.policy_id, access_control_list=[]
265265
)
266266

267-
with pytest.raises(Unknown, match=r"Detected \d+ failures: ValueError"):
267+
with pytest.raises(ValueError):
268268
install.run_workflow("validate-groups-permissions")
269269

270270

0 commit comments

Comments
 (0)