Commit 6496b3b

Checking pipeline cluster config and cluster policy in 'crawl_pipelines' task (#864)
1 parent 0d83215 commit 6496b3b

3 files changed (+70, -28 lines)

src/databricks/labs/ucx/assessment/pipelines.py

Lines changed: 10 additions & 0 deletions
@@ -50,6 +50,16 @@ def _assess_pipelines(self, all_pipelines) -> Iterable[PipelineInfo]:
             pipeline_config = pipeline_response.spec.configuration
             if pipeline_config:
                 failures.extend(self.check_spark_conf(pipeline_config, "pipeline"))
+            pipeline_cluster = pipeline_response.spec.clusters
+            if pipeline_cluster:
+                for cluster in pipeline_cluster:
+                    if cluster.spark_conf:
+                        failures.extend(self.check_spark_conf(cluster.spark_conf, "pipeline cluster"))
+                    # Checking if cluster config is present in cluster policies
+                    if cluster.policy_id:
+                        failures.extend(self._check_cluster_policy(cluster.policy_id, "pipeline cluster"))
+                    if cluster.init_scripts:
+                        failures.extend(self._check_cluster_init_script(cluster.init_scripts, "pipeline cluster"))
 
             pipeline_info.failures = json.dumps(failures)
             if len(failures) > 0:
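
The two helpers called by the new branch, _check_cluster_policy and _check_cluster_init_script, appear to be existing methods shared with the cluster and job checks; this commit only wires them up for pipeline clusters. As a rough illustration only, a policy check along these lines could fetch the policy by ID and scan its definition for Azure service-principal keys. The key prefix, failure message, and exact logic below are assumptions for the sketch, not this repository's code:

    import json

    def _check_cluster_policy(self, policy_id: str, source: str) -> list[str]:
        # Sketch: look up the policy via the SDK's ClusterPoliciesAPI and flag
        # Azure SPN-style spark_conf entries in its definition. The failure
        # wording here is illustrative.
        failures: list[str] = []
        policy = self._ws.cluster_policies.get(policy_id)
        if policy and policy.definition:
            for key in json.loads(policy.definition):
                if key.startswith("spark_conf.spark.hadoop.fs.azure"):
                    failures.append(f"{source} uses an Azure service principal via policy {policy_id}")
        return failures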

tests/unit/assessment/pipelines/spec-with-spn.json

Lines changed: 36 additions & 1 deletion
@@ -1,6 +1,41 @@
 {
     "spec": {
-        "clusters": [],
+        "clusters": [{
+            "autoscale": {
+                "max_workers": 6,
+                "min_workers": 1
+            },
+            "custom_tags": {
+                "cluster_type": "default"
+            },
+            "label": "default",
+            "init_scripts": [
+                {
+                    "dbfs": {
+                        "destination": "dbfs:/users/[email protected]/init_scripts/test.sh"
+                    }
+                }
+            ],
+            "node_type_id": "Standard_F4s",
+            "num_workers": 1,
+            "policy_id": "single-user-with-spn",
+            "spark_conf": {
+                "spark.databricks.delta.preview.enabled": "true"
+            }
+        },
+        {
+            "autoscale": {
+                "max_workers": 6,
+                "min_workers": 1
+            },
+            "custom_tags": {
+                "cluster_type": "default"
+            },
+            "label": "default",
+            "init_scripts": [],
+            "node_type_id": "Standard_F4s",
+            "num_workers": 1
+        }],
         "configuration": {
             "spark.hadoop.fs.azure.account.oauth2.client.id.newstorageacct.dfs.core.windows.net": "pipeline_dummy_application_id",
             "spark.hadoop.fs.azure.account.oauth2.client.endpoint.newstorageacct.dfs.core.windows.net": "https://login.microsoftonline.com/directory_12345/oauth2/token",

tests/unit/assessment/test_pipelines.py

Lines changed: 24 additions & 27 deletions
@@ -6,53 +6,51 @@
 from databricks.labs.ucx.assessment.pipelines import PipelineInfo, PipelinesCrawler
 
 from ..framework.mocks import MockBackend
+from . import workspace_client_mock
 
 
-def test_pipeline_assessment_with_config(mocker):
+def test_pipeline_assessment_with_config():
     sample_pipelines = [
         PipelineStateInfo(
             cluster_id=None,
             creator_user_name="[email protected]",
             latest_updates=None,
             name="New DLT Pipeline",
-            pipeline_id="0112eae7-9d11-4b40-a2b8-6c83cb3c7407",
+            pipeline_id="spec-with-spn",
             run_as_user_name="[email protected]",
             state=PipelineState.IDLE,
         )
     ]
 
-    ws = Mock()
-    config_dict = {
-        "spark.hadoop.fs.azure.account.auth.type.abcde.dfs.core.windows.net": "SAS",
-        "spark.hadoop.fs.azure.sas.token.provider.type.abcde.dfs."
-        "core.windows.net": "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
-        "spark.hadoop.fs.azure.sas.fixed.token.abcde.dfs.core.windows.net": "{{secrets/abcde_access/sasFixedToken}}",
-    }
-    ws.pipelines.get().spec.configuration = config_dict
+    ws = workspace_client_mock(clusters="job-source-cluster.json")
+    ws.workspace.export().content = "JXNoCmVjaG8gIj0="
+    ws.dbfs.read().data = "JXNoCmVjaG8gIj0="
 
-    crawler = PipelinesCrawler(ws, MockBackend(), "ucx")._assess_pipelines(sample_pipelines)
+    ws.pipelines.list_pipelines.return_value = sample_pipelines
+    crawler = PipelinesCrawler(ws, MockBackend(), "ucx").snapshot()
     result_set = list(crawler)
 
     assert len(result_set) == 1
     assert result_set[0].success == 0
 
 
-def test_pipeline_assessment_without_config(mocker):
+def test_pipeline_assessment_without_config():
     sample_pipelines = [
         PipelineStateInfo(
             cluster_id=None,
             creator_user_name="[email protected]",
             latest_updates=None,
             name="New DLT Pipeline",
-            pipeline_id="0112eae7-9d11-4b40-a2b8-6c83cb3c7497",
+            pipeline_id="empty-spec",
             run_as_user_name="[email protected]",
             state=PipelineState.IDLE,
         )
     ]
-    ws = Mock()
-    config_dict = {}
-    ws.pipelines.get().spec.configuration = config_dict
-    crawler = PipelinesCrawler(ws, MockBackend(), "ucx")._assess_pipelines(sample_pipelines)
+    ws = workspace_client_mock(clusters="job-source-cluster.json")
+    ws.workspace.export().content = "JXNoCmVjaG8gIj0="
+    ws.dbfs.read().data = "JXNoCmVjaG8gIj0="
+    ws.pipelines.list_pipelines.return_value = sample_pipelines
+    crawler = PipelinesCrawler(ws, MockBackend(), "ucx").snapshot()
     result_set = list(crawler)
 
     assert len(result_set) == 1
@@ -69,7 +67,7 @@ def test_pipeline_snapshot_with_config():
             failures="",
         )
     ]
-    mock_ws = Mock()
+    mock_ws = workspace_client_mock(clusters="job-source-cluster.json")
     crawler = PipelinesCrawler(mock_ws, MockBackend(), "ucx")
     crawler._try_fetch = Mock(return_value=[])
     crawler._crawl = Mock(return_value=sample_pipelines)
@@ -85,16 +83,14 @@ def test_pipeline_list_with_no_config():
         PipelineInfo(
             creator_name="[email protected]",
             pipeline_name="New DLT Pipeline",
-            pipeline_id="0112eae7-9d11-4b40-a2b8-6c83cb3c7497",
+            pipeline_id="empty-spec",
             success=1,
             failures="",
         )
     ]
-    mock_ws = Mock()
+    mock_ws = workspace_client_mock(clusters="no-spark-conf.json")
     mock_ws.pipelines.list_pipelines.return_value = sample_pipelines
-    config_dict = {"spark.hadoop.fs.azure1.account.oauth2.client.id.abcde.dfs.core.windows.net": "wewewerty"}
-    mock_ws.pipelines.get().spec.configuration = config_dict
-    crawler = AzureServicePrincipalCrawler(mock_ws, MockBackend(), "ucx")._list_all_pipeline_with_spn_in_spark_conf()
+    crawler = AzureServicePrincipalCrawler(mock_ws, MockBackend(), "ucx").snapshot()
 
     assert len(crawler) == 0
 
@@ -106,22 +102,23 @@ def test_pipeline_without_owners_should_have_empty_creator_name():
             creator_user_name=None,
             latest_updates=None,
             name="New DLT Pipeline",
-            pipeline_id="0112eae7-9d11-4b40-a2b8-6c83cb3c7407",
+            pipeline_id="empty-spec",
             run_as_user_name="[email protected]",
             state=PipelineState.IDLE,
         )
     ]
 
-    ws = Mock()
+    ws = workspace_client_mock(clusters="no-spark-conf.json")
     ws.pipelines.list_pipelines.return_value = sample_pipelines
-    ws.pipelines.get().spec.configuration = {}
+    ws.workspace.export().content = "JXNoCmVjaG8gIj0="
+    ws.dbfs.read().data = "JXNoCmVjaG8gIj0="
     mockbackend = MockBackend()
     PipelinesCrawler(ws, mockbackend, "ucx").snapshot()
     result = mockbackend.rows_written_for("hive_metastore.ucx.pipelines", "append")
 
     assert result == [
         PipelineInfo(
-            pipeline_id="0112eae7-9d11-4b40-a2b8-6c83cb3c7407",
+            pipeline_id="empty-spec",
             pipeline_name="New DLT Pipeline",
             creator_name=None,
             success=1,
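
These tests replace hand-built Mock objects with a shared workspace_client_mock fixture loader, and the pipeline_id now selects the fixture ("spec-with-spn" resolves to the spec-with-spn.json file above, "empty-spec" to a pipeline with an empty spec). The base64 payload "JXNoCmVjaG8gIj0=" wired into workspace.export and dbfs.read decodes to a stub shell script (a %sh line followed by an echo), just enough for the init-script check to parse. Below is a guess at the shape of such a helper, assuming databricks-sdk from_dict loading; this sketch is not the repository's actual implementation:

    import json
    import pathlib
    from unittest.mock import create_autospec

    from databricks.sdk import WorkspaceClient
    from databricks.sdk.service.pipelines import GetPipelineResponse

    _FIXTURES = pathlib.Path(__file__).parent

    def _load_fixture(filename: str) -> dict:
        with (_FIXTURES / filename).open("r", encoding="utf-8") as f:
            return json.load(f)

    def workspace_client_mock(clusters: str = "no-spark-conf.json") -> WorkspaceClient:
        # Each test's pipeline_id doubles as a fixture file name, so
        # pipelines.get("spec-with-spn") replays pipelines/spec-with-spn.json.
        ws = create_autospec(WorkspaceClient)
        ws.pipelines.get.side_effect = lambda pipeline_id: GetPipelineResponse.from_dict(
            _load_fixture(f"pipelines/{pipeline_id}.json")
        )
        # The `clusters` argument would wire up a cluster fixture the same way
        # for the cluster-based crawlers (wiring omitted in this sketch).
        return ws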
