Commit 82bdd33

Added crawler for Azure Service principals used for direct storage access (#305)
Fixes #249
Parent: 2e8a880

File tree

2 files changed: +425 -4 lines changed

src/databricks/labs/ucx/assessment/crawlers.py

Lines changed: 88 additions & 0 deletions
@@ -1,4 +1,5 @@
 import json
+import re
 from dataclasses import dataclass

 from databricks.sdk import WorkspaceClient
@@ -13,6 +14,17 @@
     "spark.databricks.hive.metastore.glueCatalog.enabled",
 ]

+_AZURE_SP_CONF = [
+    "fs.azure.account.key",
+    "fs.azure.account.auth.type",
+    "fs.azure.account.oauth.provider.type",
+    "fs.azure.account.oauth2.client.id",
+    "fs.azure.account.oauth2.client.secret",
+    "fs.azure.account.oauth2.client.endpoint",
+]
+
+_AZURE_SP_CONF_FAILURE_MSG = "Uses azure service principal credentials config in "
+

 @dataclass
 class JobInfo:
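
These marker strings are the standard Spark configuration keys for direct ADLS Gen2 (abfss) access with a service principal. On real clusters the keys usually carry a per-storage-account suffix; a made-up example of what such a spark_conf can look like (the storage account, tenant id, and secret scope below are illustrative assumptions, not values from this commit):

# Illustrative only: "mystorage", the tenant id, and the secret scope are made up.
spark_conf = {
    "fs.azure.account.auth.type.mystorage.dfs.core.windows.net": "OAuth",
    "fs.azure.account.oauth.provider.type.mystorage.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id.mystorage.dfs.core.windows.net": "<application-id>",
    "fs.azure.account.oauth2.client.secret.mystorage.dfs.core.windows.net": "{{secrets/my_scope/my_sp_secret}}",
    "fs.azure.account.oauth2.client.endpoint.mystorage.dfs.core.windows.net": "https://login.microsoftonline.com/<tenant-id>/oauth2/token",
}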
@@ -32,6 +44,23 @@ class ClusterInfo:
     failures: str


+@dataclass
+class PipelineInfo:
+    pipeline_id: str
+    pipeline_name: str
+    creator_name: str
+    success: int
+    failures: str
+
+
+def _azure_sp_conf_present_check(config: dict) -> bool:
+    for key in config.keys():
+        for conf in _AZURE_SP_CONF:
+            if re.search(conf, key):
+                return True
+    return False
+
+
 def spark_version_compatibility(spark_version: str) -> str:
     first_comp_custom_rt = 3
     first_comp_custom_x = 2
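
Because the helper uses re.search rather than exact key equality, each marker matches anywhere inside a configuration key, which is what lets the suffixed per-account variants above get caught (the unescaped dots in the markers are regex wildcards, but they match literal dots too). A minimal runnable check, with made-up keys:

import re

_AZURE_SP_CONF = [
    "fs.azure.account.key",
    "fs.azure.account.auth.type",
    "fs.azure.account.oauth.provider.type",
    "fs.azure.account.oauth2.client.id",
    "fs.azure.account.oauth2.client.secret",
    "fs.azure.account.oauth2.client.endpoint",
]

def _azure_sp_conf_present_check(config: dict) -> bool:
    for key in config.keys():
        for conf in _AZURE_SP_CONF:
            if re.search(conf, key):
                return True
    return False

# A suffixed key for a made-up storage account still matches the marker.
assert _azure_sp_conf_present_check(
    {"fs.azure.account.oauth2.client.id.mystorage.dfs.core.windows.net": "x"}
)
# Unrelated Spark settings do not trip the check.
assert not _azure_sp_conf_present_check({"spark.databricks.io.cache.enabled": "true"})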
@@ -51,6 +80,37 @@ def spark_version_compatibility(spark_version: str) -> str:
     return "supported"


+class PipelinesCrawler(CrawlerBase):
+    def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
+        super().__init__(sbe, "hive_metastore", schema, "pipelines")
+        self._ws = ws
+
+    def _crawl(self) -> list[PipelineInfo]:
+        all_pipelines = list(self._ws.pipelines.list_pipelines())
+        return list(self._assess_pipelines(all_pipelines))
+
+    def _assess_pipelines(self, all_pipelines):
+        for pipeline in all_pipelines:
+            pipeline_info = PipelineInfo(pipeline.pipeline_id, pipeline.name, pipeline.creator_user_name, 1, "")
+            failures = []
+            pipeline_config = self._ws.pipelines.get(pipeline.pipeline_id).spec.configuration
+            if pipeline_config:
+                if _azure_sp_conf_present_check(pipeline_config):
+                    failures.append(f"{_AZURE_SP_CONF_FAILURE_MSG} pipeline.")
+
+            pipeline_info.failures = json.dumps(failures)
+            if len(failures) > 0:
+                pipeline_info.success = 0
+            yield pipeline_info
+
+    def snapshot(self) -> list[PipelineInfo]:
+        return self._snapshot(self._try_fetch, self._crawl)
+
+    def _try_fetch(self) -> list[PipelineInfo]:
+        for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
+            yield PipelineInfo(*row)
+
+
 class ClustersCrawler(CrawlerBase):
     def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
         super().__init__(sbe, "hive_metastore", schema, "clusters")
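
The yielded rows follow the crawler convention used elsewhere in this file: failures holds a JSON-encoded list of strings, and success flips from 1 to 0 once any failure is recorded. A self-contained sketch of a flagged row, with made-up pipeline values:

import json
from dataclasses import dataclass

@dataclass
class PipelineInfo:
    pipeline_id: str
    pipeline_name: str
    creator_name: str
    success: int
    failures: str

_AZURE_SP_CONF_FAILURE_MSG = "Uses azure service principal credentials config in "

# Made-up pipeline; mirrors what _assess_pipelines yields for a flagged pipeline.
pipeline_info = PipelineInfo("1234-5678", "my-dlt-pipeline", "user@example.com", 1, "")
failures = [f"{_AZURE_SP_CONF_FAILURE_MSG} pipeline."]
pipeline_info.failures = json.dumps(failures)
if len(failures) > 0:
    pipeline_info.success = 0
print(pipeline_info)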
@@ -78,6 +138,20 @@ def _assess_clusters(self, all_clusters):
                 for value in cluster.spark_conf.values():
                     if "dbfs:/mnt" in value or "/dbfs/mnt" in value:
                         failures.append(f"using DBFS mount in configuration: {value}")
+
+                # Checking if Azure cluster config is present in spark config
+                if _azure_sp_conf_present_check(cluster.spark_conf):
+                    failures.append(f"{_AZURE_SP_CONF_FAILURE_MSG} cluster.")
+
+            # Checking if Azure cluster config is present in cluster policies
+            if cluster.policy_id:
+                policy = self._ws.cluster_policies.get(cluster.policy_id)
+                if _azure_sp_conf_present_check(json.loads(policy.definition)):
+                    failures.append(f"{_AZURE_SP_CONF_FAILURE_MSG} cluster.")
+                if policy.policy_family_definition_overrides:
+                    if _azure_sp_conf_present_check(json.loads(policy.policy_family_definition_overrides)):
+                        failures.append(f"{_AZURE_SP_CONF_FAILURE_MSG} cluster.")
+
             cluster_info.failures = json.dumps(failures)
             if len(failures) > 0:
                 cluster_info.success = 0
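
The policy checks work because a cluster policy definition is a JSON document whose attribute paths embed the Spark configuration key (for example spark_conf.fs.azure.account.oauth2.client.id...), so the same substring search fires on policy keys as well. A runnable illustration with a made-up policy, using a compact equivalent of _azure_sp_conf_present_check:

import json
import re

_AZURE_SP_CONF = ["fs.azure.account.oauth2.client.id"]  # one marker suffices here

def _azure_sp_conf_present_check(config: dict) -> bool:
    return any(re.search(conf, key) for key in config for conf in _AZURE_SP_CONF)

# Made-up policy definition in the documented cluster-policy JSON shape.
policy_definition = json.dumps({
    "spark_conf.fs.azure.account.oauth2.client.id.mystorage.dfs.core.windows.net": {
        "type": "fixed",
        "value": "<application-id>",
    }
})
assert _azure_sp_conf_present_check(json.loads(policy_definition))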
@@ -139,6 +213,20 @@ def _assess_jobs(self, all_jobs: list[BaseJob], all_clusters_by_id) -> list[JobInfo]:
                 for value in cluster_config.spark_conf.values():
                     if "dbfs:/mnt" in value or "/dbfs/mnt" in value:
                         job_assessment[job.job_id].add(f"using DBFS mount in configuration: {value}")
+
+                # Checking if Azure cluster config is present in spark config
+                if _azure_sp_conf_present_check(cluster_config.spark_conf):
+                    job_assessment[job.job_id].add(f"{_AZURE_SP_CONF_FAILURE_MSG} Job cluster.")
+
+            # Checking if Azure cluster config is present in cluster policies
+            if cluster_config.policy_id:
+                policy = self._ws.cluster_policies.get(cluster_config.policy_id)
+                if _azure_sp_conf_present_check(json.loads(policy.definition)):
+                    job_assessment[job.job_id].add(f"{_AZURE_SP_CONF_FAILURE_MSG} Job cluster.")
+                if policy.policy_family_definition_overrides:
+                    if _azure_sp_conf_present_check(json.loads(policy.policy_family_definition_overrides)):
+                        job_assessment[job.job_id].add(f"{_AZURE_SP_CONF_FAILURE_MSG} Job cluster.")
+
         for job_key in job_details.keys():
             job_details[job_key].failures = json.dumps(list(job_assessment[job_key]))
             if len(job_assessment[job_key]) > 0:
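
One difference from the cluster path: clusters append findings to a list, while jobs record them in a per-job set (job_assessment[job.job_id].add(...)), so identical findings from several job clusters collapse into a single entry before json.dumps. A small runnable illustration:

import json

_AZURE_SP_CONF_FAILURE_MSG = "Uses azure service principal credentials config in "

job_assessment = {42: set()}
# Two job clusters with the same issue produce one recorded failure.
job_assessment[42].add(f"{_AZURE_SP_CONF_FAILURE_MSG} Job cluster.")
job_assessment[42].add(f"{_AZURE_SP_CONF_FAILURE_MSG} Job cluster.")
print(json.dumps(list(job_assessment[42])))  # a single entry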
