 import json
+import re
+from collections.abc import Iterator
 from dataclasses import dataclass

 from databricks.sdk import WorkspaceClient
1314 "spark.databricks.hive.metastore.glueCatalog.enabled" ,
1415]
1516
+# Spark configuration keys that carry Azure service principal credentials.
+_AZURE_SP_CONF = [
+    "fs.azure.account.key",
+    "fs.azure.account.auth.type",
+    "fs.azure.account.oauth.provider.type",
+    "fs.azure.account.oauth2.client.id",
+    "fs.azure.account.oauth2.client.secret",
+    "fs.azure.account.oauth2.client.endpoint",
+]
+
+_AZURE_SP_CONF_FAILURE_MSG = "Uses azure service principal credentials config in"
+

 @dataclass
 class JobInfo:
@@ -32,6 +44,23 @@ class ClusterInfo:
     failures: str


+@dataclass
+class PipelineInfo:
+    pipeline_id: str
+    pipeline_name: str
+    creator_name: str
+    success: int  # 1 when no failures were found, 0 otherwise
+    failures: str  # JSON-encoded list of failure messages
+
+
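+# Example (illustrative values only): a failing pipeline would be persisted as
+#   PipelineInfo("1234-abcd", "ingest", "user@example.com", 0,
+#                '["Uses azure service principal credentials config in pipeline."]')
+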
+def _azure_sp_conf_present_check(config: dict) -> bool:
+    # Flag any config whose key matches one of the Azure SP credential keys.
+    for key in config.keys():
+        for conf in _AZURE_SP_CONF:
+            if re.search(conf, key):
+                return True
+    return False
+
+
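+# Example (illustrative sketch): matching is by substring, so per-account key
+# variants are flagged as well:
+#
+#   _azure_sp_conf_present_check({"fs.azure.account.key.myacct.dfs.core.windows.net": "..."})  # True
+#   _azure_sp_conf_present_check({"spark.executor.memory": "4g"})  # False
+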
 def spark_version_compatibility(spark_version: str) -> str:
     first_comp_custom_rt = 3
     first_comp_custom_x = 2
@@ -51,6 +80,37 @@ def spark_version_compatibility(spark_version: str) -> str:
     return "supported"


+class PipelinesCrawler(CrawlerBase):
+    def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
+        super().__init__(sbe, "hive_metastore", schema, "pipelines")
+        self._ws = ws
+
+    def _crawl(self) -> list[PipelineInfo]:
+        all_pipelines = list(self._ws.pipelines.list_pipelines())
+        return list(self._assess_pipelines(all_pipelines))
+
+    def _assess_pipelines(self, all_pipelines) -> Iterator[PipelineInfo]:
+        for pipeline in all_pipelines:
+            pipeline_info = PipelineInfo(pipeline.pipeline_id, pipeline.name, pipeline.creator_user_name, 1, "")
+            failures = []
+            pipeline_config = self._ws.pipelines.get(pipeline.pipeline_id).spec.configuration
+            if pipeline_config:
+                if _azure_sp_conf_present_check(pipeline_config):
+                    failures.append(f"{_AZURE_SP_CONF_FAILURE_MSG} pipeline.")
+
+            pipeline_info.failures = json.dumps(failures)
+            if len(failures) > 0:
+                pipeline_info.success = 0
+            yield pipeline_info
+
+    def snapshot(self) -> list[PipelineInfo]:
+        return self._snapshot(self._try_fetch, self._crawl)
+
+    def _try_fetch(self) -> Iterator[PipelineInfo]:
+        for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
+            yield PipelineInfo(*row)
+
+
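+# Example (illustrative, assuming `ws` is an authenticated WorkspaceClient and
+# `backend` is a SqlBackend implementation):
+#
+#   crawler = PipelinesCrawler(ws, backend, "ucx")
+#   for pipeline in crawler.snapshot():
+#       print(pipeline.pipeline_name, pipeline.success, pipeline.failures)
+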
 class ClustersCrawler(CrawlerBase):
     def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
         super().__init__(sbe, "hive_metastore", schema, "clusters")
@@ -78,6 +138,20 @@ def _assess_clusters(self, all_clusters):
                 for value in cluster.spark_conf.values():
                     if "dbfs:/mnt" in value or "/dbfs/mnt" in value:
                         failures.append(f"using DBFS mount in configuration: {value}")
+
+                # Check for Azure service principal credentials in the Spark config
+                if _azure_sp_conf_present_check(cluster.spark_conf):
+                    failures.append(f"{_AZURE_SP_CONF_FAILURE_MSG} cluster.")
+
+            # Check for Azure service principal credentials in the cluster policy
+            if cluster.policy_id:
+                policy = self._ws.cluster_policies.get(cluster.policy_id)
+                if _azure_sp_conf_present_check(json.loads(policy.definition)):
+                    failures.append(f"{_AZURE_SP_CONF_FAILURE_MSG} cluster.")
+                if policy.policy_family_definition_overrides:
+                    if _azure_sp_conf_present_check(json.loads(policy.policy_family_definition_overrides)):
+                        failures.append(f"{_AZURE_SP_CONF_FAILURE_MSG} cluster.")
+
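+            # Note (illustrative): a policy definition such as
+            #   {"spark_conf.fs.azure.account.oauth2.client.id": {"type": "fixed", "value": "..."}}
+            # is flagged by the policy check above, since its keys contain an
+            # _AZURE_SP_CONF entry.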
             cluster_info.failures = json.dumps(failures)
             if len(failures) > 0:
                 cluster_info.success = 0
@@ -139,6 +213,20 @@ def _assess_jobs(self, all_jobs: list[BaseJob], all_clusters_by_id) -> list[JobI
                 for value in cluster_config.spark_conf.values():
                     if "dbfs:/mnt" in value or "/dbfs/mnt" in value:
                         job_assessment[job.job_id].add(f"using DBFS mount in configuration: {value}")
+
+                # Check for Azure service principal credentials in the Spark config
+                if _azure_sp_conf_present_check(cluster_config.spark_conf):
+                    job_assessment[job.job_id].add(f"{_AZURE_SP_CONF_FAILURE_MSG} Job cluster.")
+
+            # Check for Azure service principal credentials in the cluster policy
+            if cluster_config.policy_id:
+                policy = self._ws.cluster_policies.get(cluster_config.policy_id)
+                if _azure_sp_conf_present_check(json.loads(policy.definition)):
+                    job_assessment[job.job_id].add(f"{_AZURE_SP_CONF_FAILURE_MSG} Job cluster.")
+                if policy.policy_family_definition_overrides:
+                    if _azure_sp_conf_present_check(json.loads(policy.policy_family_definition_overrides)):
+                        job_assessment[job.job_id].add(f"{_AZURE_SP_CONF_FAILURE_MSG} Job cluster.")
+
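+        # Example (illustrative): a job whose cluster policy embeds SP credentials
+        # is serialized by the loop below with
+        #   failures == '["Uses azure service principal credentials config in Job cluster."]'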
         for job_key in job_details.keys():
             job_details[job_key].failures = json.dumps(list(job_assessment[job_key]))
             if len(job_assessment[job_key]) > 0: