
Commit bc843c9

Make code more readable by enforcing max-nested-blocks = 3 with pylint (#1018)
No logic changes, just for readability and to spare code reviewer's sanity.
1 parent e442d63 commit bc843c9

File tree

11 files changed (+193, -168 lines)


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -598,7 +598,7 @@ default-docstring-type = "default"
 
 [tool.pylint.refactoring]
 # Maximum number of nested blocks for function / method body
-max-nested-blocks = 5
+max-nested-blocks = 3
 
 # Complete name of functions that never returns. When checking for inconsistent-
 # return-statements if a never returning function is called then it will be
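
With the limit lowered from 5 to 3, pylint's too-many-nested-blocks check (R1702) flags any function body nested deeper than three block levels. A minimal sketch of the shape that now fails, with purely illustrative names, not code from this repository:

def count_positive(rows):
    total = 0
    for row in rows:              # nested block 1
        if row is not None:       # nested block 2
            for cell in row:      # nested block 3
                if cell > 0:      # nested block 4 -> R1702 under max-nested-blocks = 3
                    total += 1
    return total

The diffs below all resolve this the same way: hoist the inner levels into a named helper so no single body nests past three blocks.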

src/databricks/labs/ucx/account.py

Lines changed: 27 additions & 31 deletions
@@ -131,41 +131,37 @@ def _get_valid_workspaces_groups(self, prompts: Prompts, workspace_ids: list[int
         for workspace in self._workspaces():
             if workspace.workspace_id not in workspace_ids:
                 continue
-            client = self.client_for(workspace)
-            logger.info(f"Crawling groups in workspace {client.config.host}")
+            self._load_workspace_groups(prompts, workspace, all_workspaces_groups)
 
-            ws_group_ids = client.groups.list(attributes="id")
-            for group_id in ws_group_ids:
-                full_workspace_group = self._safe_groups_get(client, group_id.id)
-                if not full_workspace_group:
-                    continue
-                group_name = full_workspace_group.display_name
+        return all_workspaces_groups
 
-                if self._is_group_out_of_scope(full_workspace_group):
+    def _load_workspace_groups(self, prompts, workspace, all_workspaces_groups):
+        client = self.client_for(workspace)
+        logger.info(f"Crawling groups in workspace {client.config.host}")
+        ws_group_ids = client.groups.list(attributes="id")
+        for group_id in ws_group_ids:
+            full_workspace_group = self._safe_groups_get(client, group_id.id)
+            if not full_workspace_group:
+                continue
+            group_name = full_workspace_group.display_name
+            if self._is_group_out_of_scope(full_workspace_group):
+                continue
+            if not group_name:
+                continue
+            if group_name in all_workspaces_groups:
+                if self._has_same_members(all_workspaces_groups[group_name], full_workspace_group):
+                    logger.info(f"Workspace group {group_name} already found, ignoring")
                     continue
-
-                if group_name in all_workspaces_groups:
-                    if self._has_same_members(all_workspaces_groups[group_name], full_workspace_group):
-                        logger.info(f"Workspace group {group_name} already found, ignoring")
-                        continue
-
-                    if prompts.confirm(
-                        f"Group {group_name} does not have the same amount of members "
-                        f"in workspace {client.config.host} than previous workspaces which contains the same group name,"
-                        f"it will be created at the account with name : {workspace.workspace_name}_{group_name}"
-                    ):
-                        all_workspaces_groups[f"{workspace.workspace_name}_{group_name}"] = full_workspace_group
-                        continue
-
-                if not group_name:
+                if prompts.confirm(
+                    f"Group {group_name} does not have the same amount of members "
+                    f"in workspace {client.config.host} than previous workspaces which contains the same group name,"
+                    f"it will be created at the account with name : {workspace.workspace_name}_{group_name}"
+                ):
+                    all_workspaces_groups[f"{workspace.workspace_name}_{group_name}"] = full_workspace_group
                     continue
-
-                logger.info(f"Found new group {group_name}")
-                all_workspaces_groups[group_name] = full_workspace_group
-
-        logger.info(f"Found a total of {len(all_workspaces_groups)} groups to migrate to the account")
-
-        return all_workspaces_groups
+            logger.info(f"Found new group {group_name}")
+            all_workspaces_groups[group_name] = full_workspace_group
+        logger.info(f"Found a total of {len(all_workspaces_groups)} groups to migrate to the account")
 
     def _is_group_out_of_scope(self, group: Group) -> bool:
         if group.display_name in {"users", "admins", "account users"}:
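
The pattern throughout this commit is the extract-method move shown here: the deeply nested loop body becomes a helper that mutates an accumulator passed in by the caller, and nested if/else arms become flat guard clauses. A stripped-down sketch of that shape, with hypothetical names rather than the ones in the diff:

def collect(items):
    counts: dict[str, int] = {}
    for item in items:
        _count_one(item, counts)  # caller stays at one level of nesting
    return counts

def _count_one(item, counts):
    if not item:
        return  # guard clause replaces a nested if/continue
    counts[item] = counts.get(item, 0) + 1

Passing the dict into the helper is what keeps the refactor behavior-preserving: the helper writes to the same object the original loop body wrote to.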

src/databricks/labs/ucx/assessment/jobs.py

Lines changed: 40 additions & 33 deletions
@@ -38,27 +38,34 @@ class JobInfo:
 
 
 class JobsMixin:
-    @staticmethod
-    def _get_cluster_configs_from_all_jobs(all_jobs, all_clusters_by_id):  # pylint: disable=too-complex
-        for j in all_jobs:
-            if j.settings is None:
+    @classmethod
+    def _get_cluster_configs_from_all_jobs(cls, all_jobs, all_clusters_by_id):
+        for job in all_jobs:
+            if job.settings is None:
                 continue
-            if j.settings.job_clusters is not None:
-                for job_cluster in j.settings.job_clusters:
-                    if job_cluster.new_cluster is None:
-                        continue
-                    yield j, job_cluster.new_cluster
-            if j.settings.tasks is None:
+            if job.settings.job_clusters is not None:
+                yield from cls._job_clusters(job)
+            if job.settings.tasks is None:
                 continue
-            for task in j.settings.tasks:
-                if task.existing_cluster_id is not None:
-                    interactive_cluster = all_clusters_by_id.get(task.existing_cluster_id, None)
-                    if interactive_cluster is None:
-                        continue
-                    yield j, interactive_cluster
+            yield from cls._task_clusters(job, all_clusters_by_id)
 
-                elif task.new_cluster is not None:
-                    yield j, task.new_cluster
+    @classmethod
+    def _task_clusters(cls, job, all_clusters_by_id):
+        for task in job.settings.tasks:
+            if task.existing_cluster_id is not None:
+                interactive_cluster = all_clusters_by_id.get(task.existing_cluster_id, None)
+                if interactive_cluster is None:
+                    continue
+                yield job, interactive_cluster
+            elif task.new_cluster is not None:
+                yield job, task.new_cluster
+
+    @staticmethod
+    def _job_clusters(job):
+        for job_cluster in job.settings.job_clusters:
+            if job_cluster.new_cluster is None:
+                continue
+            yield job, job_cluster.new_cluster
 
 
 class JobsCrawler(CrawlerBase[JobInfo], JobsMixin, CheckClusterMixin):

@@ -299,22 +306,10 @@ def _assess_job_runs(self, submit_runs: Iterable[BaseRun], all_clusters_by_id) -
         runs_per_hash: dict[str, list[int | None]] = {}
 
         for submit_run in submit_runs:
-            task_failures = []
+            task_failures: list[str] = []
             # v2.1+ API, with tasks
             if submit_run.tasks:
-                all_tasks: list[RunTask] = submit_run.tasks
-                for task in sorted(all_tasks, key=lambda x: x.task_key if x.task_key is not None else ""):
-                    _task_key = task.task_key if task.task_key is not None else ""
-                    _cluster_details = None
-                    if task.new_cluster:
-                        _cluster_details = ClusterDetails.from_dict(task.new_cluster.as_dict())
-                        if self._needs_compatibility_check(task.new_cluster):
-                            task_failures.append("no data security mode specified")
-                    if task.existing_cluster_id:
-                        _cluster_details = all_clusters_by_id.get(task.existing_cluster_id, None)
-                    if _cluster_details:
-                        task_failures.extend(self._check_cluster_failures(_cluster_details, _task_key))
-
+                self._check_run_task(submit_run.tasks, all_clusters_by_id, task_failures)
             # v2.0 API, without tasks
             elif submit_run.cluster_spec:
                 _cluster_details = ClusterDetails.from_dict(submit_run.cluster_spec.as_dict())

@@ -324,11 +319,23 @@ def _assess_job_runs(self, submit_runs: Iterable[BaseRun], all_clusters_by_id) -
                 runs_per_hash[hashed_id].append(submit_run.run_id)
             else:
                 runs_per_hash[hashed_id] = [submit_run.run_id]
-
             result[hashed_id] = SubmitRunInfo(
                 run_ids=json.dumps(runs_per_hash[hashed_id]),
                 hashed_id=hashed_id,
                 failures=json.dumps(list(set(task_failures))),
             )
 
         return list(result.values())
+
+    def _check_run_task(self, all_tasks: list[RunTask], clusters: dict[str, ClusterDetails], task_failures: list[str]):
+        for task in sorted(all_tasks, key=lambda x: x.task_key if x.task_key is not None else ""):
+            _task_key = task.task_key if task.task_key is not None else ""
+            cluster_details = None
+            if task.new_cluster:
+                cluster_details = ClusterDetails.from_dict(task.new_cluster.as_dict())
+                if self._needs_compatibility_check(task.new_cluster):
+                    task_failures.append("no data security mode specified")
+            if task.existing_cluster_id:
+                cluster_details = clusters.get(task.existing_cluster_id, None)
+            if cluster_details:
+                task_failures.extend(self._check_cluster_failures(cluster_details, _task_key))
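
For generators, the equivalent flattening move is yield from: the outer generator delegates a whole nested loop to a helper generator without changing what callers receive. A self-contained sketch of that delegation, using toy data and illustrative names:

def cluster_pairs(jobs):
    for job in jobs:
        if job.get("clusters") is None:
            continue
        yield from _job_clusters(job)  # delegation keeps the outer loop one level deep

def _job_clusters(job):
    for cluster in job["clusters"]:
        if cluster is None:
            continue
        yield job["name"], cluster

jobs = [{"name": "etl", "clusters": ["c1", None, "c2"]}]
print(list(cluster_pairs(jobs)))  # [('etl', 'c1'), ('etl', 'c2')]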

src/databricks/labs/ucx/assessment/pipelines.py

Lines changed: 13 additions & 11 deletions
@@ -50,22 +50,24 @@ def _assess_pipelines(self, all_pipelines) -> Iterable[PipelineInfo]:
             pipeline_config = pipeline_response.spec.configuration
             if pipeline_config:
                 failures.extend(self._check_spark_conf(pipeline_config, "pipeline"))
-            pipeline_cluster = pipeline_response.spec.clusters
-            if pipeline_cluster:
-                for cluster in pipeline_cluster:
-                    if cluster.spark_conf:
-                        failures.extend(self._check_spark_conf(cluster.spark_conf, "pipeline cluster"))
-                    # Checking if cluster config is present in cluster policies
-                    if cluster.policy_id:
-                        failures.extend(self._check_cluster_policy(cluster.policy_id, "pipeline cluster"))
-                    if cluster.init_scripts:
-                        failures.extend(self._check_cluster_init_script(cluster.init_scripts, "pipeline cluster"))
-
+            clusters = pipeline_response.spec.clusters
+            if clusters:
+                self._pipeline_clusters(clusters, failures)
             pipeline_info.failures = json.dumps(failures)
             if len(failures) > 0:
                 pipeline_info.success = 0
             yield pipeline_info
 
+    def _pipeline_clusters(self, clusters, failures):
+        for cluster in clusters:
+            if cluster.spark_conf:
+                failures.extend(self._check_spark_conf(cluster.spark_conf, "pipeline cluster"))
+            # Checking if cluster config is present in cluster policies
+            if cluster.policy_id:
+                failures.extend(self._check_cluster_policy(cluster.policy_id, "pipeline cluster"))
+            if cluster.init_scripts:
+                failures.extend(self._check_cluster_init_script(cluster.init_scripts, "pipeline cluster"))
+
     def snapshot(self) -> Iterable[PipelineInfo]:
         return self._snapshot(self._try_fetch, self._crawl)
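
Note the design choice: _pipeline_clusters appends to the caller's failures list rather than returning one, which is what makes it a drop-in extraction with no logic change. If the accumulator passing reads oddly, a sketch of the purer alternative shape, with a hypothetical helper that is not in this diff:

def _cluster_failures(clusters) -> list[str]:
    failures: list[str] = []
    for cluster in clusters:
        if not cluster.get("spark_conf"):
            failures.append("pipeline cluster has no spark conf")
    return failures

# the caller would then do: failures.extend(_cluster_failures(clusters))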

src/databricks/labs/ucx/framework/dashboards.py

Lines changed: 11 additions & 8 deletions
@@ -126,14 +126,17 @@ def validate(self):
             dashboard_folders = [f for f in step_folder.glob("*") if f.is_dir()]
             # Create separate dashboards per step, represented as second-level folders
             for dashboard_folder in dashboard_folders:
-                dashboard_ref = f"{step_folder.stem}_{dashboard_folder.stem}".lower()
-                for query in self._desired_queries(dashboard_folder, dashboard_ref):
-                    try:
-                        self._get_viz_options(query)
-                        self._get_widget_options(query)
-                    except Exception as err:
-                        msg = f"Error in {query.name}: {err}"
-                        raise AssertionError(msg) from err
+                self._validate_folder(dashboard_folder, step_folder)
+
+    def _validate_folder(self, dashboard_folder, step_folder):
+        dashboard_ref = f"{step_folder.stem}_{dashboard_folder.stem}".lower()
+        for query in self._desired_queries(dashboard_folder, dashboard_ref):
+            try:
+                self._get_viz_options(query)
+                self._get_widget_options(query)
+            except Exception as err:
+                msg = f"Error in {query.name}: {err}"
+                raise AssertionError(msg) from err
 
     def _install_widget(self, query: SimpleQuery, dashboard_ref: str):
         dashboard_id = self._state.dashboards[dashboard_ref]
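
Moving the try/except into _validate_folder keeps the exception chaining intact: raise ... from err preserves the original traceback as __cause__. A minimal sketch of that chaining pattern, with illustrative names:

import json

def _validate_one(name: str, payload: str) -> None:
    try:
        json.loads(payload)
    except Exception as err:
        msg = f"Error in {name}: {err}"
        raise AssertionError(msg) from err  # original error kept as __cause__

# _validate_one("widget", "{not json")  # AssertionError, chained to the JSONDecodeError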

src/databricks/labs/ucx/hive_metastore/locations.py

Lines changed: 39 additions & 32 deletions
@@ -7,6 +7,7 @@
 
 from databricks.labs.blueprint.installation import Installation
 from databricks.sdk import WorkspaceClient
+from databricks.sdk.service.catalog import ExternalLocationInfo
 
 from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend
 from databricks.labs.ucx.framework.utils import escape_sql_identifier

@@ -39,23 +40,27 @@ def _external_locations(self, tables: list[Row], mounts) -> Iterable[ExternalLoc
         external_locations: list[ExternalLocation] = []
         for table in tables:
             location = table.location
-            if location is not None and len(location) > 0:
-                if location.startswith("dbfs:/mnt"):
-                    for mount in mounts:
-                        if location[5:].startswith(mount.name.lower()):
-                            location = location[5:].replace(mount.name, mount.source)
-                            break
-                if (
-                    not location.startswith("dbfs")
-                    and (self._prefix_size[0] < location.find(":/") < self._prefix_size[1])
-                    and not location.startswith("jdbc")
-                ):
-                    self._dbfs_locations(external_locations, location, min_slash)
-                if location.startswith("jdbc"):
-                    self._add_jdbc_location(external_locations, location, table)
-
+            if not location:
+                continue
+            if location.startswith("dbfs:/mnt"):
+                location = self._resolve_mount(location, mounts)
+            if (
+                not location.startswith("dbfs")
+                and (self._prefix_size[0] < location.find(":/") < self._prefix_size[1])
+                and not location.startswith("jdbc")
+            ):
+                self._dbfs_locations(external_locations, location, min_slash)
+            if location.startswith("jdbc"):
+                self._add_jdbc_location(external_locations, location, table)
         return external_locations
 
+    def _resolve_mount(self, location, mounts):
+        for mount in mounts:
+            if location[5:].startswith(mount.name.lower()):
+                location = location[5:].replace(mount.name, mount.source)
+                break
+        return location
+
     @staticmethod
     def _dbfs_locations(external_locations, location, min_slash):
         dupe = False

@@ -161,31 +166,33 @@ def _get_ext_location_definitions(self, missing_locations: list[ExternalLocation
         return tf_script
 
     def match_table_external_locations(self) -> tuple[dict[str, int], list[ExternalLocation]]:
-        uc_external_locations = list(self._ws.external_locations.list())
+        existing_locations = list(self._ws.external_locations.list())
         table_locations = self.snapshot()
-        matching_locations = {}
+        matching_locations: dict[str, int] = {}
         missing_locations = []
         for table_loc in table_locations:
             # external_location.list returns url without trailing "/" but ExternalLocation.snapshot
             # does so removing the trailing slash before comparing
-            matched = False
-            for uc_loc in uc_external_locations:
-                if not uc_loc.url:
-                    continue
-                if not uc_loc.name:
-                    continue
-                uc_loc_path = uc_loc.url.lower()
-                if uc_loc_path in table_loc.location.rstrip("/").lower():
-                    if uc_loc.name not in matching_locations:
-                        matching_locations[uc_loc.name] = table_loc.table_count
-                    else:
-                        matching_locations[uc_loc.name] = matching_locations[uc_loc.name] + table_loc.table_count
-                    matched = True
-                    break
-            if not matched:
+            if not self._match_existing(table_loc, matching_locations, existing_locations):
                 missing_locations.append(table_loc)
         return matching_locations, missing_locations
 
+    @staticmethod
+    def _match_existing(table_loc, matching_locations: dict[str, int], existing_locations: list[ExternalLocationInfo]):
+        for uc_loc in existing_locations:
+            if not uc_loc.url:
+                continue
+            if not uc_loc.name:
+                continue
+            uc_loc_path = uc_loc.url.lower()
+            if uc_loc_path in table_loc.location.rstrip("/").lower():
+                if uc_loc.name not in matching_locations:
+                    matching_locations[uc_loc.name] = table_loc.table_count
+                else:
+                    matching_locations[uc_loc.name] = matching_locations[uc_loc.name] + table_loc.table_count
+                return True
+        return False
+
     def save_as_terraform_definitions_on_workspace(self, installation: Installation):
         matching_locations, missing_locations = self.match_table_external_locations()
         if len(matching_locations) > 0:
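
Besides reducing nesting, _match_existing eliminates the matched = False flag: returning True or False directly replaces the flag-plus-break bookkeeping. A compact sketch of that flag-elimination move, with hypothetical names and data:

def _matches_any(needle: str, haystacks: list[str]) -> bool:
    for hay in haystacks:
        if needle in hay:
            return True  # early return replaces `matched = True; break`
    return False

locations = ["s3://bucket/a", "s3://bucket/b"]
known = ["s3://bucket/a/table"]
missing = [loc for loc in locations if not _matches_any(loc, known)]
print(missing)  # ['s3://bucket/b']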
