
Commit 6d30d12

Refactor code and clean out HTML markup (#252)
This cleanup was missed when merging PR #244.
Parent: 24e354b

5 files changed: +30 additions, -77 deletions
Lines changed: 0 additions & 3 deletions

```diff
@@ -1,3 +0,0 @@
-from databricks.labs.ucx.assessment.assessment import AssessmentToolkit
-
-__all__ = ["AssessmentToolkit"]
```

src/databricks/labs/ucx/assessment/assessment.py renamed to src/databricks/labs/ucx/assessment/crawlers.py

Lines changed: 8 additions & 40 deletions
```diff
@@ -1,14 +1,18 @@
 import json
-import typing
 from dataclasses import dataclass
 
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.service.jobs import BaseJob
 
 from databricks.labs.ucx.framework.crawlers import CrawlerBase
-from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocationCrawler
 from databricks.labs.ucx.hive_metastore.table_acls import SqlBackend
 
+INCOMPATIBLE_SPARK_CONFIG_KEYS = [
+    "spark.databricks.passthrough.enabled",
+    "spark.hadoop.javax.jdo.option.ConnectionURL",
+    "spark.databricks.hive.metastore.glueCatalog.enabled",
+]
+
 
 @dataclass
 class JobInfo:
@@ -47,42 +51,6 @@ def spark_version_compatibility(spark_version: str) -> str:
     return "supported"
 
 
-class AssessmentToolkit:
-    incompatible_spark_config_keys: typing.ClassVar[tuple] = {
-        "spark.databricks.passthrough.enabled",
-        "spark.hadoop.javax.jdo.option.ConnectionURL",
-        "spark.databricks.hive.metastore.glueCatalog.enabled",
-    }
-
-    def __init__(self, ws: WorkspaceClient, inventory_schema, backend=None):
-        self._all_jobs = None
-        self._all_clusters_by_id = None
-        self._ws = ws
-        self._inventory_schema = inventory_schema
-        self._backend = backend
-        self._external_locations = None
-
-    @staticmethod
-    def _verify_ws_client(w: WorkspaceClient):
-        _me = w.current_user.me()
-        is_workspace_admin = any(g.display == "admins" for g in _me.groups)
-        if not is_workspace_admin:
-            msg = "Current user is not a workspace admin"
-            raise RuntimeError(msg)
-
-    def generate_external_location_list(self):
-        crawler = ExternalLocationCrawler(self._ws, self._backend, self._inventory_schema)
-        return crawler.snapshot()
-
-    def generate_job_assessment(self):
-        crawler = JobsCrawler(self._ws, self._backend, self._inventory_schema)
-        return crawler.snapshot()
-
-    def generate_cluster_assessment(self):
-        crawler = ClustersCrawler(self._ws, self._backend, self._inventory_schema)
-        return crawler.snapshot()
-
-
 class ClustersCrawler(CrawlerBase):
     def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
         super().__init__(sbe, "hive_metastore", schema, "clusters")
@@ -101,7 +69,7 @@ def _assess_clusters(self, all_clusters):
                 failures.append(f"not supported DBR: {cluster.spark_version}")
 
             if cluster.spark_conf is not None:
-                for k in AssessmentToolkit.incompatible_spark_config_keys:
+                for k in INCOMPATIBLE_SPARK_CONFIG_KEYS:
                     if k in cluster.spark_conf:
                         failures.append(f"unsupported config: {k}")
 
@@ -162,7 +130,7 @@ def _assess_jobs(self, all_jobs: list[BaseJob], all_clusters_by_id) -> list[JobInfo]:
                 job_assessment[job.job_id].add(f"not supported DBR: {cluster_config.spark_version}")
 
             if cluster_config.spark_conf is not None:
-                for k in AssessmentToolkit.incompatible_spark_config_keys:
+                for k in INCOMPATIBLE_SPARK_CONFIG_KEYS:
                     if k in cluster_config.spark_conf:
                         job_assessment[job.job_id].add(f"unsupported config: {k}")
 
```
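Since the incompatible keys are now a module-level constant rather than a class attribute, the check is reusable without instantiating anything. Below is a minimal standalone sketch of that check; the sample `spark_conf` values are invented for illustration.

```python
# Standalone sketch of the check the crawlers perform above;
# the sample spark_conf dict is invented for illustration.
INCOMPATIBLE_SPARK_CONFIG_KEYS = [
    "spark.databricks.passthrough.enabled",
    "spark.hadoop.javax.jdo.option.ConnectionURL",
    "spark.databricks.hive.metastore.glueCatalog.enabled",
]

spark_conf = {
    "spark.databricks.passthrough.enabled": "true",
    "spark.sql.shuffle.partitions": "200",
}

# Mirrors the loop in _assess_clusters / _assess_jobs: any matching
# key is recorded as a failure for that cluster or job.
failures = [f"unsupported config: {k}" for k in INCOMPATIBLE_SPARK_CONFIG_KEYS if k in spark_conf]
print(failures)  # ['unsupported config: spark.databricks.passthrough.enabled']
```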

src/databricks/labs/ucx/runtime.py

Lines changed: 20 additions & 25 deletions
```diff
@@ -4,11 +4,12 @@
 
 from databricks.sdk import WorkspaceClient
 
-from databricks.labs.ucx.assessment import AssessmentToolkit
+from databricks.labs.ucx.assessment.crawlers import ClustersCrawler, JobsCrawler
 from databricks.labs.ucx.config import MigrationConfig
 from databricks.labs.ucx.framework.crawlers import RuntimeBackend
 from databricks.labs.ucx.framework.tasks import task, trigger
 from databricks.labs.ucx.hive_metastore import TaclToolkit
+from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocationCrawler
 from databricks.labs.ucx.hive_metastore.list_mounts import Mounts
 from databricks.labs.ucx.workspace_access import GroupMigrationToolkit
 
@@ -80,44 +81,38 @@ def inventorize_external_locations(cfg: MigrationConfig):
     These external_locations will be created in a later stage before the table can be migrated.
     """
     ws = WorkspaceClient(config=cfg.to_databricks_config())
-    assess = AssessmentToolkit(ws, cfg.inventory_database, RuntimeBackend())
-    assess.generate_external_location_list()
+    crawler = ExternalLocationCrawler(ws, RuntimeBackend(), cfg.inventory_database)
+    crawler.snapshot()
 
 
 @task("assessment", depends_on=[setup_schema])
 def inventorize_jobs(cfg: MigrationConfig):
     """This part scan through all the jobs and locate ones that are not compatible with UC.
-    It looks for:<br>
-    <ol>
-    <li>Clusters with DBR version earlier than 11.3<br>
-    <li>Clusters using Passthru Authentication<br>
-    <li>Clusters with incompatible spark config tags<br>
-    <li>Clusters referencing DBFS locations in one or more config options<br>
-    </ol>
-    <br>
-    A report with a list of all the Jobs is saved to the `$inventory.jobs` Table.
+    It looks for:
+      - Clusters with DBR version earlier than 11.3
+      - Clusters using Passthru Authentication
+      - Clusters with incompatible spark config tags
+      - Clusters referencing DBFS locations in one or more config options
+    A report with a list of all the Jobs is saved to the `$inventory.jobs` table.
     """
     ws = WorkspaceClient(config=cfg.to_databricks_config())
-    assess = AssessmentToolkit(ws, cfg.inventory_database, RuntimeBackend())
-    assess.generate_job_assessment()
+    crawler = JobsCrawler(ws, RuntimeBackend(), cfg.inventory_database)
+    crawler.snapshot()
 
 
 @task("assessment", depends_on=[setup_schema])
 def inventorize_clusters(cfg: MigrationConfig):
     """This part scan through all the clusters and locate ones that are not compatible with UC.
-    It looks for:<br>
-    <ol>
-    <li>Clusters with DBR version earlier than 11.3<br>
-    <li>Clusters using Passthru Authentication<br>
-    <li>Clusters with incompatible spark config tags<br>
-    <li>Clusters referencing DBFS locations in one or more config options<br>
-    </ol>
-    <br>
-    A report with a list of all the Jobs is saved to the `$inventory.clusters` Table.
+    It looks for:
+      - Clusters with DBR version earlier than 11.3
+      - Clusters using Passthru Authentication
+      - Clusters with incompatible spark config tags
+      - Clusters referencing DBFS locations in one or more config options
+    A report with a list of all the Jobs is saved to the `$inventory.clusters` table.
     """
     ws = WorkspaceClient(config=cfg.to_databricks_config())
-    assess = AssessmentToolkit(ws, cfg.inventory_database, RuntimeBackend())
-    assess.generate_cluster_assessment()
+    crawler = ClustersCrawler(ws, RuntimeBackend(), cfg.inventory_database)
+    crawler.snapshot()
 
 
 @task("assessment", depends_on=[setup_schema])
```
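With `AssessmentToolkit` removed, each task wires up its crawler directly and calls `snapshot()`. Below is a hedged sketch of that call pattern outside the `@task` machinery; the schema name `"my_inventory"` is hypothetical, and `RuntimeBackend` is only meaningful inside a Databricks job run, as in `runtime.py` above.

```python
# Sketch of the post-refactor call pattern: construct a crawler, snapshot it.
# "my_inventory" is a hypothetical inventory schema name.
from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.assessment.crawlers import ClustersCrawler, JobsCrawler
from databricks.labs.ucx.framework.crawlers import RuntimeBackend

ws = WorkspaceClient()  # assumes ambient Databricks auth, as inside a job run
backend = RuntimeBackend()  # the backend these tasks use; job-runtime only

ClustersCrawler(ws, backend, "my_inventory").snapshot()  # writes $inventory.clusters
JobsCrawler(ws, backend, "my_inventory").snapshot()      # writes $inventory.jobs
```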
Lines changed: 0 additions & 4 deletions

```diff
@@ -1,9 +1,5 @@
 import pytest
 
-from databricks.labs.ucx.assessment.assessment import AssessmentToolkit
-
 
 def test_table_inventory(ws, make_catalog, make_schema):
     pytest.skip("test is broken")
-    assess = AssessmentToolkit(ws, make_catalog(), make_schema())
-    assess.table_inventory()
```

tests/unit/assessment/test_assessment.py

Lines changed: 2 additions & 5 deletions
```diff
@@ -3,11 +3,8 @@
 from databricks.sdk.service.compute import AutoScale, ClusterDetails
 from databricks.sdk.service.jobs import BaseJob, JobSettings, NotebookTask, Task
 
-from databricks.labs.ucx.assessment.assessment import (
-    ClustersCrawler,
-    ExternalLocationCrawler,
-    JobsCrawler,
-)
+from databricks.labs.ucx.assessment.crawlers import ClustersCrawler, JobsCrawler
+from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocationCrawler
 from databricks.labs.ucx.hive_metastore.list_mounts import Mount
 from databricks.labs.ucx.mixins.sql import Row
 from tests.unit.framework.mocks import MockBackend
```
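After the move, the crawlers import from `databricks.labs.ucx.assessment.crawlers`, so a unit test only needs a mocked `WorkspaceClient` plus the repo's `MockBackend`. Below is a sketch under stated assumptions: the no-argument `MockBackend()` constructor and the empty-snapshot behavior are assumptions, not confirmed by this diff.

```python
# Hedged sketch: unit-testing ClustersCrawler from its new location.
# Assumes MockBackend() takes no required arguments and that snapshot()
# yields no rows for an empty workspace.
from unittest.mock import MagicMock

from databricks.labs.ucx.assessment.crawlers import ClustersCrawler
from tests.unit.framework.mocks import MockBackend


def test_clusters_crawler_empty_workspace():
    ws = MagicMock()
    ws.clusters.list.return_value = []  # no clusters to assess
    crawler = ClustersCrawler(ws, MockBackend(), "ucx")
    assert list(crawler.snapshot()) == []
```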
