
Commit 24e354b

FastLee, larsgeorge-db, and william-conti authored
Added crawlers for compatibility of jobs and clusters, along with basic recommendations for external locations (#244)
This PR includes the following assessment features:
- External Locations
- Jobs
- Clusters

---------

Co-authored-by: Lars George <[email protected]>
Co-authored-by: william-conti <[email protected]>
1 parent 06f104e commit 24e354b
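
A minimal usage sketch of the entry points this commit adds. The "ucx" inventory schema name and the sql_backend value are placeholders here; any SqlBackend implementation wired to the inventory database should work.

from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.assessment import AssessmentToolkit

ws = WorkspaceClient()  # assumes workspace auth is already configured
sql_backend = ...  # placeholder: any SqlBackend implementation

toolkit = AssessmentToolkit(ws, "ucx", backend=sql_backend)
clusters = toolkit.generate_cluster_assessment()        # list[ClusterInfo]
jobs = toolkit.generate_job_assessment()                # list[JobInfo]
locations = toolkit.generate_external_location_list()   # list[ExternalLocation]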

File tree

7 files changed: +423 -70 lines changed
src/databricks/labs/ucx/assessment/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from databricks.labs.ucx.assessment.assessment import AssessmentToolkit
+
+__all__ = ["AssessmentToolkit"]
src/databricks/labs/ucx/assessment/assessment.py

Lines changed: 169 additions & 61 deletions
@@ -1,56 +1,66 @@
-import logging
-import re
-from importlib import resources
+import json
+import typing
+from dataclasses import dataclass

 from databricks.sdk import WorkspaceClient
-from databricks.sdk.service.compute import Language
+from databricks.sdk.service.jobs import BaseJob

-from databricks.labs.ucx.assessment import commands
-from databricks.labs.ucx.mixins.compute import CommandExecutor
+from databricks.labs.ucx.framework.crawlers import CrawlerBase
+from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocationCrawler
+from databricks.labs.ucx.hive_metastore.table_acls import SqlBackend

-logger = logging.getLogger(__name__)
+
+@dataclass
+class JobInfo:
+    job_id: str
+    job_name: str
+    creator: str
+    success: int
+    failures: str
+
+
+@dataclass
+class ClusterInfo:
+    cluster_id: str
+    cluster_name: str
+    creator: str
+    success: int
+    failures: str
+
+
+def spark_version_compatibility(spark_version: str) -> str:
+    first_comp_custom_rt = 3
+    first_comp_custom_x = 2
+    dbr_version_components = spark_version.split("-")
+    first_components = dbr_version_components[0].split(".")
+    if len(first_components) != first_comp_custom_rt:
+        # custom runtime
+        return "unsupported"
+    if first_components[first_comp_custom_x] != "x":
+        # custom runtime
+        return "unsupported"
+    version = int(first_components[0]), int(first_components[1])
+    if version < (10, 0):
+        return "unsupported"
+    if (10, 0) <= version < (11, 3):
+        return "kinda works"
+    return "supported"


 class AssessmentToolkit:
-    def __init__(self, ws: WorkspaceClient, cluster_id, inventory_catalog, inventory_schema, warehouse_id=None):
+    incompatible_spark_config_keys: typing.ClassVar[tuple] = {
+        "spark.databricks.passthrough.enabled",
+        "spark.hadoop.javax.jdo.option.ConnectionURL",
+        "spark.databricks.hive.metastore.glueCatalog.enabled",
+    }
+
+    def __init__(self, ws: WorkspaceClient, inventory_schema, backend=None):
+        self._all_jobs = None
+        self._all_clusters_by_id = None
         self._ws = ws
-        self._inventory_catalog = inventory_catalog
         self._inventory_schema = inventory_schema
-        self._warehouse_id = warehouse_id
-        self._cluster_id = cluster_id
-        self._command_executor = None
-        self._managed_executor = False
-
-    def _get_command_executor(self, executor: CommandExecutor | None = None, language=None):
-        ce = executor
-        if ce is None:
-            if language:
-                ce = CommandExecutor(self._ws, language=language, cluster_id=self._cluster_id)
-            else:
-                ce = CommandExecutor(self._ws, cluster_id=self._cluster_id)
-            self._managed_executor = True
-        self._command_executor = ce
-        return ce
-
-    def _remove_command_executor(self):
-        if self._managed_executor:
-            self._command_executor = None
-            self._managed_executor = False
-
-    @staticmethod
-    def _load_command_code(name):
-        cmd_file = resources.files(commands) / name
-        with cmd_file.open("rt") as f:
-            cmd_code = f.read()
-        return cmd_code
-
-    def _get_command(self, name, params: dict | None = None):
-        cmd_code = self._load_command_code(name)
-        if params:
-            for pattern, replace in params.items():
-                p = re.compile(pattern)
-                cmd_code = p.sub(replace, cmd_code)
-        return cmd_code
+        self._backend = backend
+        self._external_locations = None

     @staticmethod
     def _verify_ws_client(w: WorkspaceClient):
@@ -60,20 +70,118 @@ def _verify_ws_client(w: WorkspaceClient):
             msg = "Current user is not a workspace admin"
             raise RuntimeError(msg)

-    def table_inventory(self, executor: CommandExecutor | None = None):
-        logger.info("Started dataset inventorization...")
-        ce = self._get_command_executor(executor, language=Language.SCALA)
-        params = {"SCHEMA": self._inventory_schema}
-        cmd_code = self._get_command("create_table_inventory.scala", params=params)
-        command_output = ce.run(cmd_code)
-        logger.debug(command_output)
-        if executor is None:
-            self._remove_command_executor()
-        logger.info("Completed dataset inventorization...")
-
-    def compile_report(self):
-        logger.info("Started report compilation...")
-        ce = self._get_command_executor(None, language=Language.SCALA)
-        self.table_inventory(ce)
-        self._remove_command_executor()
-        logger.info("Completed report compilation...")
+    def generate_external_location_list(self):
+        crawler = ExternalLocationCrawler(self._ws, self._backend, self._inventory_schema)
+        return crawler.snapshot()
+
+    def generate_job_assessment(self):
+        crawler = JobsCrawler(self._ws, self._backend, self._inventory_schema)
+        return crawler.snapshot()
+
+    def generate_cluster_assessment(self):
+        crawler = ClustersCrawler(self._ws, self._backend, self._inventory_schema)
+        return crawler.snapshot()
+
+
+class ClustersCrawler(CrawlerBase):
+    def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
+        super().__init__(sbe, "hive_metastore", schema, "clusters")
+        self._ws = ws
+
+    def _crawl(self) -> list[ClusterInfo]:
+        all_clusters = list(self._ws.clusters.list())
+        return list(self._assess_clusters(all_clusters))
+
+    def _assess_clusters(self, all_clusters):
+        for cluster in all_clusters:
+            cluster_info = ClusterInfo(cluster.cluster_id, cluster.cluster_name, cluster.creator_user_name, 1, "")
+            support_status = spark_version_compatibility(cluster.spark_version)
+            failures = []
+            if support_status != "supported":
+                failures.append(f"not supported DBR: {cluster.spark_version}")
+
+            if cluster.spark_conf is not None:
+                for k in AssessmentToolkit.incompatible_spark_config_keys:
+                    if k in cluster.spark_conf:
+                        failures.append(f"unsupported config: {k}")
+
+                for value in cluster.spark_conf.values():
+                    if "dbfs:/mnt" in value or "/dbfs/mnt" in value:
+                        failures.append(f"using DBFS mount in configuration: {value}")
+            cluster_info.failures = json.dumps(failures)
+            if len(failures) > 0:
+                cluster_info.success = 0
+            yield cluster_info
+
+    def snapshot(self) -> list[ClusterInfo]:
+        return self._snapshot(self._try_fetch, self._crawl)
+
+    def _try_fetch(self) -> list[ClusterInfo]:
+        for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
+            yield ClusterInfo(*row)
+
+
+class JobsCrawler(CrawlerBase):
+    def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
+        super().__init__(sbe, "hive_metastore", schema, "jobs")
+        self._ws = ws
+
+    def _get_cluster_configs_from_all_jobs(self, all_jobs, all_clusters_by_id):
+        for j in all_jobs:
+            if j.settings.job_clusters is not None:
+                for jc in j.settings.job_clusters:
+                    if jc.new_cluster is None:
+                        continue
+                    yield j, jc.new_cluster
+
+            for t in j.settings.tasks:
+                if t.existing_cluster_id is not None:
+                    interactive_cluster = all_clusters_by_id.get(t.existing_cluster_id, None)
+                    if interactive_cluster is None:
+                        continue
+                    yield j, interactive_cluster
+
+                elif t.new_cluster is not None:
+                    yield j, t.new_cluster
+
+    def _crawl(self) -> list[JobInfo]:
+        all_jobs = list(self._ws.jobs.list(expand_tasks=True))
+        all_clusters = {c.cluster_id: c for c in self._ws.clusters.list()}
+        return self._assess_jobs(all_jobs, all_clusters)
+
+    def _assess_jobs(self, all_jobs: list[BaseJob], all_clusters_by_id) -> list[JobInfo]:
+        job_assessment = {}
+        job_details = {}
+        for job in all_jobs:
+            job_assessment[job.job_id] = set()
+            job_details[job.job_id] = JobInfo(str(job.job_id), job.settings.name, job.creator_user_name, 1, "")
+
+        for job, cluster_config in self._get_cluster_configs_from_all_jobs(all_jobs, all_clusters_by_id):
+            support_status = spark_version_compatibility(cluster_config.spark_version)
+            if support_status != "supported":
+                job_assessment[job.job_id].add(f"not supported DBR: {cluster_config.spark_version}")
+
+            if cluster_config.spark_conf is not None:
+                for k in AssessmentToolkit.incompatible_spark_config_keys:
+                    if k in cluster_config.spark_conf:
+                        job_assessment[job.job_id].add(f"unsupported config: {k}")
+
+                for value in cluster_config.spark_conf.values():
+                    if "dbfs:/mnt" in value or "/dbfs/mnt" in value:
+                        job_assessment[job.job_id].add(f"using DBFS mount in configuration: {value}")
+        for job_key in job_details.keys():
+            job_details[job_key].failures = json.dumps(list(job_assessment[job_key]))
+            if len(job_assessment[job_key]) > 0:
+                job_details[job_key].success = 0
+        return list(job_details.values())
+
+    def snapshot(self) -> list[ClusterInfo]:
+        return self._snapshot(self._try_fetch, self._crawl)
+
+    def _try_fetch(self) -> list[ClusterInfo]:
+        for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
+            yield JobInfo(*row)
+
+
+if __name__ == "__main__":
+    print("Databricks UC Assessment")

src/databricks/labs/ucx/framework/crawlers.py

Lines changed: 3 additions & 0 deletions
@@ -86,7 +86,10 @@ def _row_to_sql(row, fields):
             elif f.type == bool:
                 data.append("TRUE" if value else "FALSE")
             elif f.type == str:
+                value = value.replace("'", "''")
                 data.append(f"'{value}'")
+            elif f.type == int:
+                data.append(f"{value}")
             else:
                 msg = f"unknown type: {f.type}"
                 raise ValueError(msg)
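
The escaping and int branches matter because crawler rows now carry string fields (such as the JSON-encoded failures) that can contain single quotes. A standalone sketch of the same serialization rules, using a hypothetical helper rather than the module's actual function:

def sql_literal(value) -> str:
    # bool is checked before int because bool is a subclass of int
    if isinstance(value, bool):
        return "TRUE" if value else "FALSE"
    if isinstance(value, str):
        return "'" + value.replace("'", "''") + "'"  # double up embedded quotes
    if isinstance(value, int):
        return str(value)
    msg = f"unknown type: {type(value)}"
    raise ValueError(msg)

print(sql_literal("O'Brien"))  # 'O''Brien'
print(sql_literal(1))          # 1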
src/databricks/labs/ucx/hive_metastore/data_objects.py

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+import os
+import typing
+from dataclasses import dataclass
+
+from databricks.sdk import WorkspaceClient
+
+from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend
+from databricks.labs.ucx.hive_metastore.list_mounts import Mounts
+
+
+@dataclass
+class ExternalLocation:
+    location: str
+
+
+class ExternalLocationCrawler(CrawlerBase):
+    _prefix_size: typing.ClassVar[list[int]] = [1, 12]
+
+    def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
+        super().__init__(sbe, "hive_metastore", schema, "external_locations")
+        self._ws = ws
+
+    def _external_locations(self, tables, mounts):
+        min_slash = 2
+        external_locations = []
+        for table in tables:
+            location = table.as_dict()["location"]
+            if location is not None and len(location) > 0:
+                if location.startswith("dbfs:/mnt"):
+                    for mount in mounts:
+                        if location[5:].startswith(mount.name):
+                            location = location[5:].replace(mount.name, mount.source)
+                            break
+                if not location.startswith("dbfs") and (
+                    self._prefix_size[0] < location.find(":/") < self._prefix_size[1]
+                ):
+                    dupe = False
+                    loc = 0
+                    while loc < len(external_locations) and not dupe:
+                        common = (
+                            os.path.commonpath(
+                                [external_locations[loc].location, os.path.dirname(location) + "/"]
+                            ).replace(":/", "://")
+                            + "/"
+                        )
+                        if common.count("/") > min_slash:
+                            external_locations[loc] = ExternalLocation(common)
+                            dupe = True
+                        loc += 1
+                    if not dupe:
+                        external_locations.append(ExternalLocation(os.path.dirname(location) + "/"))
+        return external_locations
+
+    def _external_location_list(self):
+        tables = self._backend.fetch(f"SELECT location FROM {self._schema}.tables WHERE location IS NOT NULL")
+        mounts = Mounts(self._backend, self._ws, self._schema).snapshot()
+        return self._external_locations(list(tables), list(mounts))
+
+    def snapshot(self) -> list[ExternalLocation]:
+        return self._snapshot(self._try_fetch, self._external_location_list)
+
+    def _try_fetch(self) -> list[ExternalLocation]:
+        for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
+            yield ExternalLocation(*row)
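
To see the prefix-folding rule from _external_locations in isolation: sibling table locations are reduced to their common parent via os.path.commonpath, and the shared prefix is kept only while it still has more than two slashes. Bucket and path names below are made up:

import os

a = "s3://bucket/datasets/sales"  # two tables under the same prefix
b = "s3://bucket/datasets/hr"

common = (
    os.path.commonpath([os.path.dirname(a) + "/", os.path.dirname(b) + "/"]).replace(":/", "://") + "/"
)
print(common)                  # s3://bucket/datasets/
print(common.count("/") > 2)   # True, so the two locations fold into a single entry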

src/databricks/labs/ucx/hive_metastore/list_mounts.py

Lines changed: 7 additions & 0 deletions
@@ -27,3 +27,10 @@ def _list_mounts(self):
         for mount_point, source, _ in self._dbutils.fs.mounts():
             mounts.append(Mount(mount_point, source))
         return mounts
+
+    def snapshot(self) -> list[Mount]:
+        return self._snapshot(self._try_fetch, self._list_mounts)
+
+    def _try_fetch(self) -> list[Mount]:
+        for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
+            yield Mount(*row)
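
Mounts now follows the same fetch-or-crawl snapshot pattern as the other crawlers: the snapshot appears to serve previously persisted rows via _try_fetch and fall back to _list_mounts on the first run. A consumption sketch, with the backend and the "ucx" schema as placeholders:

from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.hive_metastore.list_mounts import Mounts

ws = WorkspaceClient()  # assumes workspace auth is already configured
sql_backend = ...  # placeholder: any SqlBackend implementation

for mount in Mounts(sql_backend, ws, "ucx").snapshot():
    print(mount.name, "->", mount.source)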

0 commit comments