Commit e824c18
Create $inventory.tables from Scala notebook (#207)
This PR fetches table metadata without accessing storage, which simplifies configuration: Azure storage credentials no longer need to be present in the cluster config. Scanning through this notebook is also faster. Fixes #205.

Co-authored-by: Lars George <[email protected]>
1 parent 980894d commit e824c18
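
For illustration only (not part of this commit): once the assessment job has run, the inventory written by the Scala notebook can be inspected from any Python notebook in the workspace. The database name "ucx" below is the widget default and may differ per installation; spark is the session provided by the Databricks runtime.

# Hypothetical usage sketch: inspect the inventory created by the notebook.
inventory_database = "ucx"  # widget default; adjust to the configured inventory database

tables = spark.table(f"{inventory_database}.tables")
tables.printSchema()  # catalog, database, name, object_type, table_format, location, view_text
tables.groupBy("object_type").count().show()  # e.g. MANAGED vs EXTERNAL vs VIEW counts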

File tree

4 files changed: +94 additions, -29 deletions

src/databricks/labs/ucx/framework/tasks.py
src/databricks/labs/ucx/hive_metastore/tables.scala
src/databricks/labs/ucx/install.py
src/databricks/labs/ucx/runtime.py

src/databricks/labs/ucx/framework/tasks.py

Lines changed: 9 additions & 2 deletions
@@ -18,9 +18,10 @@ class Task:
     fn: Callable[[MigrationConfig], None]
     depends_on: list[str] = None
     job_cluster: str = "main"
+    notebook: str = None


-def task(workflow, *, depends_on=None, job_cluster="main"):
+def task(workflow, *, depends_on=None, job_cluster="main", notebook: str | None = None):
     def decorator(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
@@ -52,7 +53,13 @@ def wrapper(*args, **kwargs):
             raise SyntaxError(msg)

         _TASKS[func.__name__] = Task(
-            workflow=workflow, name=func.__name__, doc=func.__doc__, fn=func, depends_on=deps, job_cluster=job_cluster
+            workflow=workflow,
+            name=func.__name__,
+            doc=func.__doc__,
+            fn=func,
+            depends_on=deps,
+            job_cluster=job_cluster,
+            notebook=notebook,
         )

         return wrapper
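
For illustration (a sketch based on this diff and the runtime.py change below, not additional code in the commit): a task registered through the decorator can now point at a notebook, and the installer schedules it as a notebook task instead of a wheel task. The import paths follow the file layout in this PR; the decorator may apply extra validation not shown here.

# Hypothetical sketch of a notebook-backed task declaration.
from databricks.labs.ucx.config import MigrationConfig
from databricks.labs.ucx.framework.tasks import task


@task("assessment", notebook="hive_metastore/tables.scala")
def crawl_tables(_: MigrationConfig):
    """Scan the Hive Metastore via the Scala notebook; the Python body stays empty."""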
src/databricks/labs/ucx/hive_metastore/tables.scala

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType}
+import org.apache.spark.sql.DataFrame
+
+// must follow the same structure as databricks.labs.ucx.hive_metastore.tables.Table
+case class TableDetails(catalog: String, database: String, name: String, object_type: String,
+                        table_format: String, location: String, view_text: String)
+
+def metadataForAllTables(databases: Seq[String]): DataFrame = {
+  import spark.implicits._
+
+  val externalCatalog = spark.sharedState.externalCatalog
+  databases.par.flatMap(databaseName => {
+    val tables = externalCatalog.listTables(databaseName)
+    if (tables == null) {
+      println(s"[WARN][${databaseName}] listTables returned null")
+      Seq()
+    } else {
+      tables.par.map(tableName => try {
+        val table = externalCatalog.getTable(databaseName, tableName)
+        if (table == null) {
+          println(s"[WARN][${databaseName}.${tableName}] result is null")
+          None
+        } else {
+          Some(TableDetails("hive_metastore", databaseName, tableName, table.tableType.name, table.provider.orNull,
+            table.storage.locationUri.map(_.toString).orNull, table.viewText.orNull))
+        }
+      } catch {
+        case err: Throwable =>
+          println(s"[ERROR][${databaseName}.${tableName}] ignoring table because of ${err}")
+          None
+      }).toList.collect {
+        case Some(x) => x
+      }
+    }
+  }).toList.toDF
+}
+
+dbutils.widgets.text("inventory_database", "ucx")
+val inventoryDatabase = dbutils.widgets.get("inventory_database")
+
+val df = metadataForAllTables(spark.sharedState.externalCatalog.listDatabases())
+df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(s"$inventoryDatabase.tables")
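
As a hedged sanity check (not part of this commit), the catalog-only approach can be cross-checked from PySpark without any storage credentials; "ucx" is the widget default, and spark comes from the Databricks runtime.

# Hypothetical check: compare per-database counts in the inventory with spark.catalog.
inventory_database = "ucx"  # widget default; adjust to your installation

scanned = {
    row["database"]: row["count"]
    for row in spark.table(f"{inventory_database}.tables").groupBy("database").count().collect()
}
for db in spark.catalog.listDatabases():
    expected = len(spark.catalog.listTables(db.name))
    print(f"{db.name}: scanned={scanned.get(db.name, 0)} catalog={expected}")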

src/databricks/labs/ucx/install.py

Lines changed: 41 additions & 20 deletions
@@ -16,7 +16,7 @@

 from databricks.labs.ucx.__about__ import __version__
 from databricks.labs.ucx.config import GroupsConfig, MigrationConfig, TaclConfig
-from databricks.labs.ucx.framework.tasks import _TASKS
+from databricks.labs.ucx.framework.tasks import _TASKS, Task
 from databricks.labs.ucx.runtime import main

 TAG_STEP = "step"
@@ -65,6 +65,7 @@ def __init__(self, ws: WorkspaceClient, *, prefix: str = "ucx", promtps: bool =
         self._ws = ws
         self._prefix = prefix
         self._prompts = promtps
+        self._this_file = Path(__file__)

     def run(self):
         self._configure()
@@ -230,8 +231,7 @@ def _upload_wheel(self) -> str:
             self._ws.workspace.upload(remote_wheel, f, overwrite=True, format=ImportFormat.AUTO)
         return remote_wheel

-    def _job_settings(self, step_name, dbfs_path):
-        config_file = f"/Workspace/{self._install_folder}/config.yml"
+    def _job_settings(self, step_name: str, dbfs_path: str):
         email_notifications = None
         if "@" in self._my_username:
             email_notifications = jobs.JobEmailNotifications(
@@ -243,22 +243,44 @@ def _job_settings(self, step_name, dbfs_path):
             "tags": {TAG_APP: self._prefix, TAG_STEP: step_name},
             "job_clusters": self._job_clusters({t.job_cluster for t in tasks}),
             "email_notifications": email_notifications,
-            "tasks": [
-                jobs.Task(
-                    task_key=task.name,
-                    job_cluster_key=task.job_cluster,
-                    depends_on=[jobs.TaskDependency(task_key=d) for d in _TASKS[task.name].depends_on],
-                    libraries=[compute.Library(whl=f"dbfs:{dbfs_path}")],
-                    python_wheel_task=jobs.PythonWheelTask(
-                        package_name="databricks_labs_ucx",
-                        entry_point="runtime",  # [project.entry-points.databricks] in pyproject.toml
-                        named_parameters={"task": task.name, "config": config_file},
-                    ),
-                )
-                for task in tasks
-            ],
+            "tasks": [self._job_task(task, dbfs_path) for task in tasks],
         }

+    def _job_task(self, task: Task, dbfs_path: str) -> jobs.Task:
+        jobs_task = jobs.Task(
+            task_key=task.name,
+            job_cluster_key=task.job_cluster,
+            depends_on=[jobs.TaskDependency(task_key=d) for d in _TASKS[task.name].depends_on],
+        )
+        if task.notebook:
+            return self._job_notebook_task(jobs_task, task)
+        return self._job_wheel_task(jobs_task, task, dbfs_path)
+
+    def _job_notebook_task(self, jobs_task: jobs.Task, task: Task) -> jobs.Task:
+        local_notebook = self._this_file.parent / task.notebook
+        remote_notebook = f"{self._install_folder}/{local_notebook.name}"
+        with local_notebook.open("rb") as f:
+            self._ws.workspace.upload(remote_notebook, f)
+        return replace(
+            jobs_task,
+            notebook_task=jobs.NotebookTask(
+                notebook_path=remote_notebook,
+                # ES-872211: currently, we cannot read WSFS files from Scala context
+                base_parameters={"inventory_database": self._current_config.inventory_database},
+            ),
+        )
+
+    def _job_wheel_task(self, jobs_task: jobs.Task, task: Task, dbfs_path: str) -> jobs.Task:
+        return replace(
+            jobs_task,
+            libraries=[compute.Library(whl=f"dbfs:{dbfs_path}")],
+            python_wheel_task=jobs.PythonWheelTask(
+                package_name="databricks_labs_ucx",
+                entry_point="runtime",  # [project.entry-points.databricks] in pyproject.toml
+                named_parameters={"task": task.name, "config": self._config_file},
+            ),
+        )
+
     def _job_clusters(self, names: set[str]):
         clusters = []
         spec = self._cluster_node_type(
@@ -292,16 +314,15 @@ def _job_clusters(self, names: set[str]):
         )
         return clusters

-    @staticmethod
-    def _build_wheel(tmp_dir: str, *, verbose: bool = False):
+    def _build_wheel(self, tmp_dir: str, *, verbose: bool = False):
         """Helper to build the wheel package"""
         streams = {}
         if not verbose:
             streams = {
                 "stdout": subprocess.DEVNULL,
                 "stderr": subprocess.DEVNULL,
             }
-        project_root = Installer._find_project_root(Path(__file__))
+        project_root = Installer._find_project_root(self._this_file)
         if not project_root:
             msg = "Cannot find project root"
             raise NotADirectoryError(msg)
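
The _job_notebook_task/_job_wheel_task split leans on dataclasses.replace: the shared jobs.Task skeleton is copied and only the relevant payload is filled in. Below is a minimal, self-contained sketch of that copy-with-changes pattern, using hypothetical types rather than the SDK's:

# Stdlib-only illustration of the replace() pattern used above.
from dataclasses import dataclass, replace
from typing import Optional


@dataclass
class NotebookTask:
    notebook_path: str


@dataclass
class JobTask:
    task_key: str
    job_cluster_key: str = "main"
    notebook_task: Optional[NotebookTask] = None


base = JobTask(task_key="crawl_tables")
# replace() returns a new instance; the original skeleton stays untouched.
with_notebook = replace(base, notebook_task=NotebookTask("/Users/me/tables.scala"))
print(base.notebook_task)                         # None
print(with_notebook.notebook_task.notebook_path)  # /Users/me/tables.scala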

src/databricks/labs/ucx/runtime.py

Lines changed: 2 additions & 7 deletions
@@ -20,20 +20,15 @@ def setup_schema(cfg: MigrationConfig):
     backend.execute(f"CREATE SCHEMA IF NOT EXISTS hive_metastore.{cfg.inventory_database}")


-@task("assessment", depends_on=[setup_schema])
-def crawl_tables(cfg: MigrationConfig):
+@task("assessment", depends_on=[setup_schema], notebook="hive_metastore/tables.scala")
+def crawl_tables(_: MigrationConfig):
     """During this operation, a systematic scan is conducted, encompassing every table within the Hive Metastore.
     This scan extracts essential details associated with each table, including its unique identifier or name, table
     format, storage location details.

     The extracted metadata is subsequently organized and cataloged within a dedicated storage entity known as
     the `$inventory.tables` table. This table functions as a comprehensive inventory, providing a structured and
     easily accessible reference point for users, data engineers, and administrators."""
-    ws = WorkspaceClient(config=cfg.to_databricks_config())
-    tacls = TaclToolkit(
-        ws, inventory_catalog="hive_metastore", inventory_schema=cfg.inventory_database, databases=cfg.tacl.databases
-    )
-    tacls.database_snapshot()


 @task("assessment", depends_on=[crawl_tables], job_cluster="tacl")
