
Commit e60486f

Simplify SqlBackend and table creation logic (#203)
Fixes #202
1 parent 34c7a90 commit e60486f

File tree

9 files changed: +253 -178 lines changed


src/databricks/labs/ucx/framework/crawlers.py

Lines changed: 84 additions & 112 deletions
@@ -3,6 +3,7 @@
 import os
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
+from typing import ClassVar
 
 from databricks.sdk import WorkspaceClient
 
@@ -20,6 +21,26 @@ def execute(self, sql):
     def fetch(self, sql) -> Iterator[any]:
         raise NotImplementedError
 
+    @abstractmethod
+    def save_table(self, full_name: str, rows: list[any], mode: str = "append"):
+        raise NotImplementedError
+
+    _builtin_type_mapping: ClassVar[dict[type, str]] = {str: "STRING", int: "INT", bool: "BOOLEAN", float: "FLOAT"}
+
+    @classmethod
+    def _schema_for(cls, klass):
+        fields = []
+        for f in dataclasses.fields(klass):
+            if f.type not in cls._builtin_type_mapping:
+                msg = f"Cannot auto-convert {f.type}"
+                raise SyntaxError(msg)
+            not_null = " NOT NULL"
+            if f.default is None:
+                not_null = ""
+            spark_type = cls._builtin_type_mapping[f.type]
+            fields.append(f"{f.name} {spark_type}{not_null}")
+        return ", ".join(fields)
+
 
 class StatementExecutionBackend(SqlBackend):
     def __init__(self, ws: WorkspaceClient, warehouse_id):
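For illustration only (not part of this commit): given a small dataclass such as the hypothetical Grantee below, SqlBackend._schema_for turns its fields into a Delta-compatible column list. Fields without a default become NOT NULL; a None default makes the column nullable.

from dataclasses import dataclass

@dataclass
class Grantee:  # hypothetical example type, not from the repo
    principal: str          # no default   -> principal STRING NOT NULL
    privilege: str          # no default   -> privilege STRING NOT NULL
    inherited: bool = None  # None default -> inherited BOOLEAN (nullable)

# SqlBackend._schema_for(Grantee) returns:
#   "principal STRING NOT NULL, privilege STRING NOT NULL, inherited BOOLEAN"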
@@ -34,6 +55,40 @@ def fetch(self, sql) -> Iterator[any]:
         logger.debug(f"[api][fetch] {sql}")
         return self._sql.execute_fetch_all(self._warehouse_id, sql)
 
+    def save_table(self, full_name: str, rows: list[any], mode="append"):
+        if mode == "overwrite":
+            msg = "Overwrite mode is not yet supported"
+            raise NotImplementedError(msg)
+
+        if len(rows) == 0:
+            return
+
+        klass = rows[0].__class__
+        ddl = f"CREATE TABLE IF NOT EXISTS {full_name} ({self._schema_for(klass)}) USING DELTA"
+        self.execute(ddl)
+
+        fields = dataclasses.fields(klass)
+        field_names = [f.name for f in fields]
+        vals = "), (".join(self._row_to_sql(r, fields) for r in rows)
+        sql = f'INSERT INTO {full_name} ({", ".join(field_names)}) VALUES ({vals})'
+        self.execute(sql)
+
+    @staticmethod
+    def _row_to_sql(row, fields):
+        data = []
+        for f in fields:
+            value = getattr(row, f.name)
+            if value is None:
+                data.append("NULL")
+            elif f.type == bool:
+                data.append("TRUE" if value else "FALSE")
+            elif f.type == str:
+                data.append(f"'{value}'")
+            else:
+                msg = f"unknown type: {f.type}"
+                raise ValueError(msg)
+        return ", ".join(data)
+
 
 class RuntimeBackend(SqlBackend):
     def __init__(self):
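Continuing the illustration (the table name and rows are made up, and ws/warehouse_id are assumed to be configured), appending two Grantee rows through StatementExecutionBackend.save_table issues statements along these lines:

backend = StatementExecutionBackend(ws, warehouse_id)
backend.save_table(
    "hive_metastore.ucx.grants",
    [Grantee("alice", "SELECT", False), Grantee("bob", "MODIFY")],
)
# executes, in order:
#   CREATE TABLE IF NOT EXISTS hive_metastore.ucx.grants
#       (principal STRING NOT NULL, privilege STRING NOT NULL, inherited BOOLEAN) USING DELTA
#   INSERT INTO hive_metastore.ucx.grants (principal, privilege, inherited)
#       VALUES ('alice', 'SELECT', FALSE), ('bob', 'MODIFY', NULL)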
@@ -42,6 +97,7 @@ def __init__(self):
         if "DATABRICKS_RUNTIME_VERSION" not in os.environ:
             msg = "Not in the Databricks Runtime"
             raise RuntimeError(msg)
+
         self._spark = SparkSession.builder.getOrCreate()
 
     def execute(self, sql):
@@ -52,6 +108,13 @@ def fetch(self, sql) -> Iterator[any]:
         logger.debug(f"[spark][fetch] {sql}")
         return self._spark.sql(sql).collect()
 
+    def save_table(self, full_name: str, rows: list[any], mode: str = "append"):
+        if len(rows) == 0:
+            return
+        # pyspark deals well with lists of dataclass instances, as long as schema is provided
+        df = self._spark.createDataFrame(rows, self._schema_for(rows[0]))
+        df.write.saveAsTable(full_name, mode=mode)
+
 
 class CrawlerBase:
     def __init__(self, backend: SqlBackend, catalog: str, schema: str, table: str):
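The Spark-side counterpart, sketched with the same hypothetical rows; it only runs inside the Databricks Runtime, where pyspark builds a DataFrame from the dataclass instances plus the schema string produced by _schema_for:

backend = RuntimeBackend()  # raises RuntimeError outside the Databricks Runtime
backend.save_table(
    "hive_metastore.ucx.grants",
    [Grantee("alice", "SELECT", False)],
    mode="append",
)
# roughly: spark.createDataFrame(rows, schema).write.saveAsTable(full_name, mode="append")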
@@ -116,129 +179,38 @@ def _try_valid(cls, name: str):
             return None
         return cls._valid(name)
 
-    def _snapshot(self, klass, fetcher, loader) -> list[any]:
+    def _snapshot(self, fetcher, loader) -> list[any]:
         """
-        Tries to load dataset of records with the type `klass` with `fetcher` function,
-        otherwise automatically creates a table with the schema defined in `klass` and
-        executes `loader` function to populate the dataset.
+        Tries to load dataset of records with `fetcher` function, otherwise automatically creates
+        a table with the schema defined in the class of the first row and executes `loader` function
+        to populate the dataset.
 
         Args:
-            klass: The class representing the data structure.
             fetcher: A function to fetch existing data.
             loader: A function to load new data.
 
-        Behavior:
-        - Initiates an infinite loop to attempt fetching existing data using the provided fetcher function.
-        - If the fetcher function encounters a runtime error with the message "TABLE_OR_VIEW_NOT_FOUND",
-          it indicates that the data does not exist in the table.
-        - In this case, the method logs that the data is not found and triggers the loader function to load new data.
-        - The new data loaded by the loader function is then appended to the existing table using the `_append_records`
-          method.
-
-        Note:
-        - The method assumes that the provided fetcher and loader functions operate on the same data structure.
-        - The fetcher function should return an iterator of data records.
-        - The loader function should return an iterator of new data records to be added to the table.
-
         Exceptions:
         - If a runtime error occurs during fetching (other than "TABLE_OR_VIEW_NOT_FOUND"), the original error is
           re-raised.
 
         Returns:
            list[any]: A list of data records, either fetched or loaded.
        """
-        loaded = False
-        trigger_load = ValueError("trigger records load")
-        while True:
-            try:
-                logger.debug(f"[{self._full_name}] fetching {self._table} inventory")
-                cached_results = list(fetcher())
-                if len(cached_results) == 0 and loaded:
-                    return cached_results
-                if len(cached_results) == 0 and not loaded:
-                    raise trigger_load
+        logger.debug(f"[{self._full_name}] fetching {self._table} inventory")
+        try:
+            cached_results = list(fetcher())
+            if len(cached_results) > 0:
                 return cached_results
-            except Exception as e:
-                if not (e == trigger_load or "TABLE_OR_VIEW_NOT_FOUND" in str(e)):
-                    raise e
-                logger.debug(f"[{self._full_name}] crawling new batch for {self._table}")
-                loaded_records = list(loader())
-                if len(loaded_records) > 0:
-                    logger.debug(f"[{self._full_name}] found {len(loaded_records)} new records for {self._table}")
-                    self._append_records(klass, loaded_records)
-                loaded = True
-
-    @staticmethod
-    def _row_to_sql(row, fields):
-        data = []
-        for f in fields:
-            value = getattr(row, f.name)
-            if value is None:
-                data.append("NULL")
-            elif f.type == bool:
-                data.append("TRUE" if value else "FALSE")
-            elif f.type == str:
-                data.append(f"'{value}'")
-            else:
-                msg = f"unknown type: {f.type}"
-                raise ValueError(msg)
-        return ", ".join(data)
-
-    @staticmethod
-    def _field_type(f):
-        if f.type == bool:
-            return "BOOLEAN"
-        elif f.type == str:
-            return "STRING"
-        else:
-            msg = f"unknown type: {f.type}"
-            raise ValueError(msg)
-
-    def _append_records(self, klass, records: Iterator[any]):
-        """
-        Appends records to the table or creates the table if it does not exist.
-
-        Args:
-            klass: The class representing the data structure.
-            records (Iterator[any]): An iterator of records to be appended.
-
-        Behavior:
-        - Retrieves the fields of the provided class representing the data.
-        - Generates a comma-separated list of field names from the fields.
-        - Converts each record into a formatted SQL representation using the `_row_to_sql` method.
-        - Constructs an SQL INSERT statement with the formatted field names and values.
-        - Attempts to execute the INSERT statement using the `_exec` function.
-        - If the table does not exist (TABLE_OR_VIEW_NOT_FOUND), it creates the table using a CREATE TABLE statement.
-
-        Note:
-        - The method assumes that the target table exists in the database.
-        - If the table does not exist, it will be created with the schema inferred from the class fields.
-        - If the table already exists, the provided records will be appended to it.
-
-        Exceptions:
-        - If a runtime error occurs during execution, it checks if the error message contains "TABLE_OR_VIEW_NOT_FOUND".
-        - If the table does not exist, a new table will be created using the schema inferred from the class fields.
-        - If the error is different, the original error is re-raised.
-        """
-        fields = dataclasses.fields(klass)
-        field_names = [f.name for f in fields]
-        vals = "), (".join(self._row_to_sql(r, fields) for r in records)
-        sql = f'INSERT INTO {self._full_name} ({", ".join(field_names)}) VALUES ({vals})'
-        while True:
-            try:
-                logger.debug(f"[{self._full_name}] appending records")
-                self._exec(sql)
-                return
-            except Exception as e:
-                if "TABLE_OR_VIEW_NOT_FOUND" not in str(e):
-                    raise e
-                logger.debug(f"[{self._full_name}] not found. creating")
-                schema = ", ".join(f"{f.name} {self._field_type(f)}" for f in fields)
-                try:
-                    self._exec(f"CREATE TABLE {self._full_name} ({schema}) USING DELTA")
-                except Exception as e:
-                    schema_not_found = "SCHEMA_NOT_FOUND" in str(e)
-                    if not schema_not_found:
-                        raise e
-                    logger.debug(f"[{self._catalog}.{self._schema}] not found. creating")
-                    self._exec(f"CREATE SCHEMA {self._catalog}.{self._schema}")
+        except Exception as err:
+            if "TABLE_OR_VIEW_NOT_FOUND" not in str(err):
+                raise err
+        logger.debug(f"[{self._full_name}] crawling new batch for {self._table}")
+        loaded_records = list(loader())
+        self._append_records(loaded_records)
+        return loaded_records
+
+    def _append_records(self, items):
+        if len(items) == 0:
+            return
+        logger.debug(f"[{self._full_name}] found {len(items)} new records for {self._table}")
+        self._backend.save_table(self._full_name, items, mode="append")
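To make the simplified fetch-or-crawl flow concrete, here is a minimal sketch of a crawler built on the new CrawlerBase API; ClusterCrawler and ClusterInfo are invented for this example and are not part of the commit:

from dataclasses import dataclass

from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend


@dataclass
class ClusterInfo:  # illustrative record type
    cluster_id: str
    policy_id: str = None


class ClusterCrawler(CrawlerBase):  # hypothetical crawler
    def __init__(self, backend: SqlBackend, catalog: str, schema: str):
        super().__init__(backend, catalog, schema, "clusters")

    def snapshot(self) -> list[ClusterInfo]:
        # first call: the fetcher hits TABLE_OR_VIEW_NOT_FOUND, so the loader crawls once
        # and its records are persisted via backend.save_table; later calls return the cache
        return self._snapshot(self._try_load, self._crawl)

    def _try_load(self):
        for row in self._backend.fetch(f"SELECT * FROM {self._full_name}"):
            yield ClusterInfo(*row)

    def _crawl(self) -> list[ClusterInfo]:
        return [ClusterInfo(cluster_id="0123-456789-example")]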

src/databricks/labs/ucx/framework/tasks.py

Lines changed: 4 additions & 0 deletions
@@ -47,6 +47,10 @@ def wrapper(*args, **kwargs):
                 continue
             deps.append(fn.__name__)
 
+        if not func.__doc__:
+            msg = f"Task {func.__name__} must have documentation"
+            raise SyntaxError(msg)
+
         _TASKS[func.__name__] = Task(
             workflow=workflow, name=func.__name__, doc=func.__doc__, fn=func, depends_on=deps, job_cluster=job_cluster
         )
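Illustration (not from the repo): with this guard in place, registering an undocumented task fails at import time instead of producing a job task with an empty description:

@task("assessment")
def crawl_something(cfg: MigrationConfig):
    pass  # no docstring

# -> SyntaxError: Task crawl_something must have documentation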

src/databricks/labs/ucx/hive_metastore/grants.py

Lines changed: 1 addition & 3 deletions
@@ -120,9 +120,7 @@ def __init__(self, tc: TablesCrawler):
         self._tc = tc
 
     def snapshot(self, catalog: str, database: str) -> list[Grant]:
-        return self._snapshot(
-            Grant, partial(self._try_load, catalog, database), partial(self._crawl, catalog, database)
-        )
+        return self._snapshot(partial(self._try_load, catalog, database), partial(self._crawl, catalog, database))
 
     def _try_load(self, catalog: str, database: str):
         for row in self._fetch(

src/databricks/labs/ucx/hive_metastore/tables.py

Lines changed: 1 addition & 3 deletions
@@ -95,9 +95,7 @@ def snapshot(self, catalog: str, database: str) -> list[Table]:
         Returns:
             list[Table]: A list of Table objects representing the snapshot of tables.
         """
-        return self._snapshot(
-            Table, partial(self._try_load, catalog, database), partial(self._crawl, catalog, database)
-        )
+        return self._snapshot(partial(self._try_load, catalog, database), partial(self._crawl, catalog, database))
 
     def _try_load(self, catalog: str, database: str):
         """Tries to load table information from the database or throws TABLE_OR_VIEW_NOT_FOUND error"""

src/databricks/labs/ucx/runtime.py

Lines changed: 9 additions & 1 deletion
@@ -5,6 +5,7 @@
 from databricks.sdk import WorkspaceClient
 
 from databricks.labs.ucx.config import MigrationConfig
+from databricks.labs.ucx.framework.crawlers import RuntimeBackend
 from databricks.labs.ucx.framework.tasks import task, trigger
 from databricks.labs.ucx.hive_metastore import TaclToolkit
 from databricks.labs.ucx.workspace_access import GroupMigrationToolkit
@@ -13,6 +14,13 @@
 
 
 @task("assessment")
+def setup_schema(cfg: MigrationConfig):
+    """Creates a database for UCX migration intermediate state"""
+    backend = RuntimeBackend()
+    backend.execute(f"CREATE SCHEMA IF NOT EXISTS hive_metastore.{cfg.inventory_database}")
+
+
+@task("assessment", depends_on=[setup_schema])
 def crawl_tables(cfg: MigrationConfig):
     """During this operation, a systematic scan is conducted, encompassing every table within the Hive Metastore.
     This scan extracts essential details associated with each table, including its unique identifier or name, table
@@ -48,7 +56,7 @@ def crawl_grants(cfg: MigrationConfig):
     tacls.grants_snapshot()
 
 
-@task("assessment")
+@task("assessment", depends_on=[setup_schema])
 def inventorize_permissions(cfg: MigrationConfig):
     """As we embark on the complex migration journey from Hive Metastore to the Databricks Unity Catalog, a pivotal
     aspect of this transition is the comprehensive examination and preservation of permissions associated with a myriad
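In effect (a sketch, assuming cfg.inventory_database is "ucx"), the new bootstrap task reduces to a single idempotent DDL statement, and every task declaring depends_on=[setup_schema] is scheduled after it:

backend = RuntimeBackend()
# with cfg.inventory_database == "ucx", setup_schema boils down to:
backend.execute("CREATE SCHEMA IF NOT EXISTS hive_metastore.ucx")
# so save_table can CREATE TABLE IF NOT EXISTS without creating the schema lazily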

src/databricks/labs/ucx/workspace_access/manager.py

Lines changed: 1 addition & 3 deletions
@@ -73,9 +73,7 @@ def cleanup(self):
         logger.info("Inventory table cleanup complete")
 
     def _save(self, items: list[Permissions]):
-        # TODO: update instead of append
-        logger.info(f"Saving {len(items)} items to {self._full_name}")
-        self._append_records(Permissions, items)
+        self._append_records(items)  # TODO: update instead of append
         logger.info("Successfully saved the items to inventory table")
 
     def _load_all(self) -> list[Permissions]:

tests/unit/framework/mocks.py

Lines changed: 12 additions & 0 deletions
@@ -13,6 +13,7 @@ def __init__(self, *, fails_on_first: dict | None = None, rows: dict | None = No
         if not rows:
             rows = {}
         self._rows = rows
+        self._save_table = []
         self.queries = []
 
     def _sql(self, sql):
@@ -39,3 +40,14 @@ def fetch(self, sql) -> Iterator[any]:
         rows.extend(self._rows[pattern])
         logger.debug(f"Returning rows: {rows}")
         return iter(rows)
+
+    def save_table(self, full_name: str, rows: list[any], mode: str = "append"):
+        self._save_table.append((full_name, rows, mode))
+
+    def rows_written_for(self, full_name: str, mode: str) -> list[any]:
+        rows = []
+        for stub_full_name, stub_rows, stub_mode in self._save_table:
+            if not (stub_full_name == full_name and stub_mode == mode):
+                continue
+            rows += stub_rows
+        return rows
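A minimal sketch of how a unit test might exercise the new stub; the test name and the Thing dataclass are invented for illustration:

from dataclasses import dataclass

from databricks.labs.ucx.framework.crawlers import CrawlerBase
from tests.unit.framework.mocks import MockBackend


@dataclass
class Thing:  # minimal record type for the test
    key: str


def test_append_records_uses_save_table():
    backend = MockBackend()
    crawler = CrawlerBase(backend, "hive_metastore", "ucx", "things")
    crawler._append_records([Thing("x")])
    assert backend.rows_written_for("hive_metastore.ucx.things", "append") == [Thing("x")]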
