Commit df7f1d7
# Escape column names in target tables of the table migration (#2563)
## Changes

Escape column names in target tables of the table migration.

### Linked issues

Resolves #2544

### Functionality

- [x] modified existing workflows: the `-migration-` ones

### Tests

- [x] added unit tests
- [x] changed integration tests
1 parent 591761c commit df7f1d7
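In effect, every column name in the DDL emitted for target tables is now backtick-escaped. A before/after sketch, abridged from the updated unit-test expectations further down (the trailing LOCATION clause is elided here):

```python
# Before: column names emitted verbatim, which breaks on names that need quoting.
before = "CREATE TABLE IF NOT EXISTS `tgt_catalog`.`tgt_db`.`test` (col1 string, col2 decimal) ..."
# After: each column name is wrapped in backticks.
after = "CREATE TABLE IF NOT EXISTS `tgt_catalog`.`tgt_db`.`test` (`col1` string, `col2` decimal) ..."
```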

File tree

9 files changed: +146 additions, −22 deletions

src/databricks/labs/ucx/framework/utils.py

Lines changed: 3 additions & 2 deletions
@@ -4,19 +4,20 @@
 logger = logging.getLogger(__name__)


-def escape_sql_identifier(path: str) -> str:
+def escape_sql_identifier(path: str, *, maxsplit: int = 2) -> str:
     """
     Escapes the path components to make them SQL safe.

     Args:
         path (str): The dot-separated path of a catalog object.
+        maxsplit (int): The maximum number of splits to perform.

     Returns:
         str: The path with all parts escaped in backticks.
     """
     if not path:
         return path
-    parts = path.split(".", maxsplit=2)
+    parts = path.split(".", maxsplit=maxsplit)
     escaped = [f"`{part.strip('`').replace('`', '``')}`" for part in parts]
     return ".".join(escaped)
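A minimal sketch of the change in behavior, derived from the function body above: the default `maxsplit=2` treats a dotted path as `catalog.schema.object`, while the new `maxsplit=0` escapes the whole string as a single identifier, which is what the column-name call sites use.

```python
from databricks.labs.ucx.framework.utils import escape_sql_identifier

# Default: split into at most three parts (catalog, schema, object).
assert escape_sql_identifier("catalog.schema.table") == "`catalog`.`schema`.`table`"

# maxsplit=0: no splitting, so a column name containing periods stays whole.
assert escape_sql_identifier("column.with.periods", maxsplit=0) == "`column.with.periods`"

# Embedded backticks are doubled after stripping any outer ones.
assert escape_sql_identifier("1-0`.0-ugly-column", maxsplit=0) == "`1-0``.0-ugly-column`"
```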

src/databricks/labs/ucx/hive_metastore/tables.py

Lines changed: 5 additions & 5 deletions
@@ -305,17 +305,17 @@ def sql_migrate_view(self, target_table_key):
     def sql_migrate_table_in_mount(self, target_table_key: str, table_schema: Iterator[typing.Any]):
         fields = []
         partitioned_fields = []
-        next_fileds_are_partitioned = False
+        next_fields_are_partitioned = False
         for key, value, _ in table_schema:
             if key == "# Partition Information":
                 continue
             if key == "# col_name":
-                next_fileds_are_partitioned = True
+                next_fields_are_partitioned = True
                 continue
-            if next_fileds_are_partitioned:
-                partitioned_fields.append(f"{key}")
+            if next_fields_are_partitioned:
+                partitioned_fields.append(escape_sql_identifier(key, maxsplit=0))
             else:
-                fields.append(f"{key} {value}")
+                fields.append(f"{escape_sql_identifier(key, maxsplit=0)} {value}")

         partitioned_str = ""
         if partitioned_fields:
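For context, `sql_migrate_table_in_mount` iterates over DESCRIBE-style `(col_name, data_type, comment)` rows, where rows after the `# col_name` marker name partition columns. A small sketch of the mapping, mirroring the new unit test further down:

```python
# DESCRIBE-style rows as consumed by sql_migrate_table_in_mount:
table_schema = [
    ("id", "STRING", ""),
    ("country", "STRING", ""),
    ("# Partition Information", "", ""),  # skipped
    ("# col_name", "", ""),               # everything after is a partition column
    ("country", "", ""),
]
# Both field and partition names now pass through
# escape_sql_identifier(..., maxsplit=0), yielding:
#   (`id` STRING, `country` STRING) PARTITIONED BY (`country`)
```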

src/databricks/labs/ucx/mixins/fixtures.py

Lines changed: 57 additions & 4 deletions
@@ -25,6 +25,7 @@
     AwsIamRoleRequest,
     AzureServicePrincipal,
     CatalogInfo,
+    ColumnInfo,
     DataSourceFormat,
     FunctionInfo,
     SchemaInfo,
@@ -55,6 +56,7 @@
 from databricks.sdk.service.workspace import ImportFormat, Language

 from databricks.labs.ucx.workspace_access.groups import MigratedGroup
+from databricks.labs.ucx.framework.utils import escape_sql_identifier

 # this file will get to databricks-labs-pytester project and be maintained/refactored there
 # pylint: disable=redefined-outer-name,too-many-try-statements,import-outside-toplevel,unnecessary-lambda,too-complex,invalid-name
@@ -1014,6 +1016,37 @@ def remove(schema_info: SchemaInfo):
 @pytest.fixture
 # pylint: disable-next=too-many-statements
 def make_table(ws, sql_backend, make_schema, make_random) -> Generator[Callable[..., TableInfo], None, None]:
+    def generate_sql_schema(columns: list[ColumnInfo]) -> str:
+        """Generate a SQL schema from columns."""
+        schema = "("
+        for index, column in enumerate(columns):
+            schema += escape_sql_identifier(column.name or str(index), maxsplit=0)
+            if column.type_name is None:
+                type_name = "STRING"
+            else:
+                type_name = column.type_name.value
+            schema += f" {type_name}, "
+        schema = schema[:-2] + ")"  # Remove the last ', '
+        return schema
+
+    def generate_sql_column_casting(existing_columns: list[ColumnInfo], new_columns: list[ColumnInfo]) -> str:
+        """Generate the SQL to cast columns"""
+        if any(column.name is None for column in existing_columns):
+            raise ValueError(f"Columns should have a name: {existing_columns}")
+        if len(new_columns) > len(existing_columns):
+            raise ValueError(f"Too many columns: {new_columns}")
+        select_expressions = []
+        for index, (existing_column, new_column) in enumerate(zip(existing_columns, new_columns)):
+            column_name_new = escape_sql_identifier(new_column.name or str(index), maxsplit=0)
+            if new_column.type_name is None:
+                type_name = "STRING"
+            else:
+                type_name = new_column.type_name.value
+            select_expression = f"CAST({existing_column.name} AS {type_name}) AS {column_name_new}"
+            select_expressions.append(select_expression)
+        select = ", ".join(select_expressions)
+        return select
+
     def create(  # pylint: disable=too-many-locals,too-many-arguments,too-many-statements
         *,
         catalog_name="hive_metastore",
@@ -1028,6 +1061,7 @@ def create( # pylint: disable=too-many-locals,too-many-arguments,too-many-state
         tbl_properties: dict[str, str] | None = None,
         hiveserde_ddl: str | None = None,
         storage_override: str | None = None,
+        columns: list[ColumnInfo] | None = None,
     ) -> TableInfo:
         if schema_name is None:
             schema = make_schema(catalog_name=catalog_name)
@@ -1041,6 +1075,10 @@ def create( # pylint: disable=too-many-locals,too-many-arguments,too-many-state
         view_text = None
         full_name = f"{catalog_name}.{schema_name}.{name}".lower()
         ddl = f'CREATE {"VIEW" if view else "TABLE"} {full_name}'
+        if columns is None:
+            schema = "(id INT, value STRING)"
+        else:
+            schema = generate_sql_schema(columns)
         if view:
             table_type = TableType.VIEW
             view_text = ctas
@@ -1052,21 +1090,36 @@ def create( # pylint: disable=too-many-locals,too-many-arguments,too-many-state
             data_source_format = DataSourceFormat.JSON
             # DBFS locations are not purged; no suffix necessary.
             storage_location = f"dbfs:/tmp/ucx_test_{make_random(4)}"
+            if columns is None:
+                select = "*"
+            else:
+                # These are the columns from the JSON dataset below
+                dataset_columns = [
+                    ColumnInfo(name="calories_burnt"),
+                    ColumnInfo(name="device_id"),
+                    ColumnInfo(name="id"),
+                    ColumnInfo(name="miles_walked"),
+                    ColumnInfo(name="num_steps"),
+                    ColumnInfo(name="timestamp"),
+                    ColumnInfo(name="user_id"),
+                    ColumnInfo(name="value"),
+                ]
+                select = generate_sql_column_casting(dataset_columns, columns)
             # Modified, otherwise it will identify the table as a DB Dataset
             ddl = (
-                f"{ddl} USING json location '{storage_location}' as SELECT * FROM "
+                f"{ddl} USING json location '{storage_location}' as SELECT {select} FROM "
                 f"JSON.`dbfs:/databricks-datasets/iot-stream/data-device`"
             )
         elif external_csv is not None:
             table_type = TableType.EXTERNAL
             data_source_format = DataSourceFormat.CSV
             storage_location = external_csv
-            ddl = f"{ddl} USING CSV OPTIONS (header=true) LOCATION '{storage_location}'"
+            ddl = f"{ddl} {schema} USING CSV OPTIONS (header=true) LOCATION '{storage_location}'"
         elif external_delta is not None:
             table_type = TableType.EXTERNAL
             data_source_format = DataSourceFormat.DELTA
             storage_location = external_delta
-            ddl = f"{ddl} (id string) LOCATION '{storage_location}'"
+            ddl = f"{ddl} {schema} LOCATION '{storage_location}'"
         elif external:
             # external table
             table_type = TableType.EXTERNAL
@@ -1079,7 +1132,7 @@ def create( # pylint: disable=too-many-locals,too-many-arguments,too-many-state
             table_type = TableType.MANAGED
             data_source_format = DataSourceFormat.DELTA
             storage_location = f"dbfs:/user/hive/warehouse/{schema_name}/{name}"
-            ddl = f"{ddl} (id INT, value STRING)"
+            ddl = f"{ddl} {schema}"
         if tbl_properties:
             tbl_properties.update({"RemoveAfter": get_test_purge_time()})
         else:
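A hypothetical test snippet showing how the new `columns` parameter is exercised (mirroring the integration-test changes below); `type_name` falls back to `STRING` when omitted:

```python
from databricks.sdk.service.catalog import ColumnInfo, ColumnTypeName

# Create a managed table whose single column name needs escaping; the fixture
# renders the schema as (`1-0``.0-ugly-column` STRING).
table = make_table(
    columns=[ColumnInfo(name="1-0`.0-ugly-column", type_name=ColumnTypeName.STRING)],
)
```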

tests/integration/hive_metastore/test_migrate.py

Lines changed: 19 additions & 4 deletions
@@ -5,7 +5,7 @@
 from databricks.sdk.errors import NotFound
 from databricks.sdk.retries import retried
 from databricks.sdk.service.compute import DataSecurityMode, AwsAttributes
-from databricks.sdk.service.catalog import Privilege, SecurableType, TableInfo, TableType
+from databricks.sdk.service.catalog import ColumnInfo, ColumnTypeName, Privilege, SecurableType, TableInfo, TableType
 from databricks.sdk.service.iam import PermissionLevel
 from databricks.labs.ucx.config import WorkspaceConfig
 from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping
@@ -20,7 +20,11 @@
 @retried(on=[NotFound], timeout=timedelta(minutes=2))
 def test_migrate_managed_tables(ws, sql_backend, runtime_ctx, make_catalog):
     src_schema = runtime_ctx.make_schema(catalog_name="hive_metastore")
-    src_managed_table = runtime_ctx.make_table(catalog_name=src_schema.catalog_name, schema_name=src_schema.name)
+    src_managed_table = runtime_ctx.make_table(
+        catalog_name=src_schema.catalog_name,
+        schema_name=src_schema.name,
+        columns=[ColumnInfo(name="-das-hes-", type_name=ColumnTypeName.STRING)],  # Test with column that needs escaping
+    )

     dst_catalog = make_catalog()
     dst_schema = runtime_ctx.make_schema(catalog_name=dst_catalog.name, name=src_schema.name)
@@ -48,7 +52,11 @@ def test_migrate_dbfs_non_delta_tables(ws, sql_backend, runtime_ctx, make_catalo
         pytest.skip("temporary: only works in azure test env")
     src_schema = runtime_ctx.make_schema(catalog_name="hive_metastore")
     src_managed_table = runtime_ctx.make_table(
-        catalog_name=src_schema.catalog_name, non_delta=True, schema_name=src_schema.name
+        catalog_name=src_schema.catalog_name,
+        non_delta=True,
+        schema_name=src_schema.name,
+        # Test with column that needs escaping
+        columns=[ColumnInfo(name="1-0`.0-ugly-column", type_name=ColumnTypeName.STRING)],
     )

     dst_catalog = make_catalog()
@@ -134,7 +142,12 @@ def test_migrate_external_table(
     make_mounted_location,
 ):
     src_schema = runtime_ctx.make_schema(catalog_name="hive_metastore")
-    src_external_table = runtime_ctx.make_table(schema_name=src_schema.name, external_csv=make_mounted_location)
+    src_external_table = runtime_ctx.make_table(
+        schema_name=src_schema.name,
+        external_csv=make_mounted_location,
+        # Test with column that needs escaping
+        columns=[ColumnInfo(name="`back`ticks`", type_name=ColumnTypeName.STRING)],
+    )
     dst_catalog = make_catalog()
     dst_schema = runtime_ctx.make_schema(catalog_name=dst_catalog.name, name=src_schema.name)
     logger.info(f"dst_catalog={dst_catalog.name}, external_table={src_external_table.full_name}")
@@ -667,6 +680,8 @@ def test_migrate_table_in_mount(
     src_external_table = runtime_ctx.make_table(
         schema_name=src_schema.name,
         external_delta=f"dbfs:/mnt/{env_or_skip('TEST_MOUNT_NAME')}/a/b/{table_path}",
+        # Test with column that needs escaping
+        columns=[ColumnInfo(name="1-0`.0-ugly-column", type_name=ColumnTypeName.STRING)],
     )
     table_in_mount_location = f"abfss://[email protected]/a/b/{table_path}"
     # TODO: Remove this hack below

tests/unit/conftest.py

Lines changed: 5 additions & 0 deletions
@@ -108,3 +108,8 @@ def mock_notebook_resolver():
     resolver = create_autospec(BaseNotebookResolver)
     resolver.resolve_notebook.return_value = None
     return resolver
+
+
+@pytest.fixture
+def mock_backend():
+    return MockBackend()

tests/unit/framework/test_utils.py

Lines changed: 6 additions & 0 deletions
@@ -28,3 +28,9 @@
 )
 def test_escaped_path(path: str, expected: str) -> None:
     assert escape_sql_identifier(path) == expected
+
+
+def test_escaped_when_column_contains_period() -> None:
+    expected = "`column.with.periods`"
+    path = "column.with.periods"
+    assert escape_sql_identifier(path, maxsplit=0) == expected

tests/unit/hive_metastore/test_table_migrate.py

Lines changed: 2 additions & 2 deletions
@@ -1068,7 +1068,7 @@ def test_table_in_mount_mapping_with_table_owner():
     )
     table_migrate.migrate_tables(what=What.TABLE_IN_MOUNT)
     assert (
-        "CREATE TABLE IF NOT EXISTS `tgt_catalog`.`tgt_db`.`test` (col1 string, col2 decimal) LOCATION 'abfss://bucket@msft/path/test';"
+        "CREATE TABLE IF NOT EXISTS `tgt_catalog`.`tgt_db`.`test` (`col1` string, `col2` decimal) LOCATION 'abfss://bucket@msft/path/test';"
         in backend.queries
     )
     migrate_grants.apply.assert_called()
@@ -1111,7 +1111,7 @@ def test_table_in_mount_mapping_with_partition_information():
     )
     table_migrate.migrate_tables(what=What.TABLE_IN_MOUNT)
     assert (
-        "CREATE TABLE IF NOT EXISTS `tgt_catalog`.`tgt_db`.`test` (col1 string, col2 decimal) PARTITIONED BY (col1) LOCATION 'abfss://bucket@msft/path/test';"
+        "CREATE TABLE IF NOT EXISTS `tgt_catalog`.`tgt_db`.`test` (`col1` string, `col2` decimal) PARTITIONED BY (`col1`) LOCATION 'abfss://bucket@msft/path/test';"
         in backend.queries
     )
     migrate_grants.apply.assert_called()

tests/unit/hive_metastore/test_tables.py

Lines changed: 49 additions & 0 deletions
@@ -127,6 +127,55 @@ def test_uc_sql(table, target, query):
     assert table.sql_migrate_external(target) == query


+@pytest.mark.parametrize(
+    "schema,partitions,table_schema",
+    [
+        (
+            "(`id` INT, `value` STRING)",
+            "",
+            [
+                ("id", "INT", ""),
+                ("value", "STRING", ""),
+            ],
+        ),
+        (
+            "(`column.with.periods` STRING)",
+            "",
+            [
+                ("column.with.periods", "STRING", ""),
+            ],
+        ),
+        (
+            "(`id` STRING, `country` STRING)",
+            "PARTITIONED BY (`country`)",
+            [
+                ("id", "STRING", ""),
+                ("country", "STRING", ""),
+                ("# Partition Information", "", ""),
+                ("# col_name", "", ""),
+                ("country", "", ""),
+            ],
+        ),
+    ],
+)
+def test_uc_sql_when_table_is_in_mount(schema, partitions, table_schema):
+    expected = (
+        f"CREATE TABLE IF NOT EXISTS `new_catalog`.`db`.`external_table` "
+        f"{schema} {partitions} LOCATION 's3a://foo/bar';"
+    )
+    table = Table(
+        catalog="catalog",
+        database="db",
+        name="external_table",
+        object_type="EXTERNAL",
+        table_format="DELTA",
+        location="s3a://foo/bar",
+    )
+    target = "new_catalog.db.external_table"
+
+    assert table.sql_migrate_table_in_mount(target, table_schema) == expected
+
+
 def test_tables_returning_error_when_describing():
     errors = {"DESCRIBE TABLE EXTENDED `hive_metastore`.`database`.`table1`": "error"}
     rows = {

tests/unit/workspace_access/test_manager.py

Lines changed: 0 additions & 5 deletions
@@ -14,11 +14,6 @@
 from databricks.labs.ucx.workspace_access.manager import PermissionManager, Permissions


-@pytest.fixture
-def mock_backend():
-    return MockBackend()
-
-
 def test_inventory_permission_manager_init(mock_backend):
     permission_manager = PermissionManager(mock_backend, "test_database", [])
