Skip to content

Commit 669b136

Browse files
authored
Warn when table has column without no name during table migration (#2984)
## Changes Warn when table has column without no name during table migration. The code goes with a "do first, apologize later" approach to avoid an additional call to retrieve the column schema. ### Linked issues Resolves #2891 ### Functionality - [x] modified existing workflow: `tables-migrate` ### Tests - [x] added unit tests
1 parent c385d77 commit 669b136

File tree

2 files changed

+84
-6
lines changed

2 files changed

+84
-6
lines changed

src/databricks/labs/ucx/hive_metastore/table_migrate.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
import dataclasses
22
import logging
3+
import re
34
from collections import defaultdict
45
from functools import partial
56

67
from databricks.labs.blueprint.parallel import Threads
78
from databricks.labs.lsql.backends import SqlBackend
8-
9-
from databricks.labs.ucx.framework.utils import escape_sql_identifier
109
from databricks.sdk import WorkspaceClient
10+
from databricks.sdk.errors.platform import DatabricksError
1111

12+
from databricks.labs.ucx.framework.utils import escape_sql_identifier
1213
from databricks.labs.ucx.hive_metastore import TablesCrawler
1314
from databricks.labs.ucx.hive_metastore.grants import MigrateGrants
1415
from databricks.labs.ucx.hive_metastore.locations import ExternalLocations
@@ -28,7 +29,6 @@
2829
ViewsMigrationSequencer,
2930
ViewToMigrate,
3031
)
31-
from databricks.sdk.errors.platform import DatabricksError
3232

3333
logger = logging.getLogger(__name__)
3434

@@ -95,7 +95,7 @@ def _migrate_tables(
9595
for table in tables_in_scope:
9696
tasks.append(
9797
partial(
98-
self._migrate_table,
98+
self._safe_migrate_table,
9999
table,
100100
managed_table_external_storage,
101101
hiveserde_in_place_migrate,
@@ -131,15 +131,35 @@ def _migrate_managed_table(self, managed_table_external_storage: str, src_table:
131131
logger.warning(f"failed-to-migrate: unknown managed_table_external_storage: {managed_table_external_storage}")
132132
return True
133133

134-
def _migrate_table(
134+
def _safe_migrate_table(
135135
self,
136136
src_table: TableToMigrate,
137137
managed_table_external_storage: str,
138138
hiveserde_in_place_migrate: bool = False,
139-
):
139+
) -> bool:
140140
if self._table_already_migrated(src_table.rule.as_uc_table_key):
141141
logger.info(f"Table {src_table.src.key} already migrated to {src_table.rule.as_uc_table_key}")
142142
return True
143+
try:
144+
return self._migrate_table(src_table, managed_table_external_storage, hiveserde_in_place_migrate)
145+
except Exception as e: # pylint: disable=broad-exception-caught
146+
# Catching a Spark AnalysisException here, for which we do not have the dependency to catch explicitly
147+
pattern = ( # See https://github.com/databrickslabs/ucx/issues/2891
148+
r"INVALID_PARAMETER_VALUE: Invalid input: RPC CreateTable Field managedcatalog.ColumnInfo.name: "
149+
r'At columns.\d+: name "" is not a valid name`'
150+
)
151+
if re.match(pattern, str(e)):
152+
logger.warning(f"failed-to-migrate: Table with empty column name '{src_table.src.key}'", exc_info=e)
153+
else:
154+
logger.warning(f"failed-to-migrate: Unknown reason for table '{src_table.src.key}'", exc_info=e)
155+
return False
156+
157+
def _migrate_table(
158+
self,
159+
src_table: TableToMigrate,
160+
managed_table_external_storage: str,
161+
hiveserde_in_place_migrate: bool = False,
162+
) -> bool:
143163
if src_table.src.what == What.DBFS_ROOT_DELTA:
144164
return self._migrate_dbfs_root_table(src_table.src, src_table.rule)
145165
if src_table.src.what == What.DBFS_ROOT_NON_DELTA:

tests/unit/hive_metastore/test_table_migrate.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from databricks.sdk.service.catalog import CatalogInfo, SchemaInfo, TableInfo
1111

1212
from databricks.labs.ucx.framework.owners import AdministratorLocator
13+
from databricks.labs.ucx.framework.utils import escape_sql_identifier
1314
from databricks.labs.ucx.hive_metastore.grants import MigrateGrants
1415
from databricks.labs.ucx.hive_metastore.locations import ExternalLocations
1516
from databricks.labs.ucx.hive_metastore.mapping import (
@@ -1483,3 +1484,60 @@ def test_table_migration_status_source_table_unknown() -> None:
14831484

14841485
assert owner == "an_admin"
14851486
table_ownership.owner_of.assert_not_called()
1487+
1488+
1489+
class MockBackendWithGeneralException(MockBackend):
1490+
"""Mock backend that allows raising a general exception.
1491+
1492+
Note: we want to raise a Spark AnalysisException, for which we do not have the dependency to raise explicitly.
1493+
"""
1494+
1495+
@staticmethod
1496+
def _api_error_from_message(error_message: str): # No return type to avoid mypy complains on different return type
1497+
return Exception(error_message)
1498+
1499+
1500+
def test_migrate_tables_handles_table_with_empty_column(caplog) -> None:
1501+
table_crawler = create_autospec(TablesCrawler)
1502+
table = Table("hive_metastore", "schema", "table", "MANAGED", "DELTA")
1503+
1504+
error_message = (
1505+
"INVALID_PARAMETER_VALUE: Invalid input: RPC CreateTable Field managedcatalog.ColumnInfo.name: "
1506+
'At columns.21: name "" is not a valid name`'
1507+
)
1508+
query = f"ALTER TABLE {escape_sql_identifier(table.full_name)} SET TBLPROPERTIES ('upgraded_to' = 'catalog.schema.table');"
1509+
backend = MockBackendWithGeneralException(fails_on_first={query: error_message})
1510+
1511+
ws = create_autospec(WorkspaceClient)
1512+
ws.get_workspace_id.return_value = 123456789
1513+
1514+
table_mapping = create_autospec(TableMapping)
1515+
rule = Rule("workspace", "catalog", "schema", "schema", "table", "table")
1516+
table_to_migrate = TableToMigrate(table, rule)
1517+
table_mapping.get_tables_to_migrate.return_value = [table_to_migrate]
1518+
1519+
migration_status_refresher = create_autospec(TableMigrationStatusRefresher)
1520+
migration_status_refresher.get_seen_tables.return_value = {}
1521+
migration_status_refresher.index.return_value = []
1522+
1523+
migrate_grants = create_autospec(MigrateGrants)
1524+
external_locations = create_autospec(ExternalLocations)
1525+
table_migrator = TablesMigrator(
1526+
table_crawler,
1527+
ws,
1528+
backend,
1529+
table_mapping,
1530+
migration_status_refresher,
1531+
migrate_grants,
1532+
external_locations,
1533+
)
1534+
1535+
with caplog.at_level(logging.WARN, logger="databricks.labs.ucx.hive_metastore"):
1536+
table_migrator.migrate_tables(table.what)
1537+
assert "failed-to-migrate: Table with empty column name 'hive_metastore.schema.table'" in caplog.messages
1538+
1539+
table_crawler.snapshot.assert_not_called() # Mocking table mapping instead
1540+
ws.get_workspace_id.assert_not_called() # Errors before getting here
1541+
migration_status_refresher.index.assert_not_called() # Only called when migrating view
1542+
migrate_grants.apply.assert_not_called() # Errors before getting here
1543+
external_locations.resolve_mount.assert_not_called() # Only called when migrating external table

0 commit comments

Comments
 (0)