Skip to content

Commit 2e8a880

Browse files
william-conti and nfx authored
Improved automation for MANAGED table migration and continued building tables migration component (#295)
Fixes #106 --------- Co-authored-by: Serge Smertin <[email protected]>
1 parent b6fc0ab commit 2e8a880

File tree

7 files changed

+252
-32
lines changed

7 files changed

+252
-32
lines changed

src/databricks/labs/ucx/config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,8 @@ class WorkspaceConfig(_Config["WorkspaceConfig"]):
205205
connect: ConnectConfig | None = None
206206
num_threads: int | None = 10
207207
log_level: str | None = "INFO"
208+
database_to_catalog_mapping: dict[str, str] = None
209+
default_catalog: str = "ucx_default"
208210

209211
# Starting path for notebooks and directories crawler
210212
workspace_start_path: str = "/"
@@ -220,6 +222,8 @@ def from_dict(cls, raw: dict):
220222
warehouse_id=raw.get("warehouse_id", None),
221223
num_threads=raw.get("num_threads", 10),
222224
log_level=raw.get("log_level", "INFO"),
225+
database_to_catalog_mapping=raw.get("database_to_catalog_mapping", None),
226+
default_catalog=raw.get("default_catalog", "main"),
223227
)
224228

225229
def to_workspace_client(self) -> WorkspaceClient:

src/databricks/labs/ucx/hive_metastore/tables.py

Lines changed: 66 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from dataclasses import dataclass
44
from functools import partial
55

6+
from databricks.sdk import WorkspaceClient
7+
68
from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend
79
from databricks.labs.ucx.framework.parallel import ThreadedExecution
810
from databricks.labs.ucx.mixins.sql import Row
@@ -35,39 +37,38 @@ def key(self) -> str:
3537
def kind(self) -> str:
3638
return "VIEW" if self.view_text is not None else "TABLE"
3739

38-
def _sql_alter(self, catalog):
39-
return (
40-
f"ALTER {self.kind} {self.key} SET"
41-
f" TBLPROPERTIES ('upgraded_to' = '{catalog}.{self.database}.{self.name}');"
42-
)
43-
4440
def _sql_external(self, catalog):
45-
# TODO: https://github.com/databricks/ucx/issues/106
46-
return (
47-
f"CREATE TABLE IF NOT EXISTS {catalog}.{self.database}.{self.name}"
48-
f" LIKE {self.key} COPY LOCATION;" + self._sql_alter(catalog)
49-
)
41+
return f"SYNC TABLE {catalog}.{self.database}.{self.name} FROM {self.key};"
5042

5143
def _sql_managed(self, catalog):
5244
if not self.is_delta:
5345
msg = f"{self.key} is not DELTA: {self.table_format}"
5446
raise ValueError(msg)
55-
return (
56-
f"CREATE TABLE IF NOT EXISTS {catalog}.{self.database}.{self.name}"
57-
f" DEEP CLONE {self.key};" + self._sql_alter(catalog)
58-
)
47+
return f"CREATE TABLE IF NOT EXISTS {catalog}.{self.database}.{self.name} DEEP CLONE {self.key};"
5948

6049
def _sql_view(self, catalog):
6150
return f"CREATE VIEW IF NOT EXISTS {catalog}.{self.database}.{self.name} AS {self.view_text};"
6251

6352
def uc_create_sql(self, catalog):
6453
if self.kind == "VIEW":
6554
return self._sql_view(catalog)
66-
elif self.location is not None:
55+
elif self.object_type == "EXTERNAL":
6756
return self._sql_external(catalog)
6857
else:
6958
return self._sql_managed(catalog)
7059

60+
def sql_alter_to(self, catalog):
61+
return (
62+
f"ALTER {self.kind} {self.key} SET"
63+
f" TBLPROPERTIES ('upgraded_to' = '{catalog}.{self.database}.{self.name}');"
64+
)
65+
66+
def sql_alter_from(self, catalog):
67+
return (
68+
f"ALTER {self.kind} {catalog}.{self.database}.{self.name} SET"
69+
f" TBLPROPERTIES ('upgraded_from' = '{self.key}');"
70+
)
71+
7172

7273
class TablesCrawler(CrawlerBase):
7374
def __init__(self, backend: SqlBackend, schema):
@@ -143,3 +144,52 @@ def _describe(self, catalog: str, database: str, table: str) -> Table | None:
143144
except Exception as e:
144145
logger.error(f"Couldn't fetch information for table {full_name} : {e}")
145146
return None
147+
148+
149+
class TablesMigrate:
150+
def __init__(
151+
self,
152+
tc: TablesCrawler,
153+
ws: WorkspaceClient,
154+
backend: SqlBackend,
155+
inventory_database: str,
156+
default_catalog=None,
157+
database_to_catalog_mapping: dict[str, str] | None = None,
158+
):
159+
self._tc = tc
160+
self._backend = backend
161+
self._ws = ws
162+
self._inventory_database = inventory_database
163+
self._database_to_catalog_mapping = database_to_catalog_mapping
164+
self._seen_tables = {}
165+
self._default_catalog = self._init_default_catalog(default_catalog)
166+
167+
@staticmethod
168+
def _init_default_catalog(default_catalog):
169+
if default_catalog:
170+
return default_catalog
171+
else:
172+
return "ucx_default" # TODO : Fetch current workspace name and append it to the default catalog.
173+
174+
def migrate_tables(self):
175+
tasks = []
176+
for table in self._tc.snapshot():
177+
target_catalog = self._default_catalog
178+
if self._database_to_catalog_mapping:
179+
target_catalog = self._database_to_catalog_mapping[table.database]
180+
tasks.append(partial(self._migrate_table, target_catalog, table))
181+
ThreadedExecution.gather("migrate tables", tasks)
182+
183+
def _migrate_table(self, target_catalog, table):
184+
try:
185+
sql = table.uc_create_sql(target_catalog)
186+
logger.debug(f"Migrating table {table.key} to using SQL query: {sql}")
187+
188+
if table.object_type == "MANAGED":
189+
self._backend.execute(sql)
190+
self._backend.execute(table.sql_alter_to(target_catalog))
191+
self._backend.execute(table.sql_alter_from(target_catalog))
192+
else:
193+
logger.info(f"Table {table.key} is a {table.object_type} and is not supported for migration yet ")
194+
except Exception as e:
195+
logger.error(f"Could not create table {table.name} because: {e}")

tests/integration/conftest.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,17 @@ def test_catalog_fixture(make_catalog):
8282

8383
@pytest.fixture
def make_schema(sql_exec, make_random):
    """Factory fixture that creates schemas and drops them again on teardown.

    The returned callable accepts keyword-only arguments:
      catalog: catalog to create the schema in (default ``"hive_metastore"``).
      schema_name: explicit schema name; when omitted a random, lower-cased
        ``ucx_S<random>`` name is generated.
      schema: alias of ``schema_name``, kept for callers using the older keyword.

    It returns the fully qualified ``<catalog>.<schema>`` name.
    """

    def create(*, catalog="hive_metastore", schema=None, schema_name=None):
        # Both keywords default to None so the fixture can be called without a
        # name; `schema_name` wins when both are supplied.
        requested = schema_name if schema_name is not None else schema
        if requested is None:
            full_name = f"{catalog}.ucx_S{make_random(4)}".lower()
        else:
            full_name = f"{catalog}.{requested}"
        sql_exec(f"CREATE SCHEMA {full_name}")
        return full_name

    yield from factory(  # noqa: F405
        "schema", create, lambda schema_name: sql_exec(f"DROP SCHEMA IF EXISTS {schema_name} CASCADE")
    )
9196

9297

9398
def test_schema_fixture(make_schema):
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import logging
2+
import os
3+
4+
import pytest
5+
6+
from databricks.labs.ucx.framework.crawlers import StatementExecutionBackend
7+
from databricks.labs.ucx.hive_metastore import TablesCrawler
8+
from databricks.labs.ucx.hive_metastore.tables import TablesMigrate
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
def test_migrate_managed_tables(ws, make_catalog, make_schema, make_table):
    """A managed hive_metastore table is cloned into the target catalog and tagged ``upgraded_from``."""
    target_catalog = make_catalog()
    schema_a = make_schema(catalog="hive_metastore")
    _, target_schema = schema_a.split(".")

    make_schema(catalog=target_catalog, schema_name=target_schema)

    managed_table = make_table(schema=schema_a)

    logger.info(f"target catalog={target_catalog}, managed_table={managed_table}")

    inventory_schema = make_schema(catalog="hive_metastore")
    _, inventory_schema = inventory_schema.split(".")

    backend = StatementExecutionBackend(ws, os.environ["TEST_DEFAULT_WAREHOUSE_ID"])
    crawler = TablesCrawler(backend, inventory_schema)
    # TablesMigrate takes inventory_database as its fourth positional argument
    # and the default catalog after it; pass the catalog by keyword so the two
    # cannot be swapped.
    tm = TablesMigrate(crawler, ws, backend, inventory_schema, default_catalog=target_catalog)
    tm.migrate_tables()

    target_tables = list(backend.fetch(f"SHOW TABLES IN {target_catalog}.{target_schema}"))
    assert len(target_tables) == 1

    _, _, managed_table_name = managed_table.split(".")
    target_table_properties = ws.tables.get(f"{target_catalog}.{target_schema}.{managed_table_name}").properties

    assert target_table_properties["upgraded_from"] == managed_table
39+
40+
41+
@pytest.mark.skip(reason="Needs Storage credential + External Location in place")
def test_migrate_external_table(ws, make_catalog, make_schema, make_table):
    """An external hive_metastore table should end up as one table in the target schema."""
    target_catalog = make_catalog()
    schema_a = make_schema(catalog="hive_metastore")
    _, target_schema = schema_a.split(".")

    make_schema(catalog=target_catalog, schema_name=target_schema)

    external_table = make_table(schema=schema_a, external=True)

    logger.info(f"target catalog={target_catalog}, external_table={external_table} ")

    inventory_schema = make_schema(catalog="hive_metastore")
    _, inventory_schema = inventory_schema.split(".")

    backend = StatementExecutionBackend(ws, os.environ["TEST_DEFAULT_WAREHOUSE_ID"])
    crawler = TablesCrawler(backend, inventory_schema)
    # TablesMigrate takes inventory_database as its fourth positional argument
    # and the default catalog after it; pass the catalog by keyword so the two
    # cannot be swapped.
    tm = TablesMigrate(crawler, ws, backend, inventory_schema, default_catalog=target_catalog)
    tm.migrate_tables()

    target_tables = list(backend.fetch(f"SHOW TABLES IN {target_catalog}.{target_schema}"))
    assert len(target_tables) == 1
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import logging
2+
from unittest.mock import MagicMock
3+
4+
import pytest
5+
6+
from databricks.labs.ucx.hive_metastore.tables import TablesCrawler, TablesMigrate
7+
8+
from ..framework.mocks import MockBackend
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
def test_migrate_managed_tables_should_produce_proper_queries():
14+
errors = {}
15+
rows = {
16+
"SELECT": [
17+
(
18+
"hive_metastore",
19+
"db1",
20+
"managed",
21+
"MANAGED",
22+
"DELTA",
23+
None,
24+
None,
25+
),
26+
]
27+
}
28+
backend = MockBackend(fails_on_first=errors, rows=rows)
29+
tc = TablesCrawler(backend, "inventory_database")
30+
client = MagicMock()
31+
tm = TablesMigrate(tc, client, backend, "")
32+
tm.migrate_tables()
33+
34+
assert (list(backend.queries)) == [
35+
"SELECT * FROM hive_metastore.inventory_database.tables",
36+
"CREATE TABLE IF NOT EXISTS ucx_default.db1.managed DEEP CLONE hive_metastore.db1.managed;",
37+
"ALTER TABLE hive_metastore.db1.managed SET TBLPROPERTIES ('upgraded_to' = 'ucx_default.db1.managed');",
38+
"ALTER TABLE ucx_default.db1.managed SET TBLPROPERTIES ('upgraded_from' = 'hive_metastore.db1.managed');",
39+
]
40+
41+
42+
@pytest.mark.skip(reason="Not implemented yet")
43+
def test_migrate_managed_tables_should_do_nothing_if_upgrade_tag_is_present():
44+
errors = {}
45+
rows = {
46+
"SELECT": [
47+
("hive_metastore", "db1", "managed", "MANAGED", "DELTA", None, None, "[upgraded_to=target]"),
48+
]
49+
}
50+
backend = MockBackend(fails_on_first=errors, rows=rows)
51+
tc = TablesCrawler(backend, "inventory_database")
52+
client = MagicMock()
53+
tm = TablesMigrate(tc, client, backend, "")
54+
tm.migrate_tables()
55+
56+
assert (list(backend.queries)) == ["SELECT * FROM hive_metastore.inventory_database.tables"]
57+
58+
59+
def test_migrate_tables_should_migrate_tables_to_default_catalog_if_not_found_in_mapping():
60+
errors = {}
61+
rows = {
62+
"SELECT": [
63+
("hive_metastore", "db1", "managed", "MANAGED", "DELTA", None, None),
64+
]
65+
}
66+
backend = MockBackend(fails_on_first=errors, rows=rows)
67+
tc = TablesCrawler(backend, "inventory_database")
68+
client = MagicMock()
69+
database_to_catalog_mapping = {"db1": "catalog_1", "db2": "catalog_2"}
70+
tm = TablesMigrate(tc, client, backend, "", database_to_catalog_mapping=database_to_catalog_mapping)
71+
tm.migrate_tables()
72+
73+
assert (list(backend.queries)) == [
74+
"SELECT * FROM hive_metastore.inventory_database.tables",
75+
"CREATE TABLE IF NOT EXISTS catalog_1.db1.managed DEEP CLONE hive_metastore.db1.managed;",
76+
"ALTER TABLE hive_metastore.db1.managed SET TBLPROPERTIES ('upgraded_to' = 'catalog_1.db1.managed');",
77+
"ALTER TABLE catalog_1.db1.managed SET TBLPROPERTIES ('upgraded_from' = 'hive_metastore.db1.managed');",
78+
]
79+
80+
81+
def test_migrate_tables_should_migrate_tables_to_default_catalog_if_specified():
82+
errors = {}
83+
rows = {
84+
"SELECT": [
85+
("hive_metastore", "db1", "managed", "MANAGED", "DELTA", None, None),
86+
]
87+
}
88+
backend = MockBackend(fails_on_first=errors, rows=rows)
89+
tc = TablesCrawler(backend, "inventory_database")
90+
client = MagicMock()
91+
tm = TablesMigrate(tc, client, backend, "", default_catalog="test_catalog")
92+
tm.migrate_tables()
93+
94+
assert (list(backend.queries)) == [
95+
"SELECT * FROM hive_metastore.inventory_database.tables",
96+
"CREATE TABLE IF NOT EXISTS test_catalog.db1.managed DEEP CLONE hive_metastore.db1.managed;",
97+
"ALTER TABLE hive_metastore.db1.managed SET TBLPROPERTIES ('upgraded_to' = 'test_catalog.db1.managed');",
98+
"ALTER TABLE test_catalog.db1.managed SET TBLPROPERTIES ('upgraded_from' = 'hive_metastore.db1.managed');",
99+
]

tests/unit/hive_metastore/test_tables.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,7 @@ def test_sql_managed_non_delta():
4949
[
5050
(
5151
Table(catalog="catalog", database="db", name="managed_table", object_type="..", table_format="DELTA"),
52-
"CREATE TABLE IF NOT EXISTS new_catalog.db.managed_table DEEP CLONE "
53-
"catalog.db.managed_table;ALTER TABLE catalog.db.managed_table SET "
54-
"TBLPROPERTIES ('upgraded_to' = 'new_catalog.db.managed_table');",
52+
"CREATE TABLE IF NOT EXISTS new_catalog.db.managed_table DEEP CLONE catalog.db.managed_table;",
5553
),
5654
(
5755
Table(
@@ -69,14 +67,11 @@ def test_sql_managed_non_delta():
6967
catalog="catalog",
7068
database="db",
7169
name="external_table",
72-
object_type="..",
70+
object_type="EXTERNAL",
7371
table_format="DELTA",
7472
location="s3a://foo/bar",
7573
),
76-
"CREATE TABLE IF NOT EXISTS new_catalog.db.external_table LIKE "
77-
"catalog.db.external_table COPY LOCATION;ALTER TABLE "
78-
"catalog.db.external_table SET TBLPROPERTIES ('upgraded_to' = "
79-
"'new_catalog.db.external_table');",
74+
"SYNC TABLE new_catalog.db.external_table FROM catalog.db.external_table;",
8075
),
8176
],
8277
)

tests/unit/test_install.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ def not_found(_):
4949

5050
ws.workspace.upload.assert_called_with(
5151
"/Users/[email protected]/.ucx/config.yml",
52-
b"""groups:
52+
b"""default_catalog: ucx_default
53+
groups:
5354
backup_group_prefix: '42'
5455
selected:
5556
- '42'
@@ -106,7 +107,8 @@ def mock_question(text: str, *, default: str | None = None) -> str:
106107

107108
ws.workspace.upload.assert_called_with(
108109
"/Users/[email protected]/.ucx/config.yml",
109-
b"""groups:
110+
b"""default_catalog: ucx_default
111+
groups:
110112
auto: true
111113
backup_group_prefix: '42'
112114
inventory_database: '42'
@@ -146,7 +148,8 @@ def mock_question(text: str, *, default: str | None = None) -> str:
146148

147149
ws.workspace.upload.assert_called_with(
148150
"/Users/[email protected]/.ucx/config.yml",
149-
b"""groups:
151+
b"""default_catalog: ucx_default
152+
groups:
150153
backup_group_prefix: '42'
151154
selected:
152155
- g1

0 commit comments

Comments
 (0)