Commit fedf569

Added "what" property for migration to scope down table migrations (#856)
## Changes

Adds a `What` classification to `Table` and a keyword-only `what` filter to `TablesMigrate.migrate_tables`, so a migration run can be scoped down to a single kind of table.

### Linked issues

Related to #333.

Resolves #..

### Functionality

- [ ] added relevant user documentation
- [ ] added new CLI command
- [ ] modified existing command: `databricks labs ucx ...`
- [ ] added a new workflow
- [ ] modified existing workflow: `...`
- [ ] added a new table
- [ ] modified existing table: `...`

### Tests

- [ ] manually tested
- [ ] added unit tests
- [ ] added integration tests
- [ ] verified on staging environment (screenshot attached)
1 parent ee67586 commit fedf569

File tree: 5 files changed (+205 additions, -56 deletions)

src/databricks/labs/ucx/hive_metastore/table_migrate.py

Lines changed: 8 additions & 6 deletions
@@ -15,12 +15,13 @@
 from databricks.labs.ucx.framework.crawlers import SqlBackend
 from databricks.labs.ucx.hive_metastore import TablesCrawler
 from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping
-from databricks.labs.ucx.hive_metastore.tables import MigrationCount, Table
+from databricks.labs.ucx.hive_metastore.tables import MigrationCount, Table, What

 logger = logging.getLogger(__name__)


 class TablesMigrate:
+
     def __init__(
         self,
         tc: TablesCrawler,
@@ -34,23 +35,24 @@ def __init__(
         self._tm = tm
         self._seen_tables: dict[str, str] = {}

-    def migrate_tables(self):
+    def migrate_tables(self, *, what: What | None = None):
         self._init_seen_tables()
         tables_to_migrate = self._tm.get_tables_to_migrate(self._tc)
         tasks = []
         for table in tables_to_migrate:
-            tasks.append(partial(self._migrate_table, table.src, table.rule))
+            if not what or table.src.what == what:
+                tasks.append(partial(self._migrate_table, table.src, table.rule))
         Threads.strict("migrate tables", tasks)

     def _migrate_table(self, src_table: Table, rule: Rule):
         if self._table_already_upgraded(rule.as_uc_table_key):
             logger.info(f"Table {src_table.key} already upgraded to {rule.as_uc_table_key}")
             return True
-        if src_table.kind == "TABLE" and src_table.table_format == "DELTA" and src_table.is_dbfs_root:
+        if src_table.what == What.DBFS_ROOT_DELTA:
             return self._migrate_dbfs_root_table(src_table, rule)
-        if src_table.kind == "TABLE" and src_table.is_format_supported_for_sync:
+        if src_table.what == What.EXTERNAL_SYNC:
             return self._migrate_external_table(src_table, rule)
-        if src_table.kind == "VIEW":
+        if src_table.what == What.VIEW:
             return self._migrate_view(src_table, rule)
         logger.info(f"Table {src_table.key} is not supported for migration")
         return True
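For context, a minimal sketch of how a caller could scope a migration run with the new keyword-only `what` argument. It mirrors the constructor order used in the unit tests below; `backend`, `ws` and `table_mapping` stand in for an already configured `SqlBackend`, `WorkspaceClient` and `TableMapping`, and the inventory schema name is a placeholder.

```python
from databricks.labs.ucx.hive_metastore import TablesCrawler
from databricks.labs.ucx.hive_metastore.table_migrate import TablesMigrate
from databricks.labs.ucx.hive_metastore.tables import What

# `backend`, `ws` and `table_mapping` are assumed to exist already
# (SqlBackend, WorkspaceClient and TableMapping); "inventory" is a placeholder schema.
table_crawler = TablesCrawler(backend, "inventory")
table_migrate = TablesMigrate(table_crawler, ws, backend, table_mapping)

# Queue only external tables that can be upgraded with SYNC; calling
# migrate_tables() without `what` keeps the previous behaviour and migrates everything.
table_migrate.migrate_tables(what=What.EXTERNAL_SYNC)
```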

src/databricks/labs/ucx/hive_metastore/tables.py

Lines changed: 27 additions & 0 deletions
@@ -3,6 +3,7 @@
 import typing
 from collections.abc import Iterable, Iterator
 from dataclasses import dataclass
+from enum import Enum, auto
 from functools import partial

 from databricks.labs.blueprint.parallel import Threads
@@ -14,6 +15,16 @@
 logger = logging.getLogger(__name__)


+class What(Enum):
+    EXTERNAL_SYNC = auto()
+    EXTERNAL_NO_SYNC = auto()
+    DBFS_ROOT_DELTA = auto()
+    DBFS_ROOT_NON_DELTA = auto()
+    VIEW = auto()
+    DB_DATASET = auto()
+    UNKNOWN = auto()
+
+
 @dataclass
 class Table:
     catalog: str
@@ -96,6 +107,22 @@ def is_databricks_dataset(self) -> bool:
                 return True
         return False

+    @property
+    def what(self) -> What:
+        if self.is_databricks_dataset:
+            return What.DB_DATASET
+        if self.is_dbfs_root and self.table_format == "DELTA":
+            return What.DBFS_ROOT_DELTA
+        if self.is_dbfs_root:
+            return What.DBFS_ROOT_NON_DELTA
+        if self.kind == "TABLE" and self.is_format_supported_for_sync:
+            return What.EXTERNAL_SYNC
+        if self.kind == "TABLE":
+            return What.EXTERNAL_NO_SYNC
+        if self.kind == "VIEW":
+            return What.VIEW
+        return What.UNKNOWN
+
     def sql_migrate_external(self, target_table_key):
         return f"SYNC TABLE {escape_sql_identifier(target_table_key)} FROM {escape_sql_identifier(self.key)};"
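To make the new classification concrete, a small sketch that reuses the positional `Table` values from the parametrized unit tests further down; the formats and locations come straight from those tests, while the catalog, schema and table names are made up for illustration.

```python
from databricks.labs.ucx.hive_metastore.tables import Table, What

external_delta = Table("hive_metastore", "sales", "orders", "EXTERNAL", "DELTA", location="s3://external_location/table")
dbfs_parquet = Table("hive_metastore", "sales", "events", "MANAGED", "PARQUET", location="dbfs:/somelocation/tablename")
view = Table("hive_metastore", "sales", "orders_v", "VIEW", "VIEW", view_text="select * from some_table")

assert external_delta.what == What.EXTERNAL_SYNC      # Delta is a format SYNC supports
assert dbfs_parquet.what == What.DBFS_ROOT_NON_DELTA  # on the DBFS root, but not Delta
assert view.what == What.VIEW
```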

tests/integration/hive_metastore/test_tables.py

Lines changed: 6 additions & 0 deletions
@@ -5,6 +5,7 @@
 from databricks.sdk.retries import retried

 from databricks.labs.ucx.hive_metastore import TablesCrawler
+from databricks.labs.ucx.hive_metastore.tables import What

 logger = logging.getLogger(__name__)

@@ -38,8 +39,13 @@ def test_describe_all_tables_in_databases(ws, sql_backend, inventory_schema, mak

     assert len(all_tables) >= 5
     assert all_tables[non_delta.full_name].table_format == "JSON"
+    assert all_tables[non_delta.full_name].what == What.DB_DATASET
     assert all_tables[managed_table.full_name].object_type == "MANAGED"
+    assert all_tables[managed_table.full_name].what == What.DBFS_ROOT_DELTA
     assert all_tables[tmp_table.full_name].object_type == "MANAGED"
+    assert all_tables[tmp_table.full_name].what == What.DBFS_ROOT_DELTA
     assert all_tables[external_table.full_name].object_type == "EXTERNAL"
+    assert all_tables[external_table.full_name].what == What.EXTERNAL_NO_SYNC
     assert all_tables[view.full_name].object_type == "VIEW"
     assert all_tables[view.full_name].view_text == "SELECT 2+2 AS four"
+    assert all_tables[view.full_name].what == What.VIEW

tests/unit/hive_metastore/test_table_migrate.py

Lines changed: 72 additions & 0 deletions
@@ -16,6 +16,7 @@
     MigrationCount,
     Table,
     TablesCrawler,
+    What,
 )

 from ..framework.mocks import MockBackend
@@ -66,6 +67,25 @@ def test_migrate_dbfs_root_tables_should_produce_proper_queries():
     )


+def test_migrate_dbfs_root_tables_should_be_skipped_when_upgrading_external():
+    errors = {}
+    rows = {}
+    backend = MockBackend(fails_on_first=errors, rows=rows)
+    table_crawler = TablesCrawler(backend, "inventory_database")
+    client = MagicMock()
+    table_mapping = create_autospec(TableMapping)
+    table_mapping.get_tables_to_migrate.return_value = [
+        TableToMigrate(
+            Table("hive_metastore", "db1_src", "managed_dbfs", "MANAGED", "DELTA", "dbfs:/some_location"),
+            Rule("workspace", "ucx_default", "db1_src", "db1_dst", "managed_dbfs", "managed_dbfs"),
+        ),
+    ]
+    table_migrate = TablesMigrate(table_crawler, client, backend, table_mapping)
+    table_migrate.migrate_tables(what=What.EXTERNAL_SYNC)
+
+    assert len(backend.queries) == 0
+
+
 def test_migrate_external_tables_should_produce_proper_queries():
     errors = {}
     rows = {}
@@ -87,6 +107,58 @@ def test_migrate_external_tables_should_produce_proper_queries():
     ]


+def test_migrate_already_upgraded_table_should_produce_no_queries():
+    errors = {}
+    rows = {}
+    backend = MockBackend(fails_on_first=errors, rows=rows)
+    table_crawler = TablesCrawler(backend, "inventory_database")
+    client = create_autospec(WorkspaceClient)
+    client.catalogs.list.return_value = [CatalogInfo(name="cat1")]
+    client.schemas.list.return_value = [
+        SchemaInfo(catalog_name="cat1", name="test_schema1"),
+    ]
+    client.tables.list.return_value = [
+        TableInfo(
+            catalog_name="cat1",
+            schema_name="schema1",
+            name="dest1",
+            full_name="cat1.schema1.dest1",
+            properties={"upgraded_from": "hive_metastore.db1_src.external_src"},
+        ),
+    ]
+
+    table_mapping = create_autospec(TableMapping)
+    table_mapping.get_tables_to_migrate.return_value = [
+        TableToMigrate(
+            Table("hive_metastore", "db1_src", "external_src", "EXTERNAL", "DELTA"),
+            Rule("workspace", "cat1", "db1_src", "schema1", "external_src", "dest1"),
+        )
+    ]
+    table_migrate = TablesMigrate(table_crawler, client, backend, table_mapping)
+    table_migrate.migrate_tables()
+
+    assert len(backend.queries) == 0
+
+
+def test_migrate_unsupported_format_table_should_produce_no_queries():
+    errors = {}
+    rows = {}
+    backend = MockBackend(fails_on_first=errors, rows=rows)
+    table_crawler = TablesCrawler(backend, "inventory_database")
+    client = create_autospec(WorkspaceClient)
+    table_mapping = create_autospec(TableMapping)
+    table_mapping.get_tables_to_migrate.return_value = [
+        TableToMigrate(
+            Table("hive_metastore", "db1_src", "external_src", "EXTERNAL", "UNSUPPORTED_FORMAT"),
+            Rule("workspace", "cat1", "db1_src", "schema1", "external_src", "dest1"),
+        )
+    ]
+    table_migrate = TablesMigrate(table_crawler, client, backend, table_mapping)
+    table_migrate.migrate_tables()
+
+    assert len(backend.queries) == 0
+
+
 def test_migrate_view_should_produce_proper_queries():
     errors = {}
     rows = {}

tests/unit/hive_metastore/test_tables.py

Lines changed: 92 additions & 50 deletions
@@ -1,6 +1,6 @@
 import pytest

-from databricks.labs.ucx.hive_metastore.tables import Table, TablesCrawler
+from databricks.labs.ucx.hive_metastore.tables import Table, TablesCrawler, What

 from ..framework.mocks import MockBackend

@@ -136,52 +136,94 @@ def test_tables_returning_error_when_describing():
     assert len(results) == 1


-def test_is_dbfs_root():
-    assert Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/somelocation/tablename").is_dbfs_root
-    assert Table("a", "b", "c", "MANAGED", "DELTA", location="/dbfs/somelocation/tablename").is_dbfs_root
-    assert not Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/mnt/somelocation/tablename").is_dbfs_root
-    assert not Table("a", "b", "c", "MANAGED", "DELTA", location="/dbfs/mnt/somelocation/tablename").is_dbfs_root
-    assert not Table(
-        "a", "b", "c", "MANAGED", "DELTA", location="dbfs:/databricks-datasets/somelocation/tablename"
-    ).is_dbfs_root
-    assert not Table(
-        "a", "b", "c", "MANAGED", "DELTA", location="/dbfs/databricks-datasets/somelocation/tablename"
-    ).is_dbfs_root
-    assert not Table("a", "b", "c", "MANAGED", "DELTA", location="s3:/somelocation/tablename").is_dbfs_root
-    assert not Table("a", "b", "c", "MANAGED", "DELTA", location="adls:/somelocation/tablename").is_dbfs_root
-
-
-def test_is_db_dataset():
-    assert not Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/somelocation/tablename").is_databricks_dataset
-    assert not Table("a", "b", "c", "MANAGED", "DELTA", location="/dbfs/somelocation/tablename").is_databricks_dataset
-    assert not Table(
-        "a", "b", "c", "MANAGED", "DELTA", location="dbfs:/mnt/somelocation/tablename"
-    ).is_databricks_dataset
-    assert not Table(
-        "a", "b", "c", "MANAGED", "DELTA", location="/dbfs/mnt/somelocation/tablename"
-    ).is_databricks_dataset
-    assert Table(
-        "a", "b", "c", "MANAGED", "DELTA", location="dbfs:/databricks-datasets/somelocation/tablename"
-    ).is_databricks_dataset
-    assert Table(
-        "a", "b", "c", "MANAGED", "DELTA", location="/dbfs/databricks-datasets/somelocation/tablename"
-    ).is_databricks_dataset
-    assert not Table("a", "b", "c", "MANAGED", "DELTA", location="s3:/somelocation/tablename").is_databricks_dataset
-    assert not Table("a", "b", "c", "MANAGED", "DELTA", location="adls:/somelocation/tablename").is_databricks_dataset
-
-
-def test_is_supported_for_sync():
-    assert Table(
-        "a", "b", "c", "EXTERNAL", "DELTA", location="dbfs:/somelocation/tablename"
-    ).is_format_supported_for_sync
-    assert Table("a", "b", "c", "EXTERNAL", "CSV", location="dbfs:/somelocation/tablename").is_format_supported_for_sync
-    assert Table(
-        "a", "b", "c", "EXTERNAL", "TEXT", location="dbfs:/somelocation/tablename"
-    ).is_format_supported_for_sync
-    assert Table("a", "b", "c", "EXTERNAL", "ORC", location="dbfs:/somelocation/tablename").is_format_supported_for_sync
-    assert Table(
-        "a", "b", "c", "EXTERNAL", "JSON", location="dbfs:/somelocation/tablename"
-    ).is_format_supported_for_sync
-    assert not (
-        Table("a", "b", "c", "EXTERNAL", "AVRO", location="dbfs:/somelocation/tablename").is_format_supported_for_sync
-    )
+@pytest.mark.parametrize(
+    'table,dbfs_root,what',
+    [
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/somelocation/tablename"), True, What.DBFS_ROOT_DELTA),
+        (
+            Table("a", "b", "c", "MANAGED", "PARQUET", location="dbfs:/somelocation/tablename"),
+            True,
+            What.DBFS_ROOT_NON_DELTA,
+        ),
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="/dbfs/somelocation/tablename"), True, What.DBFS_ROOT_DELTA),
+        (
+            Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/mnt/somelocation/tablename"),
+            False,
+            What.EXTERNAL_SYNC,
+        ),
+        (
+            Table("a", "b", "c", "MANAGED", "DELTA", location="/dbfs/mnt/somelocation/tablename"),
+            False,
+            What.EXTERNAL_SYNC,
+        ),
+        (
+            Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/databricks-datasets/somelocation/tablename"),
+            False,
+            What.DB_DATASET,
+        ),
+        (
+            Table("a", "b", "c", "MANAGED", "DELTA", location="/dbfs/databricks-datasets/somelocation/tablename"),
+            False,
+            What.DB_DATASET,
+        ),
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="s3:/somelocation/tablename"), False, What.EXTERNAL_SYNC),
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="adls:/somelocation/tablename"), False, What.EXTERNAL_SYNC),
+    ],
+)
+def test_is_dbfs_root(table, dbfs_root, what):
+    assert table.is_dbfs_root == dbfs_root
+    assert table.what == what
+
+
+@pytest.mark.parametrize(
+    'table,db_dataset',
+    [
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/somelocation/tablename"), False),
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="/dbfs/somelocation/tablename"), False),
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/mnt/somelocation/tablename"), False),
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="/dbfs/mnt/somelocation/tablename"), False),
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/databricks-datasets/somelocation/tablename"), True),
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="/dbfs/databricks-datasets/somelocation/tablename"), True),
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="s3:/somelocation/tablename"), False),
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="adls:/somelocation/tablename"), False),
+    ],
+)
+def test_is_db_dataset(table, db_dataset):
+    assert table.is_databricks_dataset == db_dataset
+    assert (table.what == What.DB_DATASET) == db_dataset
+
+
+@pytest.mark.parametrize(
+    'table,supported',
+    [
+        (Table("a", "b", "c", "EXTERNAL", "DELTA", location="dbfs:/somelocation/tablename"), True),
+        (Table("a", "b", "c", "EXTERNAL", "CSV", location="dbfs:/somelocation/tablename"), True),
+        (Table("a", "b", "c", "EXTERNAL", "TEXT", location="dbfs:/somelocation/tablename"), True),
+        (Table("a", "b", "c", "EXTERNAL", "ORC", location="dbfs:/somelocation/tablename"), True),
+        (Table("a", "b", "c", "EXTERNAL", "JSON", location="dbfs:/somelocation/tablename"), True),
+        (Table("a", "b", "c", "EXTERNAL", "AVRO", location="dbfs:/somelocation/tablename"), False),
+    ],
+)
+def test_is_supported_for_sync(table, supported):
+    assert table.is_format_supported_for_sync == supported
+
+
+@pytest.mark.parametrize(
+    'table,what',
+    [
+        (Table("a", "b", "c", "EXTERNAL", "DELTA", location="s3://external_location/table"), What.EXTERNAL_SYNC),
+        (
+            Table("a", "b", "c", "EXTERNAL", "UNSUPPORTED_FORMAT", location="s3://external_location/table"),
+            What.EXTERNAL_NO_SYNC,
+        ),
+        (Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/somelocation/tablename"), What.DBFS_ROOT_DELTA),
+        (Table("a", "b", "c", "MANAGED", "PARQUET", location="dbfs:/somelocation/tablename"), What.DBFS_ROOT_NON_DELTA),
+        (Table("a", "b", "c", "VIEW", "VIEW", view_text="select * from some_table"), What.VIEW),
+        (
+            Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/databricks-datasets/somelocation/tablename"),
+            What.DB_DATASET,
+        ),
+    ],
+)
+def test_table_what(table, what):
+    assert table.what == what
