Skip to content

Commit bccc103

Browse files
authored
Track UsedTables on TableProgressEncoder (#3373)
<!-- REMOVE IRRELEVANT COMMENTS BEFORE CREATING A PULL REQUEST --> ## Changes Track `UsedTables` on `TableProgressEncoder` ### Linked issues Resolves #3061 ### Functionality - [x] modified existing workflow: `migration-progress-experimental` ### Tests - [ ] manually tested - [x] added unit tests - [x] added integration tests
1 parent 780008e commit bccc103

File tree

6 files changed

+153
-35
lines changed

6 files changed

+153
-35
lines changed

src/databricks/labs/ucx/contexts/workflow_task.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ def tables_progress(self) -> ProgressEncoder[Table]:
226226
self.sql_backend,
227227
self.table_ownership,
228228
self.migration_status_refresher,
229+
[self.used_tables_crawler_for_paths, self.used_tables_crawler_for_queries],
229230
self.parent_run_id,
230231
self.workspace_id,
231232
self.config.ucx_catalog,
Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from collections import defaultdict
23
from collections.abc import Iterable
34
from dataclasses import replace
45

@@ -11,29 +12,25 @@
1112
from databricks.labs.ucx.hive_metastore.ownership import TableOwnership
1213
from databricks.labs.ucx.progress.history import ProgressEncoder
1314
from databricks.labs.ucx.progress.install import Historical
15+
from databricks.labs.ucx.source_code.base import UsedTable
16+
from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
1417

1518

1619
logger = logging.getLogger(__name__)
1720

1821

1922
class TableProgressEncoder(ProgressEncoder[Table]):
20-
"""Encoder class:Table to class:History.
21-
22-
A progress failure for a table means:
23-
- the table is not migrated yet
24-
- the associated grants have a failure
25-
"""
23+
"""Encoder class:Table to class:History."""
2624

2725
def __init__(
2826
self,
2927
sql_backend: SqlBackend,
3028
ownership: TableOwnership,
3129
migration_status_refresher: CrawlerBase[TableMigrationStatus],
30+
used_tables_crawlers: list[UsedTablesCrawler],
3231
run_id: int,
3332
workspace_id: int,
3433
catalog: str,
35-
schema: str = "multiworkspace",
36-
table: str = "historical",
3734
) -> None:
3835
super().__init__(
3936
sql_backend,
@@ -42,27 +39,48 @@ def __init__(
4239
run_id,
4340
workspace_id,
4441
catalog,
45-
schema,
46-
table,
42+
"multiworkspace",
43+
"historical",
4744
)
4845
self._migration_status_refresher = migration_status_refresher
46+
self._used_tables_crawlers = used_tables_crawlers
4947

5048
def append_inventory_snapshot(self, snapshot: Iterable[Table]) -> None:
    """Persist one historical record per table in the snapshot.

    Each record is enriched with its migration status and with any code
    references to the table before being written to the history table.

    Args:
        snapshot: The tables to encode and append to the history.
    """
    migration_index = TableMigrationIndex(self._migration_status_refresher.snapshot())
    used_hive_tables = self._get_used_hive_tables()
    # Build the records with a comprehension instead of a manual append loop (PERF401).
    history_records = [
        self._encode_table_as_historical(record, migration_index, used_hive_tables) for record in snapshot
    ]
    logger.debug(f"Appending {len(history_records)} {self._klass} table record(s) to history.")
    # The mode is 'append'. This is documented as conflict-free.
    self._sql_backend.save_table(escape_sql_identifier(self.full_name), history_records, Historical, mode="append")
5658

57-
def _encode_table_as_historical(self, record: Table, migration_index: TableMigrationIndex) -> Historical:
58-
"""Encode a table record, enriching with the migration status.
59+
def _get_used_hive_tables(self) -> dict[str, list[UsedTable]]:
    """Index used-table code references on Hive tables by fully qualified name.

    Only references whose catalog is `hive_metastore` are relevant for
    migration progress; references to other catalogs are skipped.

    Returns:
        A mapping of `catalog.schema.table` to the code references for that table.
    """
    # Plain `list` as the default factory: subscripting it (`list[UsedTable]`) has
    # no runtime effect — the annotation on `used_tables` already carries the type.
    used_tables: dict[str, list[UsedTable]] = defaultdict(list)
    for crawler in self._used_tables_crawlers:
        for used_table in crawler.snapshot():
            if used_table.catalog_name == "hive_metastore":
                used_tables[used_table.full_name].append(used_table)
    return used_tables
66+
67+
def _encode_table_as_historical(
    self, record: Table, migration_index: TableMigrationIndex, used_hive_tables: dict[str, list[UsedTable]]
) -> Historical:
    """Encode a table record, enriching with the migration status and used table references.

    Possible failures, the table is
    - Pending migration
    - A Hive table referenced by code

    Grants are purposefully left out, because a grant might not be mappable to UC, like `READ_METADATA`, thus
    possibly resulting in false "pending migration" failure for tables that are migrated to UC with their relevant
    grants also being migrated.
    """
    base = super()._encode_record_as_historical(record)
    extra_failures: list[str] = []
    if not migration_index.is_migrated(record.database, record.name):
        extra_failures.append("Pending migration")
    extra_failures.extend(
        f"Used by {reference.source_type}: {reference.source_id}"
        for reference in used_hive_tables.get(record.full_name, [])
    )
    return replace(base, failures=base.failures + extra_failures)

src/databricks/labs/ucx/source_code/base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,10 @@ def parse(cls, value: str, default_schema: str, is_read=True, is_write=False) ->
260260
catalog_name=catalog_name, schema_name=schema_name, table_name=parts[0], is_read=is_read, is_write=is_write
261261
)
262262

263+
@property
def full_name(self) -> str:
    """Fully qualified name in the form `catalog.schema.table`."""
    return f"{self.catalog_name}.{self.schema_name}.{self.table_name}"
266+
263267
catalog_name: str = SourceInfo.UNKNOWN
264268
schema_name: str = SourceInfo.UNKNOWN
265269
table_name: str = SourceInfo.UNKNOWN
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import datetime as dt
2+
3+
import pytest
4+
5+
from databricks.labs.ucx.framework.utils import escape_sql_identifier
6+
from databricks.labs.ucx.hive_metastore.tables import Table
7+
from databricks.labs.ucx.source_code.base import LineageAtom, UsedTable
8+
9+
10+
@pytest.mark.parametrize("is_migrated_table", [True, False])
@pytest.mark.parametrize("is_used_table", [True, False])
def test_table_progress_encoder_table_failures(
    runtime_ctx,
    az_cli_ctx,
    make_catalog,
    is_migrated_table: bool,
    is_used_table: bool,
) -> None:
    """Verify the recorded failures for each migration/usage combination of a Hive table."""
    expected_failures = []
    if not is_migrated_table:
        expected_failures.append("Pending migration")
    if is_used_table:
        expected_failures.append("Used by NOTEBOOK: test/test.py")

    az_cli_ctx.progress_tracking_installation.run()
    runtime_ctx = runtime_ctx.replace(
        parent_run_id=1,
        sql_backend=az_cli_ctx.sql_backend,
        ucx_catalog=az_cli_ctx.ucx_catalog,
    )
    # To set both the `upgraded_to` and `upgraded_from` table property values during table creation is not possible
    # The below works because the `upgraded_to` value is not used for matching, the property only needs to be present
    hive_tbl_properties = {"upgraded_to": "upgraded_to.name_does.not_matter"} if is_migrated_table else {}
    hive_table_info = runtime_ctx.make_table(tbl_properties=hive_tbl_properties)
    uc_tbl_properties = {"upgraded_from": hive_table_info.full_name} if is_migrated_table else {}
    runtime_ctx.make_table(catalog_name=make_catalog().name, tbl_properties=uc_tbl_properties)
    hive_used_table = UsedTable(
        catalog_name="hive_metastore" if is_used_table else "catalog",
        schema_name=hive_table_info.schema_name,
        table_name=hive_table_info.name,
        source_id="test/test.py",
        source_timestamp=dt.datetime.now(tz=dt.timezone.utc),
        source_lineage=[LineageAtom(object_type="NOTEBOOK", object_id="test/test.py")],
        assessment_start_timestamp=dt.datetime.now(tz=dt.timezone.utc),
        assessment_end_timestamp=dt.datetime.now(tz=dt.timezone.utc),
    )
    runtime_ctx.used_tables_crawler_for_paths.dump_all([hive_used_table])

    hive_table = Table(
        hive_table_info.catalog_name,
        hive_table_info.schema_name,
        hive_table_info.name,
        hive_table_info.table_type.value,
        hive_table_info.data_source_format.value,
    )
    runtime_ctx.tables_progress.append_inventory_snapshot([hive_table])

    history_table_name = escape_sql_identifier(runtime_ctx.tables_progress.full_name)
    records = list(runtime_ctx.sql_backend.fetch(f"SELECT * FROM {history_table_name}"))

    assert len(records) == 1, "Expected one historical entry"
    assert records[0].failures == expected_failures

tests/unit/progress/test_tables.py

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1+
import datetime as dt
12
from unittest.mock import create_autospec
23

3-
import pytest
4-
54
from databricks.labs.ucx.framework.owners import Ownership
65
from databricks.labs.ucx.framework.utils import escape_sql_identifier
76
from databricks.labs.ucx.hive_metastore.table_migration_status import (
@@ -10,23 +9,28 @@
109
)
1110
from databricks.labs.ucx.hive_metastore.tables import Table
1211
from databricks.labs.ucx.progress.tables import TableProgressEncoder
12+
from databricks.labs.ucx.source_code.base import LineageAtom, UsedTable
13+
from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
1314

1415

15-
@pytest.mark.parametrize(
16-
"table",
17-
[
18-
Table("hive_metastore", "schema", "table", "MANAGED", "DELTA"),
19-
],
20-
)
21-
def test_table_progress_encoder_no_failures(mock_backend, table: Table) -> None:
16+
def test_table_progress_encoder_no_failures(mock_backend) -> None:
17+
table = Table("hive_metastore", "schema", "table", "MANAGED", "DELTA")
2218
ownership = create_autospec(Ownership)
2319
ownership.owner_of.return_value = "user"
2420
migration_status_crawler = create_autospec(TableMigrationStatusRefresher)
2521
migration_status_crawler.snapshot.return_value = (
2622
TableMigrationStatus(table.database, table.name, "main", "default", table.name, update_ts=None),
2723
)
24+
used_tables_crawler = create_autospec(UsedTablesCrawler)
25+
used_tables_crawler.snapshot.return_value = []
2826
encoder = TableProgressEncoder(
29-
mock_backend, ownership, migration_status_crawler, run_id=1, workspace_id=123456789, catalog="test"
27+
mock_backend,
28+
ownership,
29+
migration_status_crawler,
30+
[used_tables_crawler],
31+
run_id=1,
32+
workspace_id=123456789,
33+
catalog="test",
3034
)
3135

3236
encoder.append_inventory_snapshot([table])
@@ -36,29 +40,44 @@ def test_table_progress_encoder_no_failures(mock_backend, table: Table) -> None:
3640
assert len(rows[0].failures) == 0
3741
ownership.owner_of.assert_called_once()
3842
migration_status_crawler.snapshot.assert_called_once()
43+
used_tables_crawler.snapshot.assert_called_once()
3944

4045

41-
@pytest.mark.parametrize(
42-
"table",
43-
[
44-
Table("hive_metastore", "schema", "table", "MANAGED", "DELTA"),
45-
],
46-
)
47-
def test_table_progress_encoder_pending_migration_failure(mock_backend, table: Table) -> None:
46+
def test_table_progress_encoder_pending_migration_failure(mock_backend) -> None:
    """A non-migrated Hive table that is referenced by code gets both failures recorded."""
    table = Table("hive_metastore", "schema", "table", "MANAGED", "DELTA")
    ownership = create_autospec(Ownership)
    ownership.owner_of.return_value = "user"
    migration_status_crawler = create_autospec(TableMigrationStatusRefresher)
    # No destination: therefore not yet migrated.
    migration_status_crawler.snapshot.return_value = (TableMigrationStatus(table.database, table.name),)
    code_reference = UsedTable(
        catalog_name=table.catalog,
        schema_name=table.database,
        table_name=table.name,
        source_id="test/test.py",
        source_timestamp=dt.datetime.now(tz=dt.timezone.utc),
        source_lineage=[LineageAtom(object_type="NOTEBOOK", object_id="test/test.py")],
        assessment_start_timestamp=dt.datetime.now(tz=dt.timezone.utc),
        assessment_end_timestamp=dt.datetime.now(tz=dt.timezone.utc),
    )
    used_tables_crawler_for_paths = create_autospec(UsedTablesCrawler)
    used_tables_crawler_for_paths.snapshot.return_value = [code_reference]
    encoder = TableProgressEncoder(
        mock_backend,
        ownership,
        migration_status_crawler,
        [used_tables_crawler_for_paths],
        run_id=1,
        workspace_id=123456789,
        catalog="test",
    )

    encoder.append_inventory_snapshot([table])

    rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append")
    assert len(rows) > 0, f"No rows written for: {encoder.full_name}"
    assert rows[0].failures == ["Pending migration", "Used by NOTEBOOK: test/test.py"]
    ownership.owner_of.assert_called_once()
    migration_status_crawler.snapshot.assert_called_once()
    used_tables_crawler_for_paths.snapshot.assert_called_once()

tests/unit/source_code/test_base.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
import dataclasses
22

3+
import pytest
4+
35
from databricks.labs.ucx.source_code.base import (
46
Advice,
57
Advisory,
68
Convention,
79
Deprecation,
810
Failure,
11+
UsedTable,
912
)
1013

1114

@@ -40,3 +43,14 @@ def test_deprecation_initialization() -> None:
4043
def test_convention_initialization() -> None:
4144
convention = Convention('code5', 'This is a convention', 1, 1, 2, 2)
4245
assert isinstance(convention, Advice)
46+
47+
48+
@pytest.mark.parametrize(
    "used_table, expected_name",
    [
        (UsedTable(), "unknown.unknown.unknown"),
        (UsedTable(catalog_name="catalog", schema_name="schema", table_name="table"), "catalog.schema.table"),
    ],
)
def test_used_table_full_name(used_table: UsedTable, expected_name: str) -> None:
    """The full name is the dot-joined catalog, schema, and table names."""
    actual = used_table.full_name
    assert actual == expected_name

0 commit comments

Comments
 (0)