1+ import datetime
12import logging
23from collections import defaultdict
4+ from collections .abc import Iterable
5+ from dataclasses import dataclass
36from functools import partial
47
58from databricks .labs .blueprint .installation import Installation
1518)
1619
1720from databricks .labs .ucx .config import WorkspaceConfig
21+ from databricks .labs .ucx .framework .crawlers import CrawlerBase
1822from databricks .labs .ucx .hive_metastore import TablesCrawler
1923from databricks .labs .ucx .hive_metastore .mapping import Rule , TableMapping
2024from databricks .labs .ucx .hive_metastore .tables import MigrationCount , Table , What
2125
2226logger = logging .getLogger (__name__ )
2327
2428
@dataclass
class MigrationStatus:
    """Migration state of one source table.

    Only the source schema and table are required. The destination fields and
    the timestamp default to ``None`` and are positional, so field order
    matters to callers that build an instance from a row.
    """

    # Schema (database) and name of the source table.
    src_schema: str
    src_table: str
    # Catalog, schema and table of the migration target; None by default.
    dst_catalog: str | None = None
    dst_schema: str | None = None
    dst_table: str | None = None
    # When the status was computed, kept as a string; None by default.
    update_ts: str | None = None
37+
38+
2539class TablesMigrate :
def __init__(
    self,
    tables_crawler: TablesCrawler,
    ws: WorkspaceClient,
    backend: SqlBackend,
    table_mapping: TableMapping,
    migration_status_refresher,
):
    """Keep references to the collaborators used during table migration.

    Args:
        tables_crawler: Crawler stored as ``self._tc``.
        ws: Workspace client stored as ``self._ws``.
        backend: SQL backend stored as ``self._backend``.
        table_mapping: Table mapping stored as ``self._tm``.
        migration_status_refresher: Object stored as
            ``self._migration_status_refresher``; ``is_upgraded`` and
            ``_init_seen_tables`` delegate to it.
    """
    # Workspace and SQL access.
    self._ws = ws
    self._backend = backend
    # Inventory helpers.
    self._tc = tables_crawler
    self._tm = table_mapping
    self._migration_status_refresher = migration_status_refresher
    # Starts empty; replaced wholesale by _init_seen_tables().
    self._seen_tables: dict[str, str] = {}
3854
3955 @classmethod
@@ -43,7 +59,8 @@ def for_cli(cls, ws: WorkspaceClient, product='ucx'):
4359 sql_backend = StatementExecutionBackend (ws , config .warehouse_id )
4460 table_crawler = TablesCrawler (sql_backend , config .inventory_database )
4561 table_mapping = TableMapping (installation , ws , sql_backend )
46- return cls (table_crawler , ws , sql_backend , table_mapping )
62+ migration_status_refresher = MigrationStatusRefresher (ws , sql_backend , config .inventory_database , table_crawler )
63+ return cls (table_crawler , ws , sql_backend , table_mapping , migration_status_refresher )
4764
4865 def migrate_tables (self , * , what : What | None = None ):
4966 self ._init_seen_tables ()
@@ -93,19 +110,6 @@ def _migrate_view(self, src_table: Table, rule: Rule):
93110 self ._backend .execute (src_table .sql_alter_from (rule .as_uc_table_key , self ._ws .get_workspace_id ()))
94111 return True
95112
96- def _iter_schemas (self ):
97- for catalog in self ._ws .catalogs .list ():
98- yield from self ._ws .schemas .list (catalog_name = catalog .name )
99-
100- def _init_seen_tables (self ):
101- for schema in self ._iter_schemas ():
102- for table in self ._ws .tables .list (catalog_name = schema .catalog_name , schema_name = schema .name ):
103- if table .properties is None :
104- continue
105- if "upgraded_from" not in table .properties :
106- continue
107- self ._seen_tables [table .full_name .lower ()] = table .properties ["upgraded_from" ].lower ()
108-
109113 def _table_already_upgraded (self , target ) -> bool :
110114 return target in self ._seen_tables
111115
@@ -169,13 +173,7 @@ def _get_revert_count(self, schema: str | None = None, table: str | None = None)
169173 return migration_list
170174
171175 def is_upgraded (self , schema : str , table : str ) -> bool :
172- result = self ._backend .fetch (f"SHOW TBLPROPERTIES `{ schema } `.`{ table } `" )
173- for value in result :
174- if value ["key" ] == "upgraded_to" :
175- logger .info (f"{ schema } .{ table } is set as upgraded" )
176- return True
177- logger .info (f"{ schema } .{ table } is set as not upgraded" )
178- return False
176+ return self ._migration_status_refresher .is_upgraded (schema , table )
179177
180178 def print_revert_report (self , * , delete_managed : bool ) -> bool | None :
181179 migrated_count = self ._get_revert_count ()
@@ -215,6 +213,9 @@ def print_revert_report(self, *, delete_managed: bool) -> bool | None:
215213 print ("To revert and delete Migrated Tables, add --delete_managed true flag to the command" )
216214 return True
217215
216+ def _init_seen_tables (self ):
217+ self ._seen_tables = self ._migration_status_refresher .get_seen_tables ()
218+
218219
219220class TableMove :
220221 def __init__ (self , ws : WorkspaceClient , backend : SqlBackend ):
@@ -458,3 +459,62 @@ def _recreate_view(self, to_view_name, view_text):
458459 create_sql = f"CREATE VIEW { to_view_name } AS { view_text } "
459460 logger .info (f"Creating view { to_view_name } " )
460461 self ._backend .execute (create_sql )
462+
463+
class MigrationStatusRefresher(CrawlerBase[MigrationStatus]):
    """Crawler that yields one ``MigrationStatus`` per crawled source table.

    It combines the table crawler's snapshot with the ``upgraded_from``
    property found on workspace tables to work out where each source table
    was migrated to.
    """

    def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema, table_crawler: TablesCrawler):
        """Set up the crawler.

        Args:
            ws: Workspace client used to list catalogs, schemas and tables.
            sbe: SQL backend handed to ``CrawlerBase``.
            schema: Inventory schema handed to ``CrawlerBase``.
            table_crawler: Provides the source tables whose status is computed.
        """
        # NOTE(review): the positional arguments presumably mean
        # (backend, catalog, schema, table, record class) - confirm against CrawlerBase.
        super().__init__(sbe, "hive_metastore", schema, "migration_status", MigrationStatus)
        self._ws = ws
        self._table_crawler = table_crawler

    def snapshot(self) -> Iterable[MigrationStatus]:
        """Return the migration status records.

        Delegates to ``CrawlerBase._snapshot`` with ``_try_fetch`` and
        ``_crawl``; presumably the first reads stored records and the second
        recomputes them when none are available - confirm against CrawlerBase.
        """
        return self._snapshot(self._try_fetch, self._crawl)

    def get_seen_tables(self) -> dict[str, str]:
        """Map every already-migrated target table to the table it came from.

        Returns:
            A dict whose keys are lower-cased full names of workspace tables
            carrying an ``upgraded_from`` property and whose values are the
            lower-cased value of that property.
        """
        seen_tables: dict[str, str] = {}
        for schema in self._iter_schemas():
            for table in self._ws.tables.list(catalog_name=schema.catalog_name, schema_name=schema.name):
                # Tables with no (or empty) properties cannot be migration targets.
                if not table.properties:
                    continue
                if "upgraded_from" not in table.properties:
                    continue
                # Without a full name there is no key to store the table under.
                if not table.full_name:
                    logger.warning(f"The table {table.name} in {schema.name} has no full name")
                    continue
                seen_tables[table.full_name.lower()] = table.properties["upgraded_from"].lower()
        return seen_tables

    def is_upgraded(self, schema: str, table: str) -> bool:
        """Tell whether ``schema``.``table`` carries an ``upgraded_to`` property.

        Args:
            schema: Schema of the table to inspect.
            table: Name of the table to inspect.

        Returns:
            True if any row returned by ``SHOW TBLPROPERTIES`` has the key
            ``upgraded_to``, False otherwise.
        """
        # Each name is wrapped in backticks with nothing else inside them, so the
        # quoted identifier is exactly the name that was passed in.
        result = self._backend.fetch(f"SHOW TBLPROPERTIES `{schema}`.`{table}`")
        for value in result:
            if value["key"] == "upgraded_to":
                logger.info(f"{schema}.{table} is set as upgraded")
                return True
        logger.info(f"{schema}.{table} is set as not upgraded")
        return False

    def _crawl(self) -> Iterable[MigrationStatus]:
        """Compute a fresh status record for every table in the crawler snapshot.

        Yields:
            One ``MigrationStatus`` per source table. The destination fields
            are filled only when a target table points back at the source
            table, the source table is marked as upgraded, and the target
            name has exactly three dot-separated parts.
        """
        all_tables = self._table_crawler.snapshot()
        # Invert target -> source into source -> target for lookup by source key.
        reverse_seen = {v: k for k, v in self.get_seen_tables().items()}
        # One timestamp for the whole run, so all records of a crawl agree.
        timestamp = datetime.datetime.now(datetime.timezone.utc).timestamp()
        for table in all_tables:
            table_migration_status = MigrationStatus(
                src_schema=table.database,
                src_table=table.name,
                update_ts=str(timestamp),
            )
            if table.key in reverse_seen and self.is_upgraded(table.database, table.name):
                # Split once; only a catalog.schema.table name is recorded.
                target_parts = reverse_seen[table.key].split(".")
                if len(target_parts) == 3:
                    dst_catalog, dst_schema, dst_table = target_parts
                    table_migration_status.dst_catalog = dst_catalog
                    table_migration_status.dst_schema = dst_schema
                    table_migration_status.dst_table = dst_table
            yield table_migration_status

    def _try_fetch(self) -> Iterable[MigrationStatus]:
        """Yield the records read from the crawler's table.

        Each row is unpacked positionally, so the column order must match the
        field order of ``MigrationStatus``.
        """
        for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
            yield MigrationStatus(*row)

    def _iter_schemas(self):
        """Yield every schema of every catalog listed by the workspace client."""
        for catalog in self._ws.catalogs.list():
            yield from self._ws.schemas.list(catalog_name=catalog.name)
0 commit comments