@@ -70,54 +70,54 @@ def uc_create_sql(self, catalog):
 
 
 class TablesCrawler(CrawlerBase):
-    def __init__(self, backend: SqlBackend, catalog, schema):
+    def __init__(self, backend: SqlBackend, schema):
         """
         Initializes a TablesCrawler instance.
 
         Args:
             backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark)
-            catalog (str): The catalog name for the inventory persistence.
             schema: The schema name for the inventory persistence.
         """
-        super().__init__(backend, catalog, schema, "tables")
+        super().__init__(backend, "hive_metastore", schema, "tables")
 
     def _all_databases(self) -> Iterator[Row]:
         yield from self._fetch("SHOW DATABASES")
 
-    def snapshot(self, catalog: str, database: str) -> list[Table]:
+    def snapshot(self) -> list[Table]:
         """
         Takes a snapshot of tables in the specified catalog and database.
 
-        Args:
-            catalog (str): The catalog name.
-            database (str): The database name.
-
         Returns:
             list[Table]: A list of Table objects representing the snapshot of tables.
         """
-        return self._snapshot(partial(self._try_load, catalog, database), partial(self._crawl, catalog, database))
+        return self._snapshot(partial(self._try_load), partial(self._crawl))
 
-    def _try_load(self, catalog: str, database: str):
+    def _try_load(self):
         """Tries to load table information from the database or throws TABLE_OR_VIEW_NOT_FOUND error"""
-        for row in self._fetch(
-            f'SELECT * FROM {self._full_name} WHERE catalog = "{catalog}" AND database = "{database}"'
-        ):
+        for row in self._fetch(f"SELECT * FROM {self._full_name}"):
             yield Table(*row)
 
-    def _crawl(self, catalog: str, database: str) -> list[Table]:
+    def _crawl(self) -> list[Table]:
         """Crawls and lists tables within the specified catalog and database.
 
         After performing initial scan of all tables, starts making parallel
         DESCRIBE TABLE EXTENDED queries for every table.
+
+        Production tasks would most likely be executed through `tables.scala`
+        within `crawl_tables` task due to `spark.sharedState.externalCatalog`
+        lower-level APIs not requiring a roundtrip to storage, which is not
+        possible for Azure storage with credentials supplied through Spark
+        conf (see https://github.com/databrickslabs/ucx/issues/249).
+
+        See also https://github.com/databrickslabs/ucx/issues/247
         """
-        catalog = self._valid(catalog)
-        database = self._valid(database)
-        logger.debug(f"[{catalog}.{database}] listing tables")
         tasks = []
-        for _, table, _is_tmp in self._fetch(f"SHOW TABLES FROM {catalog}.{database}"):
-            tasks.append(partial(self._describe, catalog, database, table))
-        results = ThreadedExecution.gather(f"listing tables in {catalog}.{database}", tasks)
-
+        catalog = "hive_metastore"
+        for (database,) in self._all_databases():
+            logger.debug(f"[{catalog}.{database}] listing tables")
+            for _, table, _is_tmp in self._fetch(f"SHOW TABLES FROM {catalog}.{database}"):
+                tasks.append(partial(self._describe, catalog, database, table))
+        results = ThreadedExecution.gather(f"listing tables in {catalog}", tasks)
         return [x for x in results if x is not None]
 
     def _describe(self, catalog: str, database: str, table: str) -> Table | None:
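For context, a minimal usage sketch of the API after this change. Only `TablesCrawler(backend, schema)`, the zero-argument `snapshot()`, and the `hive_metastore` pinning come from the diff above; the import paths, the `StatementExecutionBackend` constructor arguments, and the warehouse id are assumptions for illustration and may differ between ucx versions.

```python
from databricks.sdk import WorkspaceClient

# Assumed import paths; adjust to the ucx version in use.
from databricks.labs.ucx.framework.crawlers import StatementExecutionBackend
from databricks.labs.ucx.hive_metastore.tables import TablesCrawler

ws = WorkspaceClient()  # picks up auth from the environment
backend = StatementExecutionBackend(ws, "<warehouse-id>")  # placeholder warehouse id

# The catalog is now fixed to hive_metastore inside the crawler, so only the
# inventory schema is passed; results persist to hive_metastore.<schema>.tables.
crawler = TablesCrawler(backend, "ucx")

for table in crawler.snapshot():
    print(table)
```

Judging by `_snapshot(partial(self._try_load), partial(self._crawl))`, repeated `snapshot()` calls should read back the persisted inventory via `_try_load` and only fall back to a full crawl (`SHOW DATABASES` plus parallel `DESCRIBE TABLE EXTENDED`) when the inventory table does not yet exist, which is why `_try_load` can now select everything from `self._full_name` without a catalog/database filter.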