Skip to content

Commit ccd15b3

Browse files
authored
[TECH DEBT]: Deal with some nits in the existing crawler code (#2604)
## Changes

This PR addresses some nits that I noticed in the crawlers while working on #2566 but left out of that PR to avoid distracting from its focus. These do not result in any functional changes.
1 parent 8bc6aef commit ccd15b3

File tree

3 files changed

+27
-14
lines changed

3 files changed

+27
-14
lines changed

src/databricks/labs/ucx/hive_metastore/grants.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -245,10 +245,12 @@ def _crawl(self) -> Iterable[Grant]:
245245
list[Grant]: A list of Grant objects representing the grants found in hive_metastore.
246246
"""
247247
catalog = "hive_metastore"
248-
tasks = [partial(self.grants, catalog=catalog)]
249-
# Scanning ANY FILE and ANONYMOUS FUNCTION grants
250-
tasks.append(partial(self.grants, catalog=catalog, any_file=True))
251-
tasks.append(partial(self.grants, catalog=catalog, anonymous_function=True))
248+
tasks = [
249+
partial(self.grants, catalog=catalog),
250+
# Scanning ANY FILE and ANONYMOUS FUNCTION grants
251+
partial(self.grants, catalog=catalog, any_file=True),
252+
partial(self.grants, catalog=catalog, anonymous_function=True),
253+
]
252254
if not self._include_databases:
253255
# scan all databases, even empty ones
254256
for row in self._fetch(f"SHOW DATABASES FROM {escape_sql_identifier(catalog)}"):

src/databricks/labs/ucx/hive_metastore/locations.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,8 @@ def __init__(self, backend: SqlBackend, ws: WorkspaceClient, inventory_database:
304304
super().__init__(backend, "hive_metastore", inventory_database, "mounts", Mount)
305305
self._dbutils = ws.dbutils
306306

307-
def _deduplicate_mounts(self, mounts: list) -> list:
307+
@staticmethod
308+
def _deduplicate_mounts(mounts: list) -> list:
308309
seen = set()
309310
deduplicated_mounts = []
310311
for obj in mounts:
@@ -395,7 +396,8 @@ def _try_fetch(self) -> Iterable[Table]:
395396
):
396397
yield Table(*row)
397398

398-
def _get_tables_paths_from_assessment(self, loaded_records: Iterable[Table]) -> dict[str, str]:
399+
@staticmethod
400+
def _get_tables_paths_from_assessment(loaded_records: Iterable[Table]) -> dict[str, str]:
399401
seen = {}
400402
for rec in loaded_records:
401403
if not rec.location:
@@ -410,8 +412,8 @@ def _crawl_tables(self, table_paths_from_assessment: dict[str, str]) -> list[Tab
410412
if self._include_mounts and mount.name not in self._include_mounts:
411413
logger.info(f"Filtering mount {mount.name}")
412414
continue
413-
table_paths = {}
414415
if self._include_paths_in_mount:
416+
table_paths = {}
415417
for path in self._include_paths_in_mount:
416418
table_paths.update(self._find_delta_log_folders(path))
417419
else:
@@ -446,7 +448,8 @@ def _crawl_tables(self, table_paths_from_assessment: dict[str, str]) -> list[Tab
446448
logger.info(f"Found a total of {len(all_tables)} tables in mount points")
447449
return all_tables
448450

449-
def _get_table_location(self, mount: Mount, path: str):
451+
@staticmethod
452+
def _get_table_location(mount: Mount, path: str) -> str:
450453
"""
451454
There can be different cases for mounts:
452455
- Mount(name='/mnt/things/a', source='abfss://<container>@<storage-account>.dfs.core.windows.net/a')
@@ -457,7 +460,11 @@ def _get_table_location(self, mount: Mount, path: str):
457460
return path.replace(f"dbfs:{mount.name}/", mount.source)
458461
return path.replace(f"dbfs:{mount.name}", mount.source)
459462

460-
def _find_delta_log_folders(self, root_dir: str, delta_log_folders=None) -> dict:
463+
def _find_delta_log_folders(
464+
self,
465+
root_dir: str,
466+
delta_log_folders: dict[str, TableInMount] | None = None,
467+
) -> dict[str, TableInMount]:
461468
if delta_log_folders is None:
462469
delta_log_folders = {}
463470
logger.info(f"Listing {root_dir}")
@@ -519,18 +526,22 @@ def _assess_path(self, file_info: FileInfo) -> TableInMount | None:
519526
return TableInMount(format="PARQUET", is_partitioned=False)
520527
return None
521528

522-
def _is_partitioned(self, file_name: str) -> bool:
529+
@staticmethod
530+
def _is_partitioned(file_name: str) -> bool:
523531
return '=' in file_name
524532

525-
def _is_parquet(self, file_name: str) -> bool:
533+
@staticmethod
534+
def _is_parquet(file_name: str) -> bool:
526535
parquet_patterns = {'.parquet'}
527536
return any(pattern in file_name for pattern in parquet_patterns)
528537

529-
def _is_csv(self, file_name: str) -> bool:
538+
@staticmethod
539+
def _is_csv(file_name: str) -> bool:
530540
csv_patterns = {'.csv'}
531541
return any(pattern in file_name for pattern in csv_patterns)
532542

533-
def _is_json(self, file_name: str) -> bool:
543+
@staticmethod
544+
def _is_json(file_name: str) -> bool:
534545
json_patterns = {'.json'}
535546
return any(pattern in file_name for pattern in json_patterns)
536547

src/databricks/labs/ucx/hive_metastore/tables.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -562,7 +562,7 @@ def _describe(self, catalog, database, table) -> Table | None:
562562
def _crawl(self) -> Iterable[Table]:
563563
"""Crawls and lists tables within the specified catalog and database."""
564564
tasks = []
565-
catalog_tables: Collection[Table] = []
565+
catalog_tables: Collection[Table]
566566
catalog = "hive_metastore"
567567
databases = self._all_databases()
568568
for database in databases:

0 commit comments

Comments
 (0)