
Commit 0bc27e5

Avoid errors in corner cases where Azure Service Principal Credentials are not available in Spark context (#254)
This PR simplifies Table ACL crawling by removing the ability to configure which databases are iterated: `crawl_grants` now crawls all databases consistently. Fixes #247
1 parent 6d30d12 commit 0bc27e5
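For orientation, here is a minimal sketch of the crawl path after this change, using the names introduced in the hunks below. `"ucx"` is a placeholder inventory schema, and `RuntimeBackend` only works inside a Databricks job or notebook:

```python
from databricks.labs.ucx.framework.crawlers import RuntimeBackend
from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler

inventory_database = "ucx"  # placeholder: the schema where inventory tables are persisted

backend = RuntimeBackend()  # Spark-backed SQL execution, available only on a Databricks cluster
tables = TablesCrawler(backend, inventory_database)  # inventory always lives under hive_metastore now
grants = GrantsCrawler(tables)
grants.snapshot()  # crawls grants for every database, table and view in hive_metastore
```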

File tree

19 files changed: +213 additions, -266 deletions

examples/migration_config.yml

Lines changed: 0 additions & 3 deletions
@@ -1,8 +1,5 @@
 inventory_database: unity_catalog_migration
 
-tacl:
-  databases: [ "default" ]
-
 warehouse_id: None
 
 groups:

notebooks/toolkit.py

Lines changed: 6 additions & 15 deletions
@@ -15,10 +15,10 @@
 from databricks.labs.ucx.config import (
     GroupsConfig,
     MigrationConfig,
-    TaclConfig,
 )
+from databricks.labs.ucx.framework.crawlers import RuntimeBackend
 from databricks.labs.ucx.workspace_access import GroupMigrationToolkit
-from databricks.labs.ucx.hive_metastore import TaclToolkit
+from databricks.labs.ucx.hive_metastore import TablesCrawler, GrantsCrawler
 
 # COMMAND ----------
 
@@ -40,22 +40,13 @@
         # use this option to select all groups automatically
         # auto=True
     ),
-    tacl=TaclConfig(
-        # use this option to select specific databases manually
-        databases=databases,
-        # use this option to select all databases automatically
-        # auto=True
-    ),
     log_level="DEBUG",
 )
 
 toolkit = GroupMigrationToolkit(config)
-tacltoolkit = TaclToolkit(
-    toolkit._ws,
-    inventory_catalog="hive_metastore",
-    inventory_schema=config.inventory_database,
-    databases=config.tacl.databases,
-)
+backend = RuntimeBackend()
+tables = TablesCrawler(backend, config.inventory_database)
+grants = GrantsCrawler(tables)
 
 # COMMAND ----------
 
@@ -100,7 +91,7 @@
 
 # COMMAND ----------
 
-tacltoolkit.grants_snapshot()
+grants.snapshot()
 
 # COMMAND ----------
 

src/databricks/labs/ucx/assessment/crawlers.py

Lines changed: 1 addition & 2 deletions
@@ -4,8 +4,7 @@
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.service.jobs import BaseJob
 
-from databricks.labs.ucx.framework.crawlers import CrawlerBase
-from databricks.labs.ucx.hive_metastore.table_acls import SqlBackend
+from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend
 
 INCOMPATIBLE_SPARK_CONFIG_KEYS = [
     "spark.databricks.passthrough.enabled",

src/databricks/labs/ucx/config.py

Lines changed: 2 additions & 22 deletions
@@ -68,37 +68,18 @@ def from_dict(cls, raw: dict):
         return cls(**raw)
 
 
-@dataclass
-class TaclConfig:
-    databases: list[str] | None = None
-    auto: bool | None = None
-
-    def __post_init__(self):
-        if not self.databases and self.auto is None:
-            msg = "Either selected or auto must be set"
-            raise ValueError(msg)
-        if self.databases and self.auto is False:
-            msg = "No selected groups provided, but auto-collection is disabled"
-            raise ValueError(msg)
-
-    @classmethod
-    def from_dict(cls, raw: dict):
-        return cls(**raw)
-
-
 # Used to set the right expectation about configuration file schema
 _CONFIG_VERSION = 1
 
 
 @dataclass
 class MigrationConfig:
     inventory_database: str
-    tacl: TaclConfig
     groups: GroupsConfig
     instance_pool_id: str = None
     warehouse_id: str = None
     connect: ConnectConfig | None = None
-    num_threads: int | None = 4
+    num_threads: int | None = 10
     log_level: str | None = "INFO"
 
     # Starting path for notebooks and directories crawler
@@ -137,12 +118,11 @@ def from_dict(cls, raw: dict) -> "MigrationConfig":
             raise ValueError(msg)
         return cls(
             inventory_database=raw.get("inventory_database"),
-            tacl=TaclConfig.from_dict(raw.get("tacl", {})),
             groups=GroupsConfig.from_dict(raw.get("groups", {})),
             connect=ConnectConfig.from_dict(raw.get("connect", {})),
             instance_pool_id=raw.get("instance_pool_id", None),
             warehouse_id=raw.get("warehouse_id", None),
-            num_threads=raw.get("num_threads", 8),
+            num_threads=raw.get("num_threads", 10),
             log_level=raw.get("log_level", "INFO"),
         )
 
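With `TaclConfig` gone, constructing a `MigrationConfig` gets simpler, and a stale `tacl:` section in an existing config file is simply never read by `from_dict` above. A minimal sketch with placeholder values, assuming `GroupsConfig(auto=True)` is a valid groups selection (as suggested by the install.py hunk below):

```python
from databricks.labs.ucx.config import GroupsConfig, MigrationConfig

cfg = MigrationConfig(
    inventory_database="unity_catalog_migration",
    groups=GroupsConfig(auto=True),  # assumption: auto-collect all groups, as in install.py
    warehouse_id="abc123",           # placeholder warehouse id
)
assert cfg.num_threads == 10  # new default, previously 4 in the dataclass and 8 in from_dict
```
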

src/databricks/labs/ucx/hive_metastore/__init__.py

Lines changed: 4 additions & 2 deletions

@@ -1,3 +1,5 @@
-from databricks.labs.ucx.hive_metastore.table_acls import TaclToolkit
+from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler
+from databricks.labs.ucx.hive_metastore.list_mounts import Mounts
+from databricks.labs.ucx.hive_metastore.tables import TablesCrawler
 
-__all__ = ["TaclToolkit"]
+__all__ = ["TablesCrawler", "GrantsCrawler", "Mounts"]

src/databricks/labs/ucx/hive_metastore/grants.py

Lines changed: 16 additions & 22 deletions
@@ -122,28 +122,21 @@ def __init__(self, tc: TablesCrawler):
         super().__init__(tc._backend, tc._catalog, tc._schema, "grants")
         self._tc = tc
 
-    def snapshot(self, catalog: str, database: str) -> list[Grant]:
-        return self._snapshot(partial(self._try_load, catalog, database), partial(self._crawl, catalog, database))
+    def snapshot(self) -> list[Grant]:
+        return self._snapshot(partial(self._try_load), partial(self._crawl))
 
-    def _try_load(self, catalog: str, database: str):
-        for row in self._fetch(
-            f'SELECT * FROM {self._full_name} WHERE catalog = "{catalog}" AND database = "{database}"'
-        ):
+    def _try_load(self):
+        for row in self._fetch(f"SELECT * FROM {self._full_name}"):
             yield Grant(*row)
 
-    def _crawl(self, catalog: str, database: str) -> list[Grant]:
+    def _crawl(self) -> list[Grant]:
         """
-        Crawls and lists grants for tables and views within the specified catalog and database.
-
-        Args:
-            catalog (str): The catalog name.
-            database (str): The database name.
+        Crawls and lists grants for all databases, tables, and views within hive_metastore.
 
         Returns:
             list[Grant]: A list of Grant objects representing the listed grants.
 
         Behavior:
-        - Validates and prepares the provided catalog and database names.
         - Constructs a list of tasks to fetch grants using the `_grants` method, including both database-wide and
           table/view-specific grants.
        - Iterates through tables in the specified database using the `_tc.snapshot` method.
@@ -156,21 +149,22 @@ def _crawl(self, catalog: str, database: str) -> list[Grant]:
          database, table, view).
 
         Returns:
-            list[Grant]: A list of Grant objects representing the grants found in the specified catalog and database.
+            list[Grant]: A list of Grant objects representing the grants found in hive_metastore.
         """
-        catalog = self._valid(catalog)
-        database = self._valid(database)
-        tasks = [partial(self._grants, catalog=catalog), partial(self._grants, catalog=catalog, database=database)]
-        for table in self._tc.snapshot(catalog, database):
-            fn = partial(self._grants, catalog=catalog, database=database)
+        seen_databases = set()
+        catalog = "hive_metastore"
+        tasks = [partial(self._grants, catalog=catalog)]
+        for table in self._tc.snapshot():
+            if table.database not in seen_databases:
+                tasks.append(partial(self._grants, catalog=catalog, database=table.database))
+                seen_databases.add(table.database)
+            fn = partial(self._grants, catalog=catalog, database=table.database)
             if table.kind == "VIEW":
                 tasks.append(partial(fn, view=table.name))
             else:
                 tasks.append(partial(fn, table=table.name))
         return [
-            grant
-            for grants in ThreadedExecution.gather(f"listing grants for {catalog}.{database}", tasks)
-            for grant in grants
+            grant for grants in ThreadedExecution.gather(f"listing grants for {catalog}", tasks) for grant in grants
         ]
 
     def _grants(
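The interesting part of the new `_crawl` is the de-duplication: one catalog-level task, one task per distinct database seen in the table snapshot, and one task per table or view. A self-contained sketch of that task-building logic; `FakeTable` and `build_grant_tasks` are illustrative stand-ins, not ucx APIs:

```python
from dataclasses import dataclass
from functools import partial


@dataclass
class FakeTable:
    # Hypothetical stand-in for ucx's Table record: only the fields used below.
    database: str
    name: str
    kind: str  # "TABLE" or "VIEW"


def build_grant_tasks(tables, list_grants):
    """One catalog-level task, one per distinct database, and one per table/view."""
    catalog = "hive_metastore"
    tasks = [partial(list_grants, catalog=catalog)]
    seen_databases = set()
    for table in tables:
        if table.database not in seen_databases:
            tasks.append(partial(list_grants, catalog=catalog, database=table.database))
            seen_databases.add(table.database)
        fn = partial(list_grants, catalog=catalog, database=table.database)
        if table.kind == "VIEW":
            tasks.append(partial(fn, view=table.name))
        else:
            tasks.append(partial(fn, table=table.name))
    return tasks


if __name__ == "__main__":
    snapshot = [
        FakeTable("sales", "orders", "TABLE"),
        FakeTable("sales", "orders_v", "VIEW"),
        FakeTable("hr", "people", "TABLE"),
    ]
    tasks = build_grant_tasks(snapshot, list_grants=lambda **kw: kw)
    # 1 catalog task + 2 database tasks + 3 object tasks = 6
    print(len(tasks), [t() for t in tasks])
```
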

src/databricks/labs/ucx/hive_metastore/table_acls.py

Lines changed: 0 additions & 48 deletions
This file was deleted.

src/databricks/labs/ucx/hive_metastore/tables.py

Lines changed: 21 additions & 21 deletions
@@ -70,54 +70,54 @@ def uc_create_sql(self, catalog):
 
 
 class TablesCrawler(CrawlerBase):
-    def __init__(self, backend: SqlBackend, catalog, schema):
+    def __init__(self, backend: SqlBackend, schema):
         """
         Initializes a TablesCrawler instance.
 
         Args:
             backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark)
-            catalog (str): The catalog name for the inventory persistence.
             schema: The schema name for the inventory persistence.
         """
-        super().__init__(backend, catalog, schema, "tables")
+        super().__init__(backend, "hive_metastore", schema, "tables")
 
     def _all_databases(self) -> Iterator[Row]:
         yield from self._fetch("SHOW DATABASES")
 
-    def snapshot(self, catalog: str, database: str) -> list[Table]:
+    def snapshot(self) -> list[Table]:
         """
         Takes a snapshot of tables in the specified catalog and database.
 
-        Args:
-            catalog (str): The catalog name.
-            database (str): The database name.
-
         Returns:
             list[Table]: A list of Table objects representing the snapshot of tables.
         """
-        return self._snapshot(partial(self._try_load, catalog, database), partial(self._crawl, catalog, database))
+        return self._snapshot(partial(self._try_load), partial(self._crawl))
 
-    def _try_load(self, catalog: str, database: str):
+    def _try_load(self):
         """Tries to load table information from the database or throws TABLE_OR_VIEW_NOT_FOUND error"""
-        for row in self._fetch(
-            f'SELECT * FROM {self._full_name} WHERE catalog = "{catalog}" AND database = "{database}"'
-        ):
+        for row in self._fetch(f"SELECT * FROM {self._full_name}"):
             yield Table(*row)
 
-    def _crawl(self, catalog: str, database: str) -> list[Table]:
+    def _crawl(self) -> list[Table]:
         """Crawls and lists tables within the specified catalog and database.
 
         After performing initial scan of all tables, starts making parallel
         DESCRIBE TABLE EXTENDED queries for every table.
+
+        Production tasks would most likely be executed through `tables.scala`
+        within `crawl_tables` task due to `spark.sharedState.externalCatalog`
+        lower-level APIs not requiring a roundtrip to storage, which is not
+        possible for Azure storage with credentials supplied through Spark
+        conf (see https://github.com/databrickslabs/ucx/issues/249).
+
+        See also https://github.com/databrickslabs/ucx/issues/247
         """
-        catalog = self._valid(catalog)
-        database = self._valid(database)
-        logger.debug(f"[{catalog}.{database}] listing tables")
         tasks = []
-        for _, table, _is_tmp in self._fetch(f"SHOW TABLES FROM {catalog}.{database}"):
-            tasks.append(partial(self._describe, catalog, database, table))
-        results = ThreadedExecution.gather(f"listing tables in {catalog}.{database}", tasks)
-
+        catalog = "hive_metastore"
+        for (database,) in self._all_databases():
+            logger.debug(f"[{catalog}.{database}] listing tables")
+            for _, table, _is_tmp in self._fetch(f"SHOW TABLES FROM {catalog}.{database}"):
+                tasks.append(partial(self._describe, catalog, database, table))
+        results = ThreadedExecution.gather(f"listing tables in {catalog}", tasks)
         return [x for x in results if x is not None]
 
     def _describe(self, catalog: str, database: str, table: str) -> Table | None:
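Both crawlers funnel through `CrawlerBase._snapshot`, which, as the `_try_load`/`_crawl` split above suggests, first tries to read previously persisted rows from the inventory table and only falls back to a live crawl otherwise. A simplified, self-contained illustration of that control flow, assuming the base class also persists freshly crawled rows; this is not the actual `CrawlerBase` code:

```python
from collections.abc import Callable, Iterable
from typing import Any


def snapshot(
    try_load: Callable[[], Iterable[Any]],
    crawl: Callable[[], Iterable[Any]],
    persist: Callable[[list[Any]], None],
) -> list[Any]:
    """Load-or-crawl: reuse cached inventory rows when present, otherwise crawl and persist."""
    cached = list(try_load())
    if cached:
        return cached
    fresh = list(crawl())
    persist(fresh)
    return fresh


# Toy callables standing in for TablesCrawler._try_load / _crawl:
rows = snapshot(
    try_load=lambda: [],  # nothing persisted yet, e.g. on the first run
    crawl=lambda: [("hive_metastore", "sales", "orders", "TABLE")],
    persist=lambda batch: print(f"persisting {len(batch)} rows"),
)
print(rows)
```
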

src/databricks/labs/ucx/install.py

Lines changed: 1 addition & 2 deletions
@@ -17,7 +17,7 @@
 from databricks.sdk.service.workspace import ImportFormat
 
 from databricks.labs.ucx.__about__ import __version__
-from databricks.labs.ucx.config import GroupsConfig, MigrationConfig, TaclConfig
+from databricks.labs.ucx.config import GroupsConfig, MigrationConfig
 from databricks.labs.ucx.framework.dashboards import DashboardFromFiles
 from databricks.labs.ucx.framework.tasks import _TASKS, Task
 from databricks.labs.ucx.runtime import main
@@ -199,7 +199,6 @@ def _configure(self):
         self._config = MigrationConfig(
             inventory_database=inventory_database,
             groups=GroupsConfig(**groups_config_args),
-            tacl=TaclConfig(auto=True),
             warehouse_id=warehouse_id,
             log_level=log_level,
             num_threads=num_threads,

src/databricks/labs/ucx/runtime.py

Lines changed: 5 additions & 6 deletions
@@ -8,7 +8,7 @@
 from databricks.labs.ucx.config import MigrationConfig
 from databricks.labs.ucx.framework.crawlers import RuntimeBackend
 from databricks.labs.ucx.framework.tasks import task, trigger
-from databricks.labs.ucx.hive_metastore import TaclToolkit
+from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler
 from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocationCrawler
 from databricks.labs.ucx.hive_metastore.list_mounts import Mounts
 from databricks.labs.ucx.workspace_access import GroupMigrationToolkit
@@ -48,11 +48,10 @@ def crawl_grants(cfg: MigrationConfig):
     setup. This approach not only safeguards data integrity and access control but also ensures a smooth and
     secure transition for our data assets. It reinforces our commitment to data security and compliance throughout the
     migration process and beyond"""
-    ws = WorkspaceClient(config=cfg.to_databricks_config())
-    tacls = TaclToolkit(
-        ws, inventory_catalog="hive_metastore", inventory_schema=cfg.inventory_database, databases=cfg.tacl.databases
-    )
-    tacls.grants_snapshot()
+    backend = RuntimeBackend()
+    tables = TablesCrawler(backend, cfg.inventory_database)
+    grants = GrantsCrawler(tables)
+    grants.snapshot()
 
 
 @task("assessment", depends_on=[setup_schema])
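Once the `crawl_grants` task has run, results are persisted as the `grants` table in the inventory schema under the `hive_metastore` catalog (see the `GrantsCrawler.__init__` hunk above). A quick, hedged way to eyeball them from a Databricks notebook, where `spark` is predefined and `ucx` stands in for your configured `inventory_database`:

```python
# Placeholder inventory schema name; substitute your configured inventory_database.
spark.sql("SELECT * FROM hive_metastore.ucx.grants").show(truncate=False)
```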
