Skip to content

Commit c9ee832

Browse files
mwojtyczka and nfx authored
Added support for crawling grants and applying Hive Metastore UDF ACLs (#812)
## Changes Added support for crawling grants and applying Hive Metastore UDF ACLs. Resolves #808 ### Functionality - [x] added new crawler for udfs - [x] added handling of udfs to the existing TableAclSupport - [x] added new column to inventory permissions table for udfs ### Tests - [x] tested manually - [x] added unit tests - [x] added integration tests --------- Co-authored-by: Serge Smertin <[email protected]>
1 parent af80620 commit c9ee832

File tree

16 files changed

+682
-71
lines changed

16 files changed

+682
-71
lines changed

src/databricks/labs/ucx/hive_metastore/grants.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from databricks.labs.ucx.framework.crawlers import CrawlerBase
1111
from databricks.labs.ucx.hive_metastore.tables import TablesCrawler
12+
from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler
1213

1314
logger = logging.getLogger(__name__)
1415

@@ -21,6 +22,7 @@ class Grant:
2122
database: str | None = None
2223
table: str | None = None
2324
view: str | None = None
25+
udf: str | None = None
2426
any_file: bool = False
2527
anonymous_function: bool = False
2628

@@ -31,6 +33,7 @@ def type_and_key(
3133
database: str | None = None,
3234
table: str | None = None,
3335
view: str | None = None,
36+
udf: str | None = None,
3437
any_file: bool = False,
3538
anonymous_function: bool = False,
3639
) -> tuple[str, str]:
@@ -42,6 +45,10 @@ def type_and_key(
4245
catalog = "hive_metastore" if catalog is None else catalog
4346
database = "default" if database is None else database
4447
return "VIEW", f"{catalog}.{database}.{view}"
48+
if udf is not None:
49+
catalog = "hive_metastore" if catalog is None else catalog
50+
database = "default" if database is None else database
51+
return "FUNCTION", f"{catalog}.{database}.{udf}"
4552
if database is not None:
4653
catalog = "hive_metastore" if catalog is None else catalog
4754
return "DATABASE", f"{catalog}.{database}"
@@ -53,7 +60,7 @@ def type_and_key(
5360
if catalog is not None:
5461
return "CATALOG", catalog
5562
msg = (
56-
f"invalid grant keys: catalog={catalog}, database={database}, view={view}, "
63+
f"invalid grant keys: catalog={catalog}, database={database}, view={view}, udf={udf}"
5764
f"any_file={any_file}, anonymous_function={anonymous_function}"
5865
)
5966
raise ValueError(msg)
@@ -69,6 +76,7 @@ def this_type_and_key(self):
6976
database=self.database,
7077
table=self.table,
7178
view=self.view,
79+
udf=self.udf,
7280
any_file=self.any_file,
7381
anonymous_function=self.anonymous_function,
7482
)
@@ -135,9 +143,13 @@ def uc_grant_sql(self):
135143

136144

137145
class GrantsCrawler(CrawlerBase[Grant]):
138-
def __init__(self, tc: TablesCrawler):
146+
def __init__(self, tc: TablesCrawler, udf: UdfsCrawler):
147+
assert tc._backend == udf._backend
148+
assert tc._catalog == udf._catalog
149+
assert tc._schema == udf._schema
139150
super().__init__(tc._backend, tc._catalog, tc._schema, "grants", Grant)
140151
self._tc = tc
152+
self._udf = udf
141153

142154
def snapshot(self) -> Iterable[Grant]:
143155
return self._snapshot(partial(self._try_load), partial(self._crawl))
@@ -148,7 +160,7 @@ def _try_load(self):
148160

149161
def _crawl(self) -> Iterable[Grant]:
150162
"""
151-
Crawls and lists grants for all databases, tables, views, any file
163+
Crawls and lists grants for all databases, tables, views, udfs, any file
152164
and anonymous function within hive_metastore.
153165
154166
Returns:
@@ -159,12 +171,14 @@ def _crawl(self) -> Iterable[Grant]:
159171
table/view-specific grants.
160172
- Iterates through tables in the specified database using the `_tc.snapshot` method.
161173
- For each table, adds tasks to fetch grants for the table or its view, depending on the kind of the table.
174+
- Iterates through udfs in the specified database using the `_udf.snapshot` method.
175+
- For each udf, adds tasks to fetch grants for the udf.
162176
- Executes the tasks concurrently using Threads.gather.
163177
- Flattens the list of retrieved grant lists into a single list of Grant objects.
164178
165179
Note:
166180
- The method assumes that the `_grants` method fetches grants based on the provided parameters (catalog,
167-
database, table, view, any file, anonymous function).
181+
database, table, view, udfs, any file, anonymous function).
168182
169183
Returns:
170184
list[Grant]: A list of Grant objects representing the grants found in hive_metastore.
@@ -181,6 +195,9 @@ def _crawl(self) -> Iterable[Grant]:
181195
fn = partial(self._grants, catalog=catalog, database=table.database)
182196
# views are recognized as tables
183197
tasks.append(partial(fn, table=table.name))
198+
for udf in self._udf.snapshot():
199+
fn = partial(self._grants, catalog=catalog, database=udf.database)
200+
tasks.append(partial(fn, udf=udf.name))
184201
catalog_grants, errors = Threads.gather(f"listing grants for {catalog}", tasks)
185202
if len(errors) > 0:
186203
raise ManyError(errors)
@@ -206,6 +223,7 @@ def _grants(
206223
database: str | None = None,
207224
table: str | None = None,
208225
view: str | None = None,
226+
udf: str | None = None,
209227
any_file: bool = False,
210228
anonymous_function: bool = False,
211229
) -> list[Grant]:
@@ -217,6 +235,7 @@ def _grants(
217235
database (str | None): The database name (optional).
218236
table (str | None): The table name (optional).
219237
view (str | None): The view name (optional).
238+
udf (str | None): The udf name (optional).
220239
any_file (bool): Whether to include any file grants (optional).
221240
anonymous_function (bool): Whether to include anonymous function grants (optional).
222241
@@ -245,13 +264,12 @@ def _grants(
245264
database=self._try_valid(database),
246265
table=self._try_valid(table),
247266
view=self._try_valid(view),
267+
udf=self._try_valid(udf),
248268
any_file=any_file,
249269
anonymous_function=anonymous_function,
250270
)
251271
try:
252272
grants = []
253-
# Added ANY FILE and ANONYMOUS FUNCTION in object_type_normalization
254-
# to capture the same in grants. issue:#623
255273
object_type_normalization = {
256274
"SCHEMA": "DATABASE",
257275
"CATALOG$": "CATALOG",
@@ -271,6 +289,7 @@ def _grants(
271289
action_type=action_type,
272290
table=table,
273291
view=view,
292+
udf=udf,
274293
database=database,
275294
catalog=catalog,
276295
any_file=any_file,
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import logging
2+
from collections.abc import Iterable, Iterator
3+
from dataclasses import dataclass
4+
from functools import partial
5+
6+
from databricks.labs.blueprint.parallel import Threads
7+
8+
from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend
9+
from databricks.labs.ucx.mixins.sql import Row
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
@dataclass
class Udf:
    """Record describing one user-defined function (UDF).

    Field order matters: instances are also built positionally from
    fetched rows, so the attributes below must stay in this order.
    """

    # Location of the function.
    catalog: str
    database: str
    name: str
    # Metadata describing the function itself.
    func_type: str
    func_input: str
    func_returns: str
    deterministic: bool
    data_access: str
    body: str
    comment: str = ""

    @property
    def key(self) -> str:
        """Return the lower-cased ``catalog.database.name`` identifier."""
        qualified = "{}.{}.{}".format(self.catalog, self.database, self.name)
        return qualified.lower()
30+
31+
32+
class UdfsCrawler(CrawlerBase):
    """Crawler that lists user-defined functions (UDFs) in the ``hive_metastore`` catalog."""

    def __init__(self, backend: SqlBackend, schema):
        """
        Initializes a UdfsCrawler instance.

        Args:
            backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark)
            schema: The schema name for the inventory persistence.
        """
        super().__init__(backend, "hive_metastore", schema, "udfs", Udf)

    def _all_databases(self) -> Iterator[Row]:
        """Yields one row per database returned by ``SHOW DATABASES``."""
        yield from self._fetch("SHOW DATABASES")

    def snapshot(self) -> list[Udf]:
        """
        Takes a snapshot of udfs.

        Delegates to the base-class ``_snapshot`` with ``_try_load`` as the loader
        and ``_crawl`` as the fallback crawler.

        Returns:
            list[Udf]: A list of Udf objects representing the snapshot of udfs.
        """
        return self._snapshot(self._try_load, self._crawl)

    def _try_load(self) -> Iterable[Udf]:
        """Tries to load udf information from the database or throws TABLE_OR_VIEW_NOT_FOUND error"""
        for row in self._fetch(f"SELECT * FROM {self._full_name}"):
            # Rows are mapped positionally, so column order must match the Udf field order.
            yield Udf(*row)

    def _crawl(self) -> Iterable[Udf]:
        """Crawls and lists udfs in every database of the ``hive_metastore`` catalog.

        One ``_describe`` task is queued per discovered udf and the tasks are run
        through ``Threads.gather``. Errors reported by ``Threads.gather`` are
        logged, not raised, so a partial result is still returned.
        """
        tasks = []
        catalog = "hive_metastore"
        # need to set the current catalog otherwise "SHOW USER FUNCTIONS FROM" is raising error:
        # "target schema <database> is not in the current catalog"
        self._exec(f"USE CATALOG {catalog};")
        for (database,) in self._all_databases():
            logger.debug(f"[{catalog}.{database}] listing udfs")
            for (udf,) in self._fetch(f"SHOW USER FUNCTIONS FROM {catalog}.{database};"):
                if udf.startswith(f"{catalog}.{database}"):
                    udf_name = udf[udf.rfind(".") + 1 :]  # remove catalog and database info from the name
                    tasks.append(partial(self._describe, catalog, database, udf_name))
        catalog_udfs, errors = Threads.gather(f"listing udfs in {catalog}", tasks)
        if errors:
            logger.error(f"Detected {len(errors)} while scanning udfs in {catalog}")
        return catalog_udfs

    def _describe(self, catalog: str, database: str, udf: str) -> Udf | None:
        """Fetches metadata like udf type, input, returns, data access and body
        if specified for a specific udf within the given catalog and database.

        Args:
            catalog: Catalog that holds the function.
            database: Database (schema) that holds the function.
            udf: Unqualified function name.

        Returns:
            The populated Udf, or None when the metadata could not be fetched
            (the failure is logged and deliberately not re-raised so that one
            broken function does not abort the whole crawl).
        """
        full_name = f"{catalog}.{database}.{udf}"
        try:
            logger.debug(f"[{full_name}] fetching udf metadata")
            describe = {}
            for key_value in self._fetch(f"DESCRIBE FUNCTION EXTENDED {full_name}"):
                # NOTE(review): `_crawl` unpacks results of `_fetch` as single-column rows,
                # so accept both a plain string and a row whose first column is the text.
                if isinstance(key_value, str):
                    line = key_value
                else:
                    line = key_value[0] if key_value else None
                if not isinstance(line, str) or ":" not in line:
                    continue  # skip free text configs that don't have a key
                # Split on the first colon only: values (e.g. a SQL body) may contain colons.
                key, value = line.split(":", 1)
                describe[key.strip()] = value.strip()
            return Udf(
                catalog=catalog.lower(),
                database=database.lower(),
                name=udf.lower(),
                func_type=describe.get("Type", "UNKNOWN"),
                func_input=describe.get("Input", "UNKNOWN"),
                func_returns=describe.get("Returns", "UNKNOWN"),
                # The described value is text; convert it so that "false" is not truthy.
                deterministic=describe.get("Deterministic", "false").lower() == "true",
                data_access=describe.get("Data Access", "UNKNOWN"),
                comment=describe.get("Comment", "UNKNOWN"),
                body=describe.get("Body", "UNKNOWN"),
            )
        except Exception as e:
            # Best-effort: log and skip this udf rather than failing the crawl.
            logger.error(f"Couldn't fetch information for udf {full_name} : {e}")
            return None

src/databricks/labs/ucx/mixins/fixtures.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,13 @@
1515
import pytest
1616
from databricks.sdk import AccountClient, WorkspaceClient
1717
from databricks.sdk.core import DatabricksError
18-
from databricks.sdk.errors import ResourceConflict
18+
from databricks.sdk.errors import NotFound, ResourceConflict
1919
from databricks.sdk.retries import retried
2020
from databricks.sdk.service import compute, iam, jobs, pipelines, sql, workspace
2121
from databricks.sdk.service.catalog import (
2222
CatalogInfo,
2323
DataSourceFormat,
24+
FunctionInfo,
2425
SchemaInfo,
2526
TableInfo,
2627
TableType,
@@ -1023,6 +1024,45 @@ def remove(table_info: TableInfo):
10231024
yield from factory("table", create, remove)
10241025

10251026

1027+
@pytest.fixture
def make_udf(sql_backend, make_schema, make_random) -> Generator[Callable[..., FunctionInfo], None, None]:
    """Fixture providing a factory for simple SQL user-defined functions.

    The ``create``/``remove`` pair is handed to the shared ``factory`` helper
    (defined elsewhere in this module), which presumably invokes ``remove``
    for every created function on teardown - confirm against ``factory``.
    """

    def create(
        *, catalog_name="hive_metastore", schema_name: str | None = None, name: str | None = None
    ) -> FunctionInfo:
        """Create a function ``<catalog>.<schema>.<name>(x INT) RETURNS FLOAT`` and describe it.

        A new schema is made through ``make_schema`` when ``schema_name`` is
        omitted, and a random name is generated when ``name`` is omitted.
        """
        if schema_name is None:
            schema = make_schema(catalog_name=catalog_name)
            catalog_name = schema.catalog_name
            schema_name = schema.name

        if name is None:
            name = f"ucx_T{make_random(4)}".lower()

        full_name = f"{catalog_name}.{schema_name}.{name}".lower()
        ddl = f"CREATE FUNCTION {full_name}(x INT) RETURNS FLOAT CONTAINS SQL DETERMINISTIC RETURN 0;"

        sql_backend.execute(ddl)
        udf_info = FunctionInfo(
            catalog_name=catalog_name,
            schema_name=schema_name,
            name=name,
            full_name=full_name,
        )

        logger.info(f"Function {udf_info.full_name} created")
        return udf_info

    def remove(udf_info: FunctionInfo):
        """Drop the function, tolerating a schema that has already been removed."""
        try:
            sql_backend.execute(f"DROP FUNCTION IF EXISTS {udf_info.full_name}")
        except NotFound as e:
            if "SCHEMA_NOT_FOUND" in str(e):
                logger.warning("Schema was already dropped while executing the test", exc_info=e)
            else:
                # Bare raise keeps the original traceback intact.
                raise

    # NOTE(review): the resource label is "table" although this fixture creates functions;
    # left unchanged because the use of the label inside `factory` is not visible here.
    yield from factory("table", create, remove)
1064+
1065+
10261066
@pytest.fixture
10271067
def make_query(ws, make_table, make_random):
10281068
def create() -> QueryInfo:

src/databricks/labs/ucx/queries/views/grant_detail.sql

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ SELECT
66
WHEN table IS NOT NULL THEN 'TABLE'
77
WHEN database IS NOT NULL THEN 'DATABASE'
88
WHEN catalog IS NOT NULL THEN 'CATALOG'
9+
WHEN udf IS NOT NULL THEN 'UDF'
910
ELSE 'UNKNOWN'
1011
END AS object_type,
1112
CASE
@@ -15,6 +16,7 @@ SELECT
1516
WHEN table IS NOT NULL THEN CONCAT(catalog, '.', database, '.', table)
1617
WHEN database IS NOT NULL THEN CONCAT(catalog, '.', database)
1718
WHEN catalog IS NOT NULL THEN catalog
19+
WHEN udf IS NOT NULL THEN CONCAT(catalog, '.', database, '.', udf)
1820
ELSE 'UNKNOWN'
1921
END AS object_id,
2022
action_type,
@@ -28,5 +30,6 @@ SELECT
2830
principal,
2931
catalog,
3032
database,
31-
table
33+
table,
34+
udf
3235
FROM $inventory.grants where database != split("$inventory",'[.]')[1]

src/databricks/labs/ucx/runtime.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
TablesCrawler,
2020
)
2121
from databricks.labs.ucx.hive_metastore.table_size import TableSizeCrawler
22+
from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler
2223
from databricks.labs.ucx.workspace_access.generic import WorkspaceListing
2324
from databricks.labs.ucx.workspace_access.groups import GroupManager
2425
from databricks.labs.ucx.workspace_access.manager import PermissionManager
@@ -53,7 +54,8 @@ def crawl_grants(cfg: WorkspaceConfig):
5354
ACLs enabled and available for retrieval."""
5455
backend = RuntimeBackend()
5556
tables = TablesCrawler(backend, cfg.inventory_database)
56-
grants = GrantsCrawler(tables)
57+
udfs = UdfsCrawler(backend, cfg.inventory_database)
58+
grants = GrantsCrawler(tables, udfs)
5759
grants.snapshot()
5860

5961

src/databricks/labs/ucx/workspace_access/manager.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
SqlBackend,
1616
)
1717
from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler
18+
from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler
1819
from databricks.labs.ucx.workspace_access import generic, redash, scim, secrets
1920
from databricks.labs.ucx.workspace_access.base import AclSupport, Permissions
2021
from databricks.labs.ucx.workspace_access.groups import MigrationState
@@ -71,7 +72,8 @@ def factory(
7172
secrets_support = secrets.SecretScopesSupport(ws)
7273
scim_support = scim.ScimSupport(ws)
7374
tables_crawler = TablesCrawler(sql_backend, inventory_database)
74-
grants_crawler = GrantsCrawler(tables_crawler)
75+
udfs_crawler = UdfsCrawler(sql_backend, inventory_database)
76+
grants_crawler = GrantsCrawler(tables_crawler, udfs_crawler)
7577
tacl_support = TableAclSupport(grants_crawler, sql_backend)
7678
return cls(
7779
sql_backend, inventory_database, [generic_support, sql_support, secrets_support, scim_support, tacl_support]

src/databricks/labs/ucx/workspace_access/tacl.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ def _from_reduced(self, object_type: str, object_id: str, principal: str, action
6565
case "CATALOG":
6666
catalog = object_id
6767
return Grant(principal=principal, action_type=action_type, catalog=catalog)
68+
case "FUNCTION":
69+
catalog, database, udf = object_id.split(".")
70+
return Grant(principal=principal, action_type=action_type, catalog=catalog, database=database, udf=udf)
6871
case "ANONYMOUS FUNCTION":
6972
catalog = object_id
7073
return Grant(principal=principal, action_type=action_type, catalog=catalog, anonymous_function=True)
@@ -73,7 +76,7 @@ def _from_reduced(self, object_type: str, object_id: str, principal: str, action
7376
return Grant(principal=principal, action_type=action_type, catalog=catalog, any_file=True)
7477

7578
def object_types(self) -> set[str]:
76-
return {"TABLE", "DATABASE", "VIEW", "CATALOG", "ANONYMOUS FUNCTION", "ANY FILE"}
79+
return {"TABLE", "DATABASE", "VIEW", "CATALOG", "FUNCTION", "ANONYMOUS FUNCTION", "ANY FILE"}
7780

7881
def get_apply_task(self, item: Permissions, migration_state: MigrationState):
7982
grant = Grant(**json.loads(item.raw))

0 commit comments

Comments
 (0)