
Commit f4e5989

Removed redundant pyspark, databricks-connect, delta-spark, and pandas dependencies (#193)
This PR removes the redundant pyspark, databricks-connect, delta-spark, and pandas dependencies and their usages. With these gone, the same crawler implementation can be used consistently across HMS crawling and workspace permissions. This PR supersedes and closes #105.
1 parent: 0898bd6
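The "consistent crawlers" point is concrete in the diffs below: the HMS table crawler and the permissions inventory now sit on the same SqlBackend abstraction instead of a Spark session. A minimal sketch of the shared pattern, assuming an authenticated WorkspaceClient; the warehouse id, catalog, and schema names are illustrative, and the TablesCrawler import path is assumed rather than shown in this diff:

from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.inventory.permissions_inventory import PermissionsInventoryTable
from databricks.labs.ucx.tacl._internal import StatementExecutionBackend
from databricks.labs.ucx.tacl.tables import TablesCrawler  # import path assumed

ws = WorkspaceClient()
backend = StatementExecutionBackend(ws, "8f3a2c1b9d0e4f56")  # illustrative warehouse id

# Both crawlers take the same backend, so HMS crawling and workspace
# permissions go through one SQL layer instead of pyspark/pandas.
tables = TablesCrawler(backend, "hive_metastore", "ucx")
permissions = PermissionsInventoryTable(backend, "ucx")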

File tree

12 files changed (+95, -204 lines)

pyproject.toml

Lines changed: 1 addition & 13 deletions

@@ -31,15 +31,11 @@ dependencies = [
 
     # TODO: remove later
     "typer[all]>=0.9.0,<0.10.0",
-    "pandas>=2.0.3,<3.0.0",
     "ratelimit>=2.2.1,<3.0.0",
     "tenacity>=8.2.2,<9.0.0",
 ]
 
 [project.optional-dependencies]
-dbconnect = [
-    "databricks-connect>=13.2.0,<=14.0.0"
-]
 test = [
     "coverage[toml]>=6.5",
     "pytest",
@@ -62,9 +58,7 @@ path = "src/databricks/labs/ucx/__about__.py"
 
 [tool.hatch.envs.unit]
 dependencies = [
-    "databricks-labs-ucx[test]",
-    "pyspark>=3.4.0,<=3.5.0",
-    "delta-spark>=2.4.0,<3.0.0"
+    "databricks-labs-ucx[test]"
 ]
 
 [tool.hatch.envs.unit.scripts]
@@ -74,8 +68,6 @@ test-cov-report = "pytest --cov src tests/unit --cov-report=html"
 [tool.hatch.envs.integration]
 dependencies = [
     "databricks-labs-ucx[test]",
-    "databricks-labs-ucx[dbconnect]",
-    "delta-spark>=2.4.0,<3.0.0"
 ]
 
 [tool.hatch.envs.integration.scripts]
@@ -108,10 +100,6 @@ profile = "black"
 
 [tool.pytest.ini_options]
 addopts = "-s -p no:warnings -vv --cache-clear"
-filterwarnings = [
-    "ignore:::.*pyspark.broadcast*",
-    "ignore:::.*pyspark.sql.pandas.utils*"
-]
 
 [tool.black]
 target-version = ["py310"]

src/databricks/labs/ucx/inventory/permissions.py

Lines changed: 2 additions & 5 deletions

@@ -7,7 +7,6 @@
 from databricks.labs.ucx.inventory.permissions_inventory import (
     PermissionsInventoryTable,
 )
-from databricks.labs.ucx.inventory.types import PermissionsInventoryItem
 from databricks.labs.ucx.providers.groups_info import GroupMigrationState
 from databricks.labs.ucx.support.impl import SupportsProvider
 from databricks.labs.ucx.utils import ThreadedExecution
@@ -28,8 +27,7 @@ def inventorize_permissions(self):
         crawler_tasks = list(self._supports_provider.get_crawler_tasks())
         logger.info(f"Total crawler tasks: {len(crawler_tasks)}")
         logger.info("Starting the permissions inventorization")
-        execution = ThreadedExecution[PermissionsInventoryItem | None](crawler_tasks)
-        results = execution.run()
+        results = ThreadedExecution.gather("crawl permissions", crawler_tasks)
         items = [item for item in results if item is not None]
         logger.info(f"Total inventorized items: {len(items)}")
         self._permissions_inventory.save(items)
@@ -62,6 +60,5 @@ def apply_group_permissions(self, migration_state: GroupMigrationState, destinat
 
         logger.info(f"Total applier tasks: {len(applier_tasks)}")
         logger.info("Starting the permissions application")
-        execution = ThreadedExecution(applier_tasks)
-        execution.run()
+        ThreadedExecution.gather("apply permissions", applier_tasks)
         logger.info("Permissions were applied")

src/databricks/labs/ucx/inventory/permissions_inventory.py

Lines changed: 15 additions & 34 deletions

@@ -1,50 +1,31 @@
 import logging
 
-from databricks.sdk import WorkspaceClient
-
 from databricks.labs.ucx.inventory.types import PermissionsInventoryItem
-from databricks.labs.ucx.providers.spark import SparkMixin
+from databricks.labs.ucx.tacl._internal import CrawlerBase, SqlBackend
 
 logger = logging.getLogger(__name__)
 
 
-class PermissionsInventoryTable(SparkMixin):
-    def __init__(self, inventory_database: str, ws: WorkspaceClient):
-        super().__init__(ws)
-        self._table = f"hive_metastore.{inventory_database}.permissions"
-
-    @property
-    def _table_schema(self):
-        from pyspark.sql.types import StringType, StructField, StructType
-
-        return StructType(
-            [
-                StructField("object_id", StringType(), True),
-                StructField("support", StringType(), True),
-                StructField("raw_object_permissions", StringType(), True),
-            ]
-        )
-
-    @property
-    def _df(self):
-        return self.spark.table(self._table)
+class PermissionsInventoryTable(CrawlerBase):
+    def __init__(self, backend: SqlBackend, inventory_database: str):
+        super().__init__(backend, "hive_metastore", inventory_database, "permissions")
 
     def cleanup(self):
-        logger.info(f"Cleaning up inventory table {self._table}")
-        self.spark.sql(f"DROP TABLE IF EXISTS {self._table}")
+        logger.info(f"Cleaning up inventory table {self._full_name}")
+        self._exec(f"DROP TABLE IF EXISTS {self._full_name}")
         logger.info("Inventory table cleanup complete")
 
     def save(self, items: list[PermissionsInventoryItem]):
         # TODO: update instead of append
-        logger.info(f"Saving {len(items)} items to inventory table {self._table}")
-        serialized_items = [item.as_dict() for item in items]
-        df = self.spark.createDataFrame(serialized_items, schema=self._table_schema)
-        df.write.mode("append").format("delta").saveAsTable(self._table)
+        logger.info(f"Saving {len(items)} items to inventory table {self._full_name}")
+        self._append_records(PermissionsInventoryItem, items)
         logger.info("Successfully saved the items to inventory table")
 
     def load_all(self) -> list[PermissionsInventoryItem]:
-        logger.info(f"Loading inventory table {self._table}")
-        df = self._df.toPandas()
-
-        logger.info("Successfully loaded the inventory table")
-        return PermissionsInventoryItem.from_pandas(df)
+        logger.info(f"Loading inventory table {self._full_name}")
+        return [
+            PermissionsInventoryItem(object_id, support, raw_object_permissions)
+            for object_id, support, raw_object_permissions in self._fetch(
+                f"SELECT object_id, support, raw_object_permissions FROM {self._full_name}"
+            )
+        ]
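Note that load_all rebuilds each PermissionsInventoryItem positionally, so the SELECT column list must match the dataclass field order (and _append_records presumably relies on the same ordering when writing). A small illustration of that contract, with a made-up row:

from dataclasses import fields

from databricks.labs.ucx.inventory.types import PermissionsInventoryItem

# The dataclass field order drives both the SELECT list and the positional rebuild.
assert [f.name for f in fields(PermissionsInventoryItem)] == [
    "object_id",
    "support",
    "raw_object_permissions",
]

row = ("cluster-123", "clusters", "{}")  # made-up row, as _fetch would yield it
item = PermissionsInventoryItem(*row)
assert item.support == "clusters"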

src/databricks/labs/ucx/inventory/types.py

Lines changed: 1 addition & 19 deletions

@@ -1,8 +1,6 @@
-from dataclasses import asdict, dataclass
+from dataclasses import dataclass
 from typing import Literal
 
-import pandas as pd
-
 from databricks.labs.ucx.generic import StrEnum
 
 Destination = Literal["backup", "account"]
@@ -32,19 +30,3 @@ class PermissionsInventoryItem:
     object_id: str
     support: str  # shall be taken from CRAWLERS dict
     raw_object_permissions: str
-
-    @staticmethod
-    def from_pandas(source: pd.DataFrame) -> list["PermissionsInventoryItem"]:
-        items = source.to_dict(orient="records")
-        return [PermissionsInventoryItem.from_dict(item) for item in items]
-
-    def as_dict(self) -> dict:
-        return asdict(self)
-
-    @classmethod
-    def from_dict(cls, raw: dict) -> "PermissionsInventoryItem":
-        return cls(
-            object_id=raw["object_id"],
-            raw_object_permissions=raw["raw_object_permissions"],
-            support=raw["support"],
-        )

src/databricks/labs/ucx/providers/spark.py

Lines changed: 0 additions & 39 deletions
This file was deleted.

src/databricks/labs/ucx/toolkits/group_migration.py

Lines changed: 14 additions & 2 deletions

@@ -10,10 +10,15 @@
 from databricks.labs.ucx.inventory.verification import VerificationManager
 from databricks.labs.ucx.managers.group import GroupManager
 from databricks.labs.ucx.support.impl import SupportsProvider
+from databricks.labs.ucx.tacl._internal import (
+    RuntimeBackend,
+    SqlBackend,
+    StatementExecutionBackend,
+)
 
 
 class GroupMigrationToolkit:
-    def __init__(self, config: MigrationConfig):
+    def __init__(self, config: MigrationConfig, *, warehouse_id=None):
         self._num_threads = config.num_threads
         self._workspace_start_path = config.workspace_start_path
 
@@ -27,13 +32,20 @@ def __init__(self, config: MigrationConfig):
         self._verify_ws_client(self._ws)
 
         self._group_manager = GroupManager(self._ws, config.groups)
-        self._permissions_inventory = PermissionsInventoryTable(config.inventory_database, self._ws)
+        sql_backend = self._backend(self._ws, warehouse_id)
+        self._permissions_inventory = PermissionsInventoryTable(sql_backend, config.inventory_database)
         self._supports_provider = SupportsProvider(self._ws, self._num_threads, self._workspace_start_path)
         self._permissions_manager = PermissionManager(
             self._ws, self._permissions_inventory, supports_provider=self._supports_provider
         )
         self._verification_manager = VerificationManager(self._ws, self._supports_provider.supports["secrets"])
 
+    @staticmethod
+    def _backend(ws: WorkspaceClient, warehouse_id: str | None = None) -> SqlBackend:
+        if warehouse_id is None:
+            return RuntimeBackend()
+        return StatementExecutionBackend(ws, warehouse_id)
+
     @staticmethod
     def _verify_ws_client(w: WorkspaceClient):
         _me = w.current_user.me()
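Callers pick the backend by passing or omitting the new keyword: the diff suggests RuntimeBackend serves code already running on a Databricks cluster, while StatementExecutionBackend routes SQL through a warehouse. A sketch of both call styles; the MigrationConfig import path is assumed and the warehouse id is illustrative:

from databricks.labs.ucx.config import MigrationConfig  # import path assumed
from databricks.labs.ucx.toolkits.group_migration import GroupMigrationToolkit


def build_toolkit(config: MigrationConfig, warehouse_id: str | None = None) -> GroupMigrationToolkit:
    if warehouse_id is None:
        # On a cluster: _backend() falls back to RuntimeBackend().
        return GroupMigrationToolkit(config)
    # Elsewhere: SQL goes through the given warehouse, e.g.
    # build_toolkit(config, warehouse_id="8f3a2c1b9d0e4f56"),
    # exactly what the updated e2e test below does.
    return GroupMigrationToolkit(config, warehouse_id=warehouse_id)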

src/databricks/labs/ucx/toolkits/table_acls.py

Lines changed: 1 addition & 3 deletions

@@ -25,9 +25,7 @@ def __init__(
         self._tc = TablesCrawler(self._backend(ws, warehouse_id), inventory_catalog, inventory_schema)
         self._gc = GrantsCrawler(self._tc)
 
-        self._databases = (
-            databases if databases else [database["databaseName"] for database in self._tc._all_databases()]
-        )
+        self._databases = databases if databases else [database for (database,) in self._tc._all_databases()]
 
     def database_snapshot(self):
         tables = []
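The rewrite also reflects a return-shape change: _all_databases() used to yield dict-like rows keyed by databaseName and now, with the Spark/pandas path gone, yields plain one-column row tuples, hence the tuple unpacking. Illustrative values:

rows = [("default",), ("ucx",)]  # made-up result of _all_databases()
databases = [database for (database,) in rows]
assert databases == ["default", "ucx"]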

tests/integration/test_e2e.py

Lines changed: 3 additions & 1 deletion

@@ -166,7 +166,9 @@ def test_e2e(
         tacl=TaclConfig(auto=True),
         log_level="DEBUG",
     )
-    toolkit = GroupMigrationToolkit(config)
+
+    warehouse_id = os.environ["TEST_DEFAULT_WAREHOUSE_ID"]
+    toolkit = GroupMigrationToolkit(config, warehouse_id=warehouse_id)
     toolkit.prepare_environment()
 
     group_migration_state = toolkit._group_manager.migration_groups_provider
Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+import os
+
+from databricks.labs.ucx.inventory.permissions_inventory import (
+    PermissionsInventoryTable,
+)
+from databricks.labs.ucx.inventory.types import PermissionsInventoryItem
+from databricks.labs.ucx.tacl._internal import StatementExecutionBackend
+
+
+def test_permissions_save_and_load(ws, make_schema):
+    schema = make_schema().split(".")[-1]
+    backend = StatementExecutionBackend(ws, os.environ["TEST_DEFAULT_WAREHOUSE_ID"])
+    pi = PermissionsInventoryTable(backend, schema)
+
+    saved = [
+        PermissionsInventoryItem(object_id="abc", support="bcd", raw_object_permissions="def"),
+        PermissionsInventoryItem(object_id="efg", support="fgh", raw_object_permissions="ghi"),
+    ]
+
+    pi.save(saved)
+    loaded = pi.load_all()
+
+    assert saved == loaded

tests/unit/conftest.py

Lines changed: 0 additions & 37 deletions
This file was deleted.
