Skip to content

Commit 2eff82b

Browse files
authored
Added DBFS Root resolution when HMS Federation is enabled (#2947)
This PR adds a DBFS Resolver which is used by HMS federation to resolve the DBFS root locations. TODO: - [x] #2954 - [x] resolve special `/user/hive/metastore` as special case of DBFS mount - [x] use `LocationTrie` to come guess locations - #2965 co-authored by @vihangk-db
1 parent 00c89aa commit 2eff82b

File tree

3 files changed

+68
-5
lines changed

3 files changed

+68
-5
lines changed

src/databricks/labs/ucx/contexts/application.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,12 @@ def table_move(self) -> TableMove:
294294

295295
@cached_property
296296
def mounts_crawler(self) -> MountsCrawler:
297-
return MountsCrawler(self.sql_backend, self.workspace_client, self.inventory_database)
297+
return MountsCrawler(
298+
self.sql_backend,
299+
self.workspace_client,
300+
self.inventory_database,
301+
self.config.enable_hms_federation,
302+
)
298303

299304
@cached_property
300305
def azure_service_principal_crawler(self) -> AzureServicePrincipalCrawler:

src/databricks/labs/ucx/hive_metastore/locations.py

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -341,14 +341,18 @@ class Mount:
341341
def as_scheme_prefix(self) -> str:
342342
return f'dbfs:{self.name}' # dbfs:/mnt/mount-name
343343

344-
def as_fuse_prefix(self) -> str:
345-
return f'/dbfs{self.name}' # /dbfs/mnt/mount-name
346-
347344

348345
class MountsCrawler(CrawlerBase[Mount]):
349-
def __init__(self, backend: SqlBackend, ws: WorkspaceClient, inventory_database: str):
346+
def __init__(
347+
self,
348+
backend: SqlBackend,
349+
ws: WorkspaceClient,
350+
inventory_database: str,
351+
enable_hms_federation: bool = False,
352+
):
350353
super().__init__(backend, "hive_metastore", inventory_database, "mounts", Mount)
351354
self._dbutils = ws.dbutils
355+
self._enable_hms_federation = enable_hms_federation
352356

353357
@staticmethod
354358
def _deduplicate_mounts(mounts: list) -> list:
@@ -364,11 +368,48 @@ def _deduplicate_mounts(mounts: list) -> list:
364368
deduplicated_mounts.append(obj)
365369
return deduplicated_mounts
366370

371+
@cached_property
372+
def _jvm(self):
373+
# pylint: disable=import-error,import-outside-toplevel,broad-exception-caught
374+
try:
375+
from pyspark.sql.session import SparkSession # type: ignore[import-not-found]
376+
377+
spark = SparkSession.builder.getOrCreate()
378+
return spark._jvm # pylint: disable=protected-access
379+
except Exception as err:
380+
logger.warning(f"Cannot create Py4j proxy: {err}")
381+
return None
382+
383+
def _resolve_dbfs_root(self) -> Mount | None:
384+
# pylint: disable=broad-exception-caught,too-many-try-statements
385+
try:
386+
jvm = self._jvm
387+
if not jvm:
388+
return None
389+
uri = jvm.java.net.URI
390+
some = jvm.scala.Some
391+
hms_fed_dbfs_utils = jvm.com.databricks.sql.managedcatalog.connections.HmsFedDbfsUtils
392+
root_location_opt = hms_fed_dbfs_utils.resolveDbfsPath(some(uri("dbfs:/user/hive/warehouse")))
393+
if root_location_opt.isDefined():
394+
source: str = root_location_opt.get().toString()
395+
source = source.removesuffix('user/hive/warehouse')
396+
return Mount("/", source)
397+
return None
398+
except Exception as err:
399+
logger.warning(f"Failed to resolve DBFS root location: {err}")
400+
return None
401+
367402
def _crawl(self) -> Iterable[Mount]:
368403
mounts = []
369404
try:
370405
for mount_point, source, _ in self._dbutils.fs.mounts():
371406
mounts.append(Mount(mount_point, source))
407+
if self._enable_hms_federation:
408+
root_mount = self._resolve_dbfs_root()
409+
if root_mount:
410+
# filter out DatabricksRoot, otherwise ExternalLocations.resolve_mount() won't work
411+
mounts = list(filter(lambda _: _.source != 'DatabricksRoot', mounts))
412+
mounts.append(root_mount)
372413
except Exception as error: # pylint: disable=broad-except
373414
if "com.databricks.backend.daemon.dbutils.DBUtilsCore.mounts() is not whitelisted" in str(error):
374415
logger.warning(

tests/unit/hive_metastore/test_locations.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -717,3 +717,20 @@ def my_side_effect(path, **_):
717717
assert results[0].location == "adls://bucket/table1"
718718
assert results[1].location == "dbfs:/mnt/test_mount/table2"
719719
assert results[2].location is None
720+
721+
722+
def test_resolve_dbfs_root_in_hms_federation():
723+
jvm = Mock()
724+
sql_backend = MockBackend()
725+
client = create_autospec(WorkspaceClient)
726+
client.dbutils.fs.mounts.return_value = [MountInfo('/', 'DatabricksRoot', '')]
727+
728+
mounts_crawler = MountsCrawler(sql_backend, client, "test", enable_hms_federation=True)
729+
mounts_crawler.__dict__['_jvm'] = jvm
730+
731+
hms_fed_dbfs_utils = jvm.com.databricks.sql.managedcatalog.connections.HmsFedDbfsUtils
732+
hms_fed_dbfs_utils.resolveDbfsPath().get().toString.return_value = 's3://original/bucket/user/hive/warehouse'
733+
734+
mounts = mounts_crawler.snapshot()
735+
736+
assert [Mount("/", 's3://original/bucket/')] == mounts

0 commit comments

Comments
 (0)