Skip to content

Commit 9981fac

Browse files
authored
Fixed External Location widget to return hostname for jdbc: locations (#621)
The UCX tool fetches the 'location' from the table properties while for an external location with jdbc, the host information is stored in 'storage properties'. Add a field for Table class and fetch Storage Properties from the table. Using the host field from Storage properties, create the full jdbc url Older PR closed due to Git issues: #582
1 parent 44143c8 commit 9981fac

File tree

5 files changed

+160
-17
lines changed

5 files changed

+160
-17
lines changed

src/databricks/labs/ucx/hive_metastore/data_objects.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import logging
12
import os
3+
import re
24
import typing
35
from dataclasses import dataclass
46

@@ -8,6 +10,8 @@
810
from databricks.labs.ucx.hive_metastore.mounts import Mounts
911
from databricks.labs.ucx.mixins.sql import Row
1012

13+
logger = logging.getLogger(__name__)
14+
1115

1216
@dataclass
1317
class ExternalLocation:
@@ -32,8 +36,10 @@ def _external_locations(self, tables: list[Row], mounts) -> list[ExternalLocatio
3236
if location[5:].startswith(mount.name):
3337
location = location[5:].replace(mount.name, mount.source)
3438
break
35-
if not location.startswith("dbfs") and (
36-
self._prefix_size[0] < location.find(":/") < self._prefix_size[1]
39+
if (
40+
not location.startswith("dbfs")
41+
and (self._prefix_size[0] < location.find(":/") < self._prefix_size[1])
42+
and not location.startswith("jdbc")
3743
):
3844
dupe = False
3945
loc = 0
@@ -50,10 +56,47 @@ def _external_locations(self, tables: list[Row], mounts) -> list[ExternalLocatio
5056
loc += 1
5157
if not dupe:
5258
external_locations.append(ExternalLocation(os.path.dirname(location) + "/"))
59+
if location.startswith("jdbc"):
60+
pattern = r"(\w+)=(.*?)(?=\s*,|\s*\])"
61+
62+
# Find all matches in the input string
63+
# Storage properties is of the format
64+
# "[personalAccessToken=*********(redacted), \
65+
# httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be, host=dbc-test1-aa11.cloud.databricks.com, \
66+
# dbtable=samples.nyctaxi.trips]"
67+
matches = re.findall(pattern, table.storage_properties)
68+
69+
# Create a dictionary from the matches
70+
result_dict = dict(matches)
71+
72+
# Fetch the value of host from the newly created dict
73+
host = result_dict.get("host", "")
74+
port = result_dict.get("port", "")
75+
database = result_dict.get("database", "")
76+
httppath = result_dict.get("httpPath", "")
77+
provider = result_dict.get("provider", "")
78+
# dbtable = result_dict.get("dbtable", "")
79+
80+
# currently supporting databricks and mysql external tables
81+
# add other jdbc types
82+
if "databricks" in location.lower():
83+
jdbc_location = f"jdbc:databricks://{host};httpPath={httppath}"
84+
elif "mysql" in location.lower():
85+
jdbc_location = f"jdbc:mysql://{host}:{port}/{database}"
86+
elif not provider == "":
87+
jdbc_location = f"jdbc:{provider.lower()}://{host}:{port}/{database}"
88+
else:
89+
jdbc_location = f"{location.lower()}/{host}:{port}/{database}"
90+
external_locations.append(ExternalLocation(jdbc_location))
91+
5392
return external_locations
5493

5594
def _external_location_list(self):
56-
tables = list(self._backend.fetch(f"SELECT location FROM {self._schema}.tables WHERE location IS NOT NULL"))
95+
tables = list(
96+
self._backend.fetch(
97+
f"SELECT location, storage_properties FROM {self._schema}.tables WHERE location IS NOT NULL"
98+
)
99+
)
57100
mounts = Mounts(self._backend, self._ws, self._schema).snapshot()
58101
return self._external_locations(list(tables), list(mounts))
59102

src/databricks/labs/ucx/hive_metastore/tables.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ class Table:
2626
view_text: str = None
2727
upgraded_to: str = None
2828

29+
storage_properties: str = None
30+
2931
@property
3032
def is_delta(self) -> bool:
3133
if self.table_format is None:
@@ -162,6 +164,7 @@ def _describe(self, catalog: str, database: str, table: str) -> Table | None:
162164
location=describe.get("Location", None),
163165
view_text=describe.get("View Text", None),
164166
upgraded_to=self._parse_table_props(describe.get("Table Properties", "")).get("upgraded_to", None),
167+
storage_properties=self._parse_table_props(describe.get("Storage Properties", "")),
165168
)
166169
except Exception as e:
167170
# TODO: https://github.com/databrickslabs/ucx/issues/406

src/databricks/labs/ucx/hive_metastore/tables.scala

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import org.apache.spark.sql.DataFrame
88

99
// must follow the same structure as databricks.labs.ucx.hive_metastore.tables.Table
1010
case class TableDetails(catalog: String, database: String, name: String, object_type: String,
11-
table_format: String, location: String, view_text: String, upgraded_to: String)
11+
table_format: String, location: String, view_text: String, upgraded_to: String, storage_properties: String)
1212

1313
// recording error log in the database
1414
case class TableError(catalog: String, database: String, name: String, error: String)
@@ -37,10 +37,22 @@ def metadataForAllTables(databases: Seq[String], queue: ConcurrentLinkedQueue[Ta
3737
failures.add(TableError("hive_metastore", databaseName, tableName, s"result is null"))
3838
None
3939
} else {
40-
val upgraded_to=table.properties.get("upgraded_to")
40+
val upgraded_to = table.properties.get("upgraded_to")
41+
val redactedKey = "*********"
42+
43+
val formattedString = table.storage.properties.map {
44+
case (key, value) =>
45+
if (key == "personalAccessToken")
46+
s"$key=$redactedKey(redacted)"
47+
else if (key.equalsIgnoreCase("password"))
48+
s"$key=$redactedKey(redacted)"
49+
else
50+
s"$key=$value"
51+
}.mkString("[", ", ", "]")
52+
4153
Some(TableDetails("hive_metastore", databaseName, tableName, table.tableType.name, table.provider.orNull,
4254
table.storage.locationUri.map(_.toString).orNull, table.viewText.orNull,
43-
upgraded_to match {case Some(target) => target case None => null}))
55+
upgraded_to match { case Some(target) => target case None => null }, formattedString))
4456
}
4557
} catch {
4658
case err: Throwable =>

tests/integration/hive_metastore/test_external_locations.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,60 @@ def test_external_locations(ws, sql_backend, inventory_schema, env_or_skip):
1313
Table("hive_metastore", "foo", "bar", "MANAGED", "delta", location="s3://test_location/test1/table1"),
1414
Table("hive_metastore", "foo", "bar", "EXTERNAL", "delta", location="s3://test_location/test2/table2"),
1515
Table("hive_metastore", "foo", "bar", "EXTERNAL", "delta", location="dbfs:/mnt/foo/test3/table3"),
16+
Table(
17+
"hive_metastore",
18+
"foo",
19+
"bar",
20+
"EXTERNAL",
21+
"delta",
22+
location="jdbc://databricks/",
23+
storage_properties="[personalAccessToken=*********(redacted), \
24+
httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be, host=dbc-test1-aa11.cloud.databricks.com, \
25+
dbtable=samples.nyctaxi.trips]",
26+
),
27+
Table(
28+
"hive_metastore",
29+
"foo",
30+
"bar",
31+
"EXTERNAL",
32+
"delta",
33+
location="jdbc://MYSQL/",
34+
storage_properties="[database=test_db, host=somemysql.us-east-1.rds.amazonaws.com, \
35+
port=3306, dbtable=movies, user=*********(redacted), password=*********(redacted)]",
36+
),
37+
Table(
38+
"hive_metastore",
39+
"foo",
40+
"bar",
41+
"EXTERNAL",
42+
"delta",
43+
location="jdbc://providerknown/",
44+
storage_properties="[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
45+
port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted), \
46+
provider=providerknown]",
47+
),
48+
Table(
49+
"hive_metastore",
50+
"foo",
51+
"bar",
52+
"EXTERNAL",
53+
"delta",
54+
location="jdbc://providerunknown/",
55+
storage_properties="[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
56+
port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted)]",
57+
),
1658
]
1759
sql_backend.save_table(f"{inventory_schema}.tables", tables, Table)
1860
sql_backend.save_table(f"{inventory_schema}.mounts", [Mount("/mnt/foo", "s3://bar")], Mount)
1961

2062
crawler = ExternalLocationCrawler(ws, sql_backend, inventory_schema)
2163
results = crawler.snapshot()
22-
assert len(results) == 2
64+
assert len(results) == 6
2365
assert results[1].location == "s3://bar/test3/"
66+
assert (
67+
results[2].location
68+
== "jdbc:databricks://dbc-test1-aa11.cloud.databricks.com;httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be"
69+
)
70+
assert results[3].location == "jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db"
71+
assert results[4].location == "jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
72+
assert results[5].location == "jdbc://providerunknown//somedb.us-east-1.rds.amazonaws.com:1234/test_db"

tests/unit/assessment/test_assessment.py

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,22 +65,58 @@ def test_spark_version_compatibility():
6565

6666
def test_external_locations():
6767
crawler = ExternalLocationCrawler(Mock(), MockBackend(), "test")
68-
row_factory = type("Row", (Row,), {"__columns__": ["location"]})
68+
row_factory = type("Row", (Row,), {"__columns__": ["location", "storage_properties"]})
6969
sample_locations = [
70-
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table"]),
71-
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table2"]),
72-
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/testloc/Table3"]),
73-
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/anotherloc/Table4"]),
74-
row_factory(["dbfs:/mnt/ucx/database1/table1"]),
75-
row_factory(["dbfs:/mnt/ucx/database2/table2"]),
76-
row_factory(["DatabricksRootmntDatabricksRoot"]),
70+
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table", ""]),
71+
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table2", ""]),
72+
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/testloc/Table3", ""]),
73+
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/anotherloc/Table4", ""]),
74+
row_factory(["dbfs:/mnt/ucx/database1/table1", ""]),
75+
row_factory(["dbfs:/mnt/ucx/database2/table2", ""]),
76+
row_factory(["DatabricksRootmntDatabricksRoot", ""]),
77+
row_factory(
78+
[
79+
"jdbc:databricks://",
80+
"[personalAccessToken=*********(redacted), \
81+
httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be, host=dbc-test1-aa11.cloud.databricks.com, \
82+
dbtable=samples.nyctaxi.trips]",
83+
]
84+
),
85+
row_factory(
86+
[
87+
"jdbc:/MYSQL",
88+
"[database=test_db, host=somemysql.us-east-1.rds.amazonaws.com, \
89+
port=3306, dbtable=movies, user=*********(redacted), password=*********(redacted)]",
90+
]
91+
),
92+
row_factory(
93+
[
94+
"jdbc:providerknown:/",
95+
"[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
96+
port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted), \
97+
provider=providerknown]",
98+
]
99+
),
100+
row_factory(
101+
[
102+
"jdbc:providerunknown:/",
103+
"[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
104+
port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted)]",
105+
]
106+
),
77107
]
78108
sample_mounts = [Mount("/mnt/ucx", "s3://us-east-1-ucx-container")]
79109
result_set = crawler._external_locations(sample_locations, sample_mounts)
80-
assert len(result_set) == 3
110+
assert len(result_set) == 7
81111
assert result_set[0].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/"
82112
assert result_set[1].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/"
83-
assert result_set[2].location == "s3://us-east-1-ucx-container/"
113+
assert (
114+
result_set[3].location
115+
== "jdbc:databricks://dbc-test1-aa11.cloud.databricks.com;httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be"
116+
)
117+
assert result_set[4].location == "jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db"
118+
assert result_set[5].location == "jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
119+
assert result_set[6].location == "jdbc:providerunknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
84120

85121

86122
def test_job_assessment():

0 commit comments

Comments
 (0)