Commit 46ea0f1
Added databricks labs ucx validate-external-locations command for cli (#715)
Description: This change validates the external locations referenced by external tables in the Hive metastore against the external locations already defined in Unity Catalog. For each location that already exists, it prints the count of tables that can be migrated; for the missing ones, it lists the external locations that still need to be created and generates a Terraform (.tf) file defining them. Changes: labs.yml registers the new command; cli.py adds the code that invokes it; locations.py adds a field with the count of tables using each external location, logic to identify duplicates for JDBC connection objects, and functionality to compare locations, print the details, and generate the tf file; existing location test cases were updated and new test cases were added for the mapping.
1 parent 2168f20 commit 46ea0f1
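
For context, once installed the command is invoked as databricks labs ucx validate-external-locations. A sketch of the output to expect, assuming one location that is already configured in Unity Catalog and one that is missing (the messages mirror the logger calls added in locations.py; the names and counts are illustrative):

    following external locations are already configured.
    sharing details of # tables that can be migrated for each location
    12 tables can be migrated using external location loc1.
    following external location need to be created.
    3 tables can be migrated using external location s3://bucket/prefix/.
    external_locations.tf file written to /Users/<user>/.ucx/external_locations.tf. Do you want to open it?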

File tree

5 files changed (+204 lines, −5 lines):

labs.yml
src/databricks/labs/ucx/cli.py
src/databricks/labs/ucx/hive_metastore/locations.py
tests/integration/hive_metastore/test_external_locations.py
tests/unit/hive_metastore/test_locations.py

labs.yml

Lines changed: 3 additions & 0 deletions
@@ -43,3 +43,6 @@ commands:

   - name: create-table-mapping
     description: create initial table mapping for review
+
+  - name: validate-external-locations
+    description: validates and provides mapping to external table to external location and shared generation tf scripts

src/databricks/labs/ucx/cli.py

Lines changed: 14 additions & 1 deletion
@@ -10,7 +10,7 @@
 from databricks.labs.ucx.config import AccountConfig, ConnectConfig
 from databricks.labs.ucx.framework.crawlers import StatementExecutionBackend
 from databricks.labs.ucx.framework.tui import Prompts
-from databricks.labs.ucx.hive_metastore import TablesCrawler
+from databricks.labs.ucx.hive_metastore import ExternalLocations, TablesCrawler
 from databricks.labs.ucx.hive_metastore.mapping import TableMapping
 from databricks.labs.ucx.install import WorkspaceInstaller
 from databricks.labs.ucx.installer import InstallationManager
@@ -90,13 +90,26 @@ def create_table_mapping():
     webbrowser.open(f"{ws.config.host}/#workspace{path}")


+def validate_external_locations():
+    ws = WorkspaceClient()
+    prompts = Prompts()
+    installation_manager = InstallationManager(ws)
+    installation = installation_manager.for_user(ws.current_user.me())
+    sql_backend = StatementExecutionBackend(ws, installation.config.warehouse_id)
+    location_crawler = ExternalLocations(ws, sql_backend, installation.config.inventory_database)
+    path = location_crawler.save_as_terraform_definitions_on_workspace()
+    if len(path) > 0 and prompts.confirm(f"external_locations.tf file written to {path}. Do you want to open it?"):
+        webbrowser.open(f"{ws.config.host}/#workspace{path}")
+
+
 MAPPING = {
     "open-remote-config": open_remote_config,
     "installations": list_installations,
     "workflows": workflows,
     "sync-workspace-info": sync_workspace_info,
     "manual-workspace-info": manual_workspace_info,
     "create-table-mapping": create_table_mapping,
+    "validate-external-locations": validate_external_locations,
     "skip": skip,
 }
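
For readers who want to run this step outside the Labs CLI dispatch, here is a minimal sketch that mirrors validate_external_locations above. It assumes ambient workspace authentication, and the warehouse id and inventory database are placeholders filled in by hand instead of being read from the installation:

    from databricks.sdk import WorkspaceClient

    from databricks.labs.ucx.framework.crawlers import StatementExecutionBackend
    from databricks.labs.ucx.hive_metastore import ExternalLocations

    ws = WorkspaceClient()  # assumes DATABRICKS_HOST / DATABRICKS_TOKEN are set
    sql_backend = StatementExecutionBackend(ws, "<warehouse_id>")  # placeholder warehouse id
    crawler = ExternalLocations(ws, sql_backend, "ucx")  # "ucx" stands in for the inventory database
    path = crawler.save_as_terraform_definitions_on_workspace()
    if path:
        print(f"external_locations.tf written to {path}")
    else:
        print("no additional external location to be created")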

src/databricks/labs/ucx/hive_metastore/locations.py

Lines changed: 90 additions & 3 deletions
@@ -1,3 +1,4 @@
+import io
 import logging
 import os
 import re
@@ -6,6 +7,7 @@
 from typing import ClassVar

 from databricks.sdk import WorkspaceClient
+from databricks.sdk.service.workspace import ImportFormat

 from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend
 from databricks.labs.ucx.mixins.sql import Row
@@ -16,6 +18,7 @@
 @dataclass
 class ExternalLocation:
     location: str
+    table_count: int


 @dataclass
@@ -30,6 +33,7 @@ class ExternalLocations(CrawlerBase[ExternalLocation]):
     def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
         super().__init__(sbe, "hive_metastore", schema, "external_locations", ExternalLocation)
         self._ws = ws
+        self._folder = f"/Users/{ws.current_user.me().user_name}/.ucx"

     def _external_locations(self, tables: list[Row], mounts) -> Iterable[ExternalLocation]:
         min_slash = 2
@@ -57,12 +61,14 @@ def _external_locations(self, tables: list[Row], mounts) -> Iterable[ExternalLoc
                     + "/"
                 )
                 if common.count("/") > min_slash:
-                    external_locations[loc] = ExternalLocation(common)
+                    table_count = external_locations[loc].table_count
+                    external_locations[loc] = ExternalLocation(common, table_count + 1)
                     dupe = True
                 loc += 1
             if not dupe:
-                external_locations.append(ExternalLocation(os.path.dirname(location) + "/"))
+                external_locations.append(ExternalLocation(os.path.dirname(location) + "/", 1))
             if location.startswith("jdbc"):
+                dupe = False
                 pattern = r"(\w+)=(.*?)(?=\s*,|\s*\])"

                 # Find all matches in the input string
@@ -93,7 +99,13 @@ def _external_locations(self, tables: list[Row], mounts) -> Iterable[ExternalLoc
                     jdbc_location = f"jdbc:{provider.lower()}://{host}:{port}/{database}"
                 else:
                     jdbc_location = f"{location.lower()}/{host}:{port}/{database}"
-                external_locations.append(ExternalLocation(jdbc_location))
+                for ext_loc in external_locations:
+                    if ext_loc.location == jdbc_location:
+                        ext_loc.table_count += 1
+                        dupe = True
+                        break
+                if not dupe:
+                    external_locations.append(ExternalLocation(jdbc_location, 1))

         return external_locations
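
To make the JDBC branch above concrete: the regex pulls key=value pairs out of the table's storage_properties string, and the extracted host, port, database, and provider are reassembled into a canonical jdbc location, which is the string the duplicate check compares on. A small standalone sketch, with the sample properties borrowed from the tests below:

    import re

    props = (
        "[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, "
        "port=1234, dbtable=sometable, provider=providerknown]"
    )
    pattern = r"(\w+)=(.*?)(?=\s*,|\s*\])"
    kv = dict(re.findall(pattern, props))  # {'database': 'test_db', 'host': '...', ...}
    jdbc_location = f"jdbc:{kv['provider'].lower()}://{kv['host']}:{kv['port']}/{kv['database']}"
    print(jdbc_location)  # jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db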

@@ -113,6 +125,81 @@ def _try_fetch(self) -> Iterable[ExternalLocation]:
         for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
             yield ExternalLocation(*row)

+    def _get_ext_location_definitions(self, missing_locations: list[ExternalLocation]) -> list:
+        tf_script = []
+        cnt = 1
+        for loc in missing_locations:
+            if loc.location.startswith("s3://"):
+                res_name = loc.location[5:].rstrip("/").replace("/", "_")
+            elif loc.location.startswith("gcs://"):
+                res_name = loc.location[6:].rstrip("/").replace("/", "_")
+            elif loc.location.startswith("abfss://"):
+                container_name = loc.location[8 : loc.location.index("@")]
+                res_name = (
+                    loc.location[loc.location.index("@") + 1 :]
+                    .replace(".dfs.core.windows.net", "")
+                    .rstrip("/")
+                    .replace("/", "_")
+                )
+                res_name = f"{container_name}_{res_name}"
+            else:
+                # if the cloud storage url doesn't match the above condition or incorrect (example wasb://)
+                # dont generate tf script and ignore
+                logger.warning(f"unsupported storage format {loc.location}")
+                continue
+            script = f'resource "databricks_external_location" "{res_name}" {{ \n'
+            script += f'    name = "{res_name}"\n'
+            script += f'    url = "{loc.location.rstrip("/")}"\n'
+            script += "    credential_name = databricks_storage_credential.<storage_credential_reference>.id\n"
+            script += "}\n"
+            tf_script.append(script)
+            cnt += 1
+        return tf_script
+
+    def _match_table_external_locations(self) -> tuple[list[list], list[ExternalLocation]]:
+        external_locations = list(self._ws.external_locations.list())
+        location_path = [_.url.lower() for _ in external_locations]
+        table_locations = self.snapshot()
+        matching_locations = []
+        missing_locations = []
+        for loc in table_locations:
+            # external_location.list returns url without trailing "/" but ExternalLocation.snapshot
+            # does, so removing the trailing slash before comparing
+            if loc.location.rstrip("/").lower() in location_path:
+                # identify the index of the matching external_locations
+                iloc = location_path.index(loc.location.rstrip("/"))
+                matching_locations.append([external_locations[iloc].name, loc.table_count])
+                continue
+            missing_locations.append(loc)
+        return matching_locations, missing_locations
+
+    def save_as_terraform_definitions_on_workspace(self, folder: str | None = None) -> str:
+        if folder:
+            self._folder = folder
+        matching_locations, missing_locations = self._match_table_external_locations()
+        if len(matching_locations) > 0:
+            logger.info("following external locations are already configured.")
+            logger.info("sharing details of # tables that can be migrated for each location")
+            for _ in matching_locations:
+                logger.info(f"{_[1]} tables can be migrated using external location {_[0]}.")
+        if len(missing_locations) > 0:
+            logger.info("following external location need to be created.")
+            for _ in missing_locations:
+                logger.info(f"{_.table_count} tables can be migrated using external location {_.location}.")
+            buffer = io.StringIO()
+            for script in self._get_ext_location_definitions(missing_locations):
+                buffer.write(script)
+            buffer.seek(0)
+            return self._overwrite_mapping(buffer)
+        else:
+            logger.info("no additional external location to be created.")
+            return ""
+
+    def _overwrite_mapping(self, buffer) -> str:
+        path = f"{self._folder}/external_locations.tf"
+        self._ws.workspace.upload(path, buffer, overwrite=True, format=ImportFormat.AUTO)
+        return path
+

 class Mounts(CrawlerBase[Mount]):
     def __init__(self, backend: SqlBackend, ws: WorkspaceClient, inventory_database: str):
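
The Terraform resource names produced by _get_ext_location_definitions follow a simple derivation from the storage URL; the standalone sketch below restates those rules outside the class (the asserted values come from the unit tests further down):

    def resource_name(url: str) -> str | None:
        # restatement of the naming rules in _get_ext_location_definitions
        if url.startswith("s3://"):
            return url[5:].rstrip("/").replace("/", "_")
        if url.startswith("gcs://"):
            return url[6:].rstrip("/").replace("/", "_")
        if url.startswith("abfss://"):
            container = url[8 : url.index("@")]
            account_and_path = (
                url[url.index("@") + 1 :]
                .replace(".dfs.core.windows.net", "")
                .rstrip("/")
                .replace("/", "_")
            )
            return f"{container}_{account_and_path}"
        return None  # anything else (e.g. wasb://) is skipped with a warning

    assert resource_name("s3://test_location/test1") == "test_location_test1"
    assert resource_name("abfss://[email protected]/test2") == "cont1_storagetest1_test2"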

tests/integration/hive_metastore/test_external_locations.py

Lines changed: 29 additions & 1 deletion
@@ -1,6 +1,10 @@
 import logging

-from databricks.labs.ucx.hive_metastore.locations import ExternalLocations, Mount
+from databricks.labs.ucx.hive_metastore.locations import (
+    ExternalLocation,
+    ExternalLocations,
+    Mount,
+)
 from databricks.labs.ucx.hive_metastore.tables import Table

 logger = logging.getLogger(__name__)
@@ -44,6 +48,17 @@ def test_external_locations(ws, sql_backend, inventory_schema, env_or_skip):
             port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted), \
             provider=providerknown]",
         ),
+        Table(
+            "hive_metastore",
+            "foo",
+            "bar2",
+            "EXTERNAL",
+            "delta",
+            location="jdbc://providerknown/",
+            storage_properties="[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
+            port=1234, dbtable=sometable2, user=*********(redacted), password=*********(redacted), \
+            provider=providerknown]",
+        ),
         Table(
             "hive_metastore",
             "foo",
@@ -68,4 +83,17 @@ def test_external_locations(ws, sql_backend, inventory_schema, env_or_skip):
     )
     assert results[3].location == "jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db"
     assert results[4].location == "jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
+    assert results[4].table_count == 2
     assert results[5].location == "jdbc://providerunknown//somedb.us-east-1.rds.amazonaws.com:1234/test_db"
+
+
+def test_save_external_location_mapping_missing_location(ws, sql_backend, inventory_schema):
+    logger.info("setting up fixtures")
+    locations = [
+        ExternalLocation("abfss://cont1@storage123/test_location", 2),
+        ExternalLocation("abfss://cont1@storage456/test_location2", 1),
+    ]
+    sql_backend.save_table(f"{inventory_schema}.external_locations", locations, ExternalLocation)
+    location_crawler = ExternalLocations(ws, sql_backend, inventory_schema)
+    path = location_crawler.save_as_terraform_definitions_on_workspace()
+    assert ws.workspace.get_status(path)

tests/unit/hive_metastore/test_locations.py

Lines changed: 68 additions & 0 deletions
@@ -1,6 +1,7 @@
 from unittest.mock import MagicMock, Mock

 from databricks.sdk.dbutils import MountInfo
+from databricks.sdk.service.catalog import ExternalLocationInfo

 from databricks.labs.ucx.hive_metastore.locations import (
     ExternalLocations,
@@ -79,6 +80,14 @@ def test_external_locations():
             provider=providerknown]",
         ]
     ),
+    row_factory(
+        [
+            "jdbc:providerknown:/",
+            "[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
+            port=1234, dbtable=sometable2, user=*********(redacted), password=*********(redacted), \
+            provider=providerknown]",
+        ]
+    ),
     row_factory(
         [
             "jdbc:providerunknown:/",
@@ -91,6 +100,7 @@ def test_external_locations():
     result_set = crawler._external_locations(sample_locations, sample_mounts)
     assert len(result_set) == 7
     assert result_set[0].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/"
+    assert result_set[0].table_count == 2
     assert result_set[1].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/"
     assert (
         result_set[3].location
@@ -99,3 +109,61 @@ def test_external_locations():
     assert result_set[4].location == "jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db"
     assert result_set[5].location == "jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
     assert result_set[6].location == "jdbc:providerunknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
+
+
+def make_row(data, columns):
+    row = Row(data)
+    row.__columns__ = columns
+    return row
+
+
+def test_save_external_location_mapping_missing_location():
+    ws = MagicMock()
+    select_cols = ["location", "storage_properties"]
+    sbe = MockBackend(
+        rows={
+            "SELECT location, storage_properties FROM test.tables WHERE location IS NOT NULL": [
+                make_row(("s3://test_location/test1/table1", ""), select_cols),
+                make_row(("gcs://test_location2/test2/table2", ""), select_cols),
+                make_row(("abfss://[email protected]/test2/table3", ""), select_cols),
+            ],
+        }
+    )
+    location_crawler = ExternalLocations(ws, sbe, "test")
+    ws.external_locations.list.return_value = [ExternalLocationInfo(name="loc1", url="s3://test_location/test11")]
+    location_crawler.save_as_terraform_definitions_on_workspace("~/.ucx")
+    (path, content), _ = ws.workspace.upload.call_args
+    assert "~/.ucx/external_locations.tf" == path
+    assert (
+        'resource "databricks_external_location" "test_location_test1" { \n'
+        '    name = "test_location_test1"\n'
+        '    url = "s3://test_location/test1"\n'
+        "    credential_name = databricks_storage_credential.<storage_credential_reference>.id\n"
+        "}\n"
+        'resource "databricks_external_location" "test_location2_test2" { \n'
+        '    name = "test_location2_test2"\n'
+        '    url = "gcs://test_location2/test2"\n'
+        "    credential_name = databricks_storage_credential.<storage_credential_reference>.id\n"
+        "}\n"
+        'resource "databricks_external_location" "cont1_storagetest1_test2" { \n'
+        '    name = "cont1_storagetest1_test2"\n'
+        '    url = "abfss://[email protected]/test2"\n'
+        "    credential_name = databricks_storage_credential.<storage_credential_reference>.id\n"
+        "}\n"
+    ) == content.read()
+
+
+def test_save_external_location_mapping_no_missing_location():
+    ws = MagicMock()
+    select_cols = ["location", "storage_properties"]
+    sbe = MockBackend(
+        rows={
+            "SELECT location, storage_properties FROM test.tables WHERE location IS NOT NULL": [
+                make_row(("s3://test_location/test1/table1", ""), select_cols),
+            ],
+        }
+    )
+    location_crawler = ExternalLocations(ws, sbe, "test")
+    ws.external_locations.list.return_value = [ExternalLocationInfo(name="loc1", url="s3://test_location/test1")]
+    location_crawler.save_as_terraform_definitions_on_workspace("~/.ucx")
+    ws.workspace.upload.assert_not_called()
