Skip to content

Commit 40568e5

Browse files
authored
Added crawling for init scripts on local files to assessment workflow (#960)
## Changes Scanning init scripts for local files and S3 ### Linked issues #954 Resolves #954 ### Functionality - [ ] added relevant user documentation - [ ] added new CLI command - [ ] modified existing command: `databricks labs ucx ...` - [ ] added a new workflow - [ ] modified existing workflow: `...` - [ ] added a new table - [ ] modified existing table: `...` ### Tests <!-- How is this tested? Please see the checklist below and also describe any other relevant tests --> - [ ] manually tested - [x] added unit tests - [ ] added integration tests - [ ] verified on staging environment (screenshot attached)
1 parent e6d7435 commit 40568e5

File tree

5 files changed

+69
-1
lines changed

5 files changed

+69
-1
lines changed

src/databricks/labs/ucx/assessment/clusters.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
DataSecurityMode,
1313
DbfsStorageInfo,
1414
InitScriptInfo,
15+
LocalFileInfo,
1516
Policy,
1617
WorkspaceStorageInfo,
1718
)
@@ -20,6 +21,7 @@
2021
AZURE_SP_CONF_FAILURE_MSG,
2122
INCOMPATIBLE_SPARK_CONFIG_KEYS,
2223
INIT_SCRIPT_DBFS_PATH,
24+
INIT_SCRIPT_LOCAL_PATH,
2325
azure_sp_conf_present_check,
2426
spark_version_compatibility,
2527
)
@@ -74,6 +76,14 @@ def _get_init_script_data(self, init_script_info: InitScriptInfo) -> str | None:
7476
data = self._ws.workspace.export(workspace_file_destination).content
7577
if data is not None:
7678
return base64.b64decode(data).decode("utf-8")
79+
case InitScriptInfo(file=LocalFileInfo(destination)):
80+
split = destination.split(":/")
81+
if len(split) != INIT_SCRIPT_LOCAL_PATH:
82+
return None
83+
with open(split[1], "r", encoding="utf-8") as file:
84+
data = file.read()
85+
return data
86+
7787
return None
7888
except NotFound:
7989
return None

src/databricks/labs/ucx/assessment/crawlers.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
]
1919
AZURE_SP_CONF_FAILURE_MSG = "Uses azure service principal credentials config in"
2020
INIT_SCRIPT_DBFS_PATH = 2
21+
INIT_SCRIPT_LOCAL_PATH = 2
2122

2223

2324
def azure_sp_conf_in_init_scripts(init_script_data: str) -> bool:
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"autoscale": {
3+
"max_workers": 6,
4+
"min_workers": 1
5+
},
6+
"cluster_id": "01234-11223344-1122334455",
7+
"cluster_name": "UCX Cluster",
8+
"policy_id": "single-user-with-spn",
9+
"spark_version": "13.3.x-cpu-ml-scala2.12",
10+
"init_scripts": [
11+
{
12+
"file": {
13+
"destination": "file:/users/init_scripts/test.sh"
14+
}
15+
},
16+
{
17+
"file": {
18+
"destination": "file"
19+
}
20+
},
21+
{
22+
"workspace": {
23+
"destination": "init.sh"
24+
}
25+
}
26+
]
27+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"autoscale": {
3+
"max_workers": 6,
4+
"min_workers": 1
5+
},
6+
"cluster_id": "01234-11223344-1122334455",
7+
"cluster_name": "UCX Cluster",
8+
"policy_id": "single-user-with-spn",
9+
"spark_version": "13.3.x-cpu-ml-scala2.12",
10+
"init_scripts": [
11+
{
12+
"abfss": {
13+
"destination": "abfss"
14+
}
15+
}
16+
]
17+
}

tests/unit/assessment/test_clusters.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import json
2-
from unittest.mock import MagicMock
2+
from unittest.mock import MagicMock, mock_open, patch
33

44
import pytest
55
from databricks.sdk.errors import DatabricksError, InternalError, NotFound
@@ -84,6 +84,19 @@ def test_cluster_init_script():
8484
assert len(init_crawler) == 1
8585

8686

87+
def test_cluster_file_init_script():
    """A cluster whose init script lives on the local filesystem is still crawled.

    builtins.open is patched so no real file access happens; the crawler should
    read the script content via mock_open and produce one snapshot row.
    """
    ws = workspace_client_mock(cluster_ids=['init-scripts-file'])
    with patch("builtins.open", mock_open(read_data="data")):
        rows = ClustersCrawler(ws, MockBackend(), "ucx").snapshot()
    assert len(rows) == 1
92+
93+
94+
def test_cluster_no_match_file_init_script():
    """An init-script destination that does not match the expected `<scheme>:/<path>`
    split is skipped without error: the cluster is still snapshotted."""
    ws = workspace_client_mock(cluster_ids=['init-scripts-no-match'])
    rows = ClustersCrawler(ws, MockBackend(), "ucx").snapshot()
    assert len(rows) == 1
98+
99+
87100
def test_cluster_init_script_check_dbfs():
88101
ws = workspace_client_mock(cluster_ids=['init-scripts-dbfs'])
89102
ws.dbfs.read().data = "JXNoCmVjaG8gIj0="

0 commit comments

Comments
 (0)