Skip to content

Commit 41309a2

Browse files
authored
Added AWS S3 support for migrate-locations command (#1009)
1 parent bc843c9 commit 41309a2

File tree

4 files changed

+169
-5
lines changed

4 files changed

+169
-5
lines changed

src/databricks/labs/ucx/assessment/aws.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
from databricks.labs.ucx.framework.crawlers import StatementExecutionBackend
1919
from databricks.labs.ucx.hive_metastore import ExternalLocations
20+
from databricks.labs.ucx.hive_metastore.locations import ExternalLocation
2021

2122
logger = logging.getLogger(__name__)
2223

@@ -358,6 +359,11 @@ def get_uc_compatible_roles(self):
358359
return role_actions
359360

360361
def create_uc_roles_cli(self, *, single_role=True, role_name="UC_ROLE", policy_name="UC_POLICY"):
362+
# Get the missing paths
363+
# Identify the S3 prefixes
364+
# Create the roles and policies for the missing S3 prefixes
365+
# If single_role is True, create a single role and policy for all the missing S3 prefixes
366+
# If single_role is False, create a role and policy for each missing S3 prefix
361367
missing_paths = self._identify_missing_paths()
362368
s3_prefixes = set()
363369
for missing_path in missing_paths:
@@ -460,3 +466,64 @@ def save_instance_profile_permissions(self) -> str | None:
460466
logger.warning("No Mapping Was Generated.")
461467
return None
462468
return self._installation.save(instance_profile_access, filename=self.INSTANCE_PROFILES_FILE_NAMES)
469+
470+
def _identify_missing_external_locations(
471+
self,
472+
external_locations: Iterable[ExternalLocation],
473+
existing_paths: list[str],
474+
compatible_roles: list[AWSRoleAction],
475+
) -> set[tuple[str, str]]:
476+
# Get recommended external locations
477+
# Get existing external locations
478+
# Get list of paths from get_uc_compatible_roles
479+
# Identify recommended external location paths that don't have an external location and return them
480+
missing_paths = set()
481+
for external_location in external_locations:
482+
existing = False
483+
for path in existing_paths:
484+
if path in external_location.location:
485+
existing = True
486+
continue
487+
if existing:
488+
continue
489+
new_path = PurePath(external_location.location)
490+
matching_role = None
491+
for role in compatible_roles:
492+
if new_path.match(role.resource_path):
493+
matching_role = role.role_arn
494+
continue
495+
if matching_role:
496+
missing_paths.add((external_location.location, matching_role))
497+
498+
return missing_paths
499+
500+
def _get_existing_credentials_dict(self):
501+
credentials = self._ws.storage_credentials.list()
502+
credentials_dict = {}
503+
for credential in credentials:
504+
credentials_dict[credential.aws_iam_role.role_arn] = credential.name
505+
return credentials_dict
506+
507+
def create_external_locations(self, location_init="UCX_location"):
508+
# For each path find out the role that has access to it
509+
# Find out the credential that is pointing to this path
510+
# Create external location for the path using the credential identified
511+
credential_dict = self._get_existing_credentials_dict()
512+
external_locations = ExternalLocations(self._ws, self._backend, self._schema).snapshot()
513+
existing_external_locations = self._ws.external_locations.list()
514+
existing_paths = [external_location.url for external_location in existing_external_locations]
515+
compatible_roles = self.get_uc_compatible_roles()
516+
missing_paths = self._identify_missing_external_locations(external_locations, existing_paths, compatible_roles)
517+
external_location_names = [external_location.name for external_location in existing_external_locations]
518+
external_location_num = 1
519+
for path, role_arn in missing_paths:
520+
if role_arn not in credential_dict:
521+
logger.error(f"Missing credential for role {role_arn} for path {path}")
522+
continue
523+
while True:
524+
external_location_name = f"{location_init}_{external_location_num}"
525+
if external_location_name not in external_location_names:
526+
break
527+
external_location_num += 1
528+
self._ws.external_locations.create(external_location_name, path, credential_dict[role_arn])
529+
external_location_num += 1

src/databricks/labs/ucx/cli.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ def create_uber_principal(w: WorkspaceClient, subscription_id: str):
346346

347347

348348
@ucx.command
349-
def migrate_locations(w: WorkspaceClient):
349+
def migrate_locations(w: WorkspaceClient, aws_profile: str | None = None):
350350
"""This command creates UC external locations. The candidate locations to be created are extracted from guess_external_locations
351351
task in the assessment job. You can run validate_external_locations command to check the candidate locations. Please make sure
352352
the credentials haven migrated before running this command. The command will only create the locations that have corresponded UC Storage Credentials.
@@ -357,7 +357,15 @@ def migrate_locations(w: WorkspaceClient):
357357
service_principal_migration = ExternalLocationsMigration.for_cli(w, installation)
358358
service_principal_migration.run()
359359
if w.config.is_aws:
360-
logger.error("migrate_locations is not yet supported in AWS")
360+
logger.error("Migrate_locations for AWS")
361+
if not shutil.which("aws"):
362+
logger.error("Couldn't find AWS CLI in path. Please install the CLI from https://aws.amazon.com/cli/")
363+
return
364+
installation = Installation.current(w, 'ucx')
365+
config = installation.load(WorkspaceConfig)
366+
sql_backend = StatementExecutionBackend(w, config.warehouse_id)
367+
aws_permissions = AWSResourcePermissions.for_cli(w, sql_backend, aws_profile, config.inventory_database)
368+
aws_permissions.create_external_locations()
361369
if w.config.is_gcp:
362370
logger.error("migrate_locations is not yet supported in GCP")
363371

tests/unit/assessment/test_aws.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
11
import logging
2+
from unittest import mock
23
from unittest.mock import MagicMock, call, create_autospec
34

45
import pytest
56
from databricks.labs.blueprint.installation import MockInstallation
67
from databricks.sdk import WorkspaceClient
78
from databricks.sdk.errors import ResourceDoesNotExist
89
from databricks.sdk.service import iam
10+
from databricks.sdk.service.catalog import (
11+
AwsIamRole,
12+
ExternalLocationInfo,
13+
StorageCredentialInfo,
14+
)
915
from databricks.sdk.service.compute import InstanceProfile
1016

1117
from databricks.labs.ucx.assessment.aws import (
@@ -973,3 +979,76 @@ def test_get_uc_compatible_roles():
973979
},
974980
],
975981
)
982+
983+
984+
def test_create_external_locations():
985+
ws = create_autospec(WorkspaceClient)
986+
aws = create_autospec(AWSResources)
987+
installation = MockInstallation()
988+
installation.load = MagicMock()
989+
installation.load.return_value = [
990+
AWSRoleAction("arn:aws:iam::12345:role/uc-role1", "s3", "WRITE_FILES", "s3://BUCKET1/*"),
991+
AWSRoleAction("arn:aws:iam::12345:role/uc-role1", "s3", "WRITE_FILES", "s3://BUCKET2/*"),
992+
AWSRoleAction("arn:aws:iam::12345:role/uc-rolex", "s3", "WRITE_FILES", "s3://BUCKETX/FOLDERX"),
993+
]
994+
rows = {
995+
"external_locations": [["s3://BUCKET1/FOLDER1", 1], ["s3://BUCKET2/FOLDER2", 1], ["s3://BUCKETX/FOLDERX", 1]]
996+
}
997+
ws.storage_credentials.list.return_value = [
998+
StorageCredentialInfo(
999+
id="1",
1000+
name="cred1",
1001+
aws_iam_role=AwsIamRole("arn:aws:iam::12345:role/uc-role1"),
1002+
),
1003+
StorageCredentialInfo(
1004+
id="2",
1005+
name="credx",
1006+
aws_iam_role=AwsIamRole("arn:aws:iam::12345:role/uc-rolex"),
1007+
),
1008+
]
1009+
errors = {}
1010+
backend = MockBackend(rows=rows, fails_on_first=errors)
1011+
aws_resource_permissions = AWSResourcePermissions(installation, ws, backend, aws, "ucx")
1012+
aws_resource_permissions.create_external_locations()
1013+
calls = [
1014+
call(mock.ANY, 's3://BUCKET1/FOLDER1', 'cred1'),
1015+
call(mock.ANY, 's3://BUCKET2/FOLDER2', 'cred1'),
1016+
call(mock.ANY, 's3://BUCKETX/FOLDERX', 'credx'),
1017+
]
1018+
ws.external_locations.create.assert_has_calls(calls, any_order=True)
1019+
1020+
1021+
def test_create_external_locations_skip_existing():
1022+
ws = create_autospec(WorkspaceClient)
1023+
aws = create_autospec(AWSResources)
1024+
installation = MockInstallation()
1025+
installation.load = MagicMock()
1026+
installation.load.return_value = [
1027+
AWSRoleAction("arn:aws:iam::12345:role/uc-role1", "s3", "WRITE_FILES", "s3://BUCKET1/*"),
1028+
AWSRoleAction("arn:aws:iam::12345:role/uc-rolex", "s3", "WRITE_FILES", "s3://BUCKETX/FOLDERX"),
1029+
]
1030+
rows = {"external_locations": [["s3://BUCKET1/FOLDER1", 1], ["s3://BUCKETX/FOLDERX", 1]]}
1031+
ws.storage_credentials.list.return_value = [
1032+
StorageCredentialInfo(
1033+
id="1",
1034+
name="cred1",
1035+
aws_iam_role=AwsIamRole("arn:aws:iam::12345:role/uc-role1"),
1036+
),
1037+
StorageCredentialInfo(
1038+
id="2",
1039+
name="credx",
1040+
aws_iam_role=AwsIamRole("arn:aws:iam::12345:role/uc-rolex"),
1041+
),
1042+
]
1043+
ws.external_locations.list.return_value = [
1044+
ExternalLocationInfo(name="UCX_FOO_1", url="s3://BUCKETX/FOLDERX", credential_name="credx"),
1045+
]
1046+
1047+
errors = {}
1048+
backend = MockBackend(rows=rows, fails_on_first=errors)
1049+
aws_resource_permissions = AWSResourcePermissions(installation, ws, backend, aws, "ucx")
1050+
aws_resource_permissions.create_external_locations(location_init="UCX_FOO")
1051+
calls = [
1052+
call("UCX_FOO_2", 's3://BUCKET1/FOLDER1', 'cred1'),
1053+
]
1054+
ws.external_locations.create.assert_has_calls(calls, any_order=True)

tests/unit/test_cli.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -368,12 +368,22 @@ def test_migrate_locations_azure(ws):
368368
ws.external_locations.list.assert_called()
369369

370370

371-
def test_migrate_locations_aws(ws, caplog):
371+
def test_migrate_locations_aws(ws, caplog, mocker):
372+
mocker.patch("shutil.which", return_value="/path/aws")
372373
ws.config.is_azure = False
373374
ws.config.is_aws = True
374375
ws.config.is_gcp = False
375-
migrate_locations(ws)
376-
assert "migrate_locations is not yet supported in AWS" in caplog.messages
376+
with pytest.raises(ResourceWarning):
377+
migrate_locations(ws, aws_profile="profile")
378+
379+
380+
def test_missing_aws_cli(ws, caplog, mocker):
381+
mocker.patch("shutil.which", return_value=None)
382+
ws.config.is_azure = False
383+
ws.config.is_aws = True
384+
ws.config.is_gcp = False
385+
migrate_locations(ws, aws_profile="profile")
386+
assert "Couldn't find AWS CLI in path. Please install the CLI from https://aws.amazon.com/cli/" in caplog.messages
377387

378388

379389
def test_migrate_locations_gcp(ws, caplog):

0 commit comments

Comments
 (0)