|
| 1 | +import logging |
| 2 | +from urllib.parse import urlparse |
| 3 | + |
| 4 | +from databricks.labs.blueprint.installation import Installation |
| 5 | +from databricks.sdk import WorkspaceClient |
| 6 | +from databricks.sdk.errors.platform import InvalidParameterValue, PermissionDenied |
| 7 | + |
| 8 | +from databricks.labs.ucx.azure.access import AzureResourcePermissions |
| 9 | +from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources |
| 10 | +from databricks.labs.ucx.config import WorkspaceConfig |
| 11 | +from databricks.labs.ucx.framework.crawlers import StatementExecutionBackend |
| 12 | +from databricks.labs.ucx.hive_metastore import ExternalLocations |
| 13 | + |
| 14 | +logger = logging.getLogger(__name__) |
| 15 | + |
| 16 | + |
class ExternalLocationsMigration:
    """Create Unity Catalog external locations for Hive metastore table locations.

    Matches HMS external table locations that are missing in UC against the
    storage credentials previously migrated by UCX (via the prefix-to-credential
    mapping persisted by ``AzureResourcePermissions``), then creates each
    external location with a read-write credential when one matches, falling
    back to a read-only credential otherwise.
    """

    def __init__(
        self,
        ws: WorkspaceClient,
        hms_locations: ExternalLocations,
        resource_permissions: AzureResourcePermissions,
        azurerm: AzureResources,
    ):
        self._ws = ws
        self._hms_locations = hms_locations
        self._resource_permissions = resource_permissions
        self._azurerm = azurerm

    @classmethod
    def for_cli(cls, ws: WorkspaceClient, installation: Installation):
        """Construct an instance wired from the UCX installation config (CLI entry point)."""
        config = installation.load(WorkspaceConfig)
        sql_backend = StatementExecutionBackend(ws, config.warehouse_id)
        hms_locations = ExternalLocations(ws, sql_backend, config.inventory_database)

        azure_mgmt_client = AzureAPIClient(
            ws.config.arm_environment.resource_manager_endpoint,
            ws.config.arm_environment.service_management_endpoint,
        )
        graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com")
        azurerm = AzureResources(azure_mgmt_client, graph_client)

        resource_permissions = AzureResourcePermissions(installation, ws, azurerm, hms_locations)

        return cls(ws, hms_locations, resource_permissions, azurerm)

    def _app_id_credential_name_mapping(self) -> tuple[dict[str, str], dict[str, str]]:
        """Map application ids of credential identities to storage credential names.

        Lists all UC storage credentials and builds two mappings from the
        managed identity / service principal application id to the credential
        name: one for read-write credentials, one for read-only credentials.

        :return: ``(app_id_mapping_write, app_id_mapping_read)``
        """
        # TODO: considering put this logic into the StorageCredentialManager
        app_id_mapping_write: dict[str, str] = {}
        app_id_mapping_read: dict[str, str] = {}
        # max_results=0 — presumably asks the SDK not to cap the page size; verify against SDK docs
        all_credentials = self._ws.storage_credentials.list(max_results=0)
        for credential in all_credentials:
            name = credential.name
            # A credential without a name cannot be referenced by an external location.
            if not name:
                continue

            application_id = None
            if credential.azure_service_principal:
                # Service-principal credential: the application id is directly available.
                application_id = credential.azure_service_principal.application_id
            if credential.azure_managed_identity:
                # Managed-identity credential: resolve the identity's client/application id via ARM.
                application_id = self._azurerm.managed_identity_client_id(
                    credential.azure_managed_identity.access_connector_id,
                    credential.azure_managed_identity.managed_identity_id,
                )
            if not application_id:
                continue

            if credential.read_only:
                app_id_mapping_read[application_id] = name
                continue
            app_id_mapping_write[application_id] = name

        return app_id_mapping_write, app_id_mapping_read

    def _prefix_credential_name_mapping(self) -> tuple[dict[str, str], dict[str, str]]:
        """Map storage container URL prefixes to storage credential names.

        Combines the app-id-to-credential-name mappings with the persisted
        resource permission records (prefix + client id) to produce prefix
        mappings for read-write and read-only credentials.

        :return: ``(prefix_mapping_write, prefix_mapping_read)``
        """
        app_id_mapping_write, app_id_mapping_read = self._app_id_credential_name_mapping()

        prefix_mapping_write: dict[str, str] = {}
        prefix_mapping_read: dict[str, str] = {}
        for permission_mapping in self._resource_permissions.load():
            if permission_mapping.client_id in app_id_mapping_write:
                # Prefer the read-write credential when the same client id has both.
                prefix_mapping_write[permission_mapping.prefix] = app_id_mapping_write[permission_mapping.client_id]
                continue
            if permission_mapping.client_id in app_id_mapping_read:
                prefix_mapping_read[permission_mapping.prefix] = app_id_mapping_read[permission_mapping.client_id]
        return prefix_mapping_write, prefix_mapping_read

    def _create_location_name(self, location_url: str) -> str:
        """Derive a UC external location name from an abfss:// URL.

        ``abfss://container@account.dfs.core.windows.net/a/b`` becomes
        ``container_account_a_b``.
        """
        before_at, _, after_at = location_url.partition('@')
        container_name = before_at.removeprefix("abfss://")
        res_name = after_at.replace(".dfs.core.windows.net", "").rstrip("/").replace("/", "_")
        return f"{container_name}_{res_name}"

    def _create_external_location_helper(
        self, name, url, credential, comment="Created by UCX", read_only=False, skip_validation=False
    ) -> str | None:
        """Create one UC external location.

        :return: the location URL on success, or ``None`` when the location
            overlaps with an existing external location (logged and skipped).
        :raises InvalidParameterValue: for any other invalid-parameter failure.
        """
        try:
            self._ws.external_locations.create(
                name, url, credential, comment=comment, read_only=read_only, skip_validation=skip_validation
            )
            return url
        except InvalidParameterValue as invalid:
            if "overlaps with an existing external location" in str(invalid):
                logger.warning(f"Skip creating external location, see details: {str(invalid)}")
                return None
            # Re-raise with the original traceback intact.
            raise

    def _create_external_location(
        self, location_url: str, prefix_mapping_write: dict[str, str], prefix_mapping_read: dict[str, str]
    ) -> str | None:
        """Create a UC external location for one URL using the best matching credential.

        Tries a read-write credential first, then a read-only one; an empty
        read-only location is retried with ``skip_validation=True`` because
        the READ permission check fails when there is nothing to read.

        :return: the created location URL, or ``None`` when no credential
            matches the URL's container prefix.
        """
        location_name = self._create_location_name(location_url)

        # The container URL (scheme + authority) is the prefix key used by the mappings.
        parsed_url = urlparse(location_url)
        container_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"

        # Try to create the external location with write privilege first.
        if container_url in prefix_mapping_write:
            return self._create_external_location_helper(
                location_name, location_url, prefix_mapping_write[container_url], comment="Created by UCX"
            )
        # No matching write credential: fall back to a read-only external location.
        if container_url in prefix_mapping_read:
            try:
                return self._create_external_location_helper(
                    location_name,
                    location_url,
                    prefix_mapping_read[container_url],
                    comment="Created by UCX",
                    read_only=True,
                )
            except PermissionDenied as denied:
                if "No file available under the location to read" in str(denied):
                    # An empty location fails the READ permission check with a
                    # read-only credential — retry without validation.
                    return self._create_external_location_helper(
                        location_name,
                        location_url,
                        prefix_mapping_read[container_url],
                        comment="Created by UCX",
                        read_only=True,
                        skip_validation=True,
                    )
                raise
        # No credential matched this container.
        return None

    def run(self):
        """Create UC external locations for all missing HMS locations.

        :return: the list of location URLs that could not be created; empty
            when every missing location was migrated.
        """
        # List external locations referenced by HMS tables but absent in UC.
        _, missing_locations = self._hms_locations.match_table_external_locations()
        missing_loc_urls = [loc.location for loc in missing_locations]

        prefix_mapping_write, prefix_mapping_read = self._prefix_credential_name_mapping()

        # Create a UC external location for each URL whose container prefix
        # maps to a migrated storage credential.
        migrated_loc_urls = []
        for location_url in missing_loc_urls:
            migrated_loc_url = self._create_external_location(location_url, prefix_mapping_write, prefix_mapping_read)
            if migrated_loc_url:
                migrated_loc_urls.append(migrated_loc_url)

        leftover_loc_urls = [url for url in missing_loc_urls if url not in migrated_loc_urls]
        if leftover_loc_urls:
            # Newlines separate the checklist items so the log is readable
            # (previously the sentences ran together into one line).
            logger.info(
                "External locations below are not created in UC. You may check following cases and rerun this command:\n"
                "1. Please check the output of 'migrate_credentials' command for storage credentials migration failure.\n"
                "2. If you use service principal in extra_config when create dbfs mount or use service principal "
                "in your code directly for storage access, UCX cannot automatically migrate them to storage credential. "
                "Please manually create those storage credentials first.\n"
                "3. You may have overlapping external location already in UC."
            )
            for loc_url in leftover_loc_urls:
                logger.info(f"Not created external location: {loc_url}")
            return leftover_loc_urls

        logger.info("All UC external locations are created.")
        return leftover_loc_urls
0 commit comments