Skip to content

Commit ba9c52f

Browse files
authored
Refactoring AzureResources, AzureResourcePermissions and related classes to separate Module ucx/azure (#938)
Refactoring AzureResources, AzureResourcePermissions and related classes to separate Module ucx/azure

## Changes
- Moving AzureResource, AzureResources and related data classes from ucx/assessment/azure to ucx/azure/resources
- Moving AzureResourcePermissions and related data classes from ucx/assessment/azure to ucx/azure/access
- Moving related integration test cases to test/integration/azure
- Moving related unit test cases to test/unit/azure

### Linked issues

### Functionality
- [ ] added relevant user documentation
- [ ] added new CLI command
- [x] modified existing command: `databricks labs ucx ...`
- [ ] added a new workflow
- [ ] modified existing workflow: `...`
- [ ] added a new table
- [ ] modified existing table: `...`

### Tests
<!-- How is this tested? Please see the checklist below and also describe any other relevant tests -->
- [x] manually tested
- [x] added unit tests
- [ ] added integration tests
- [ ] verified on staging environment (screenshot attached)
1 parent 914f0b8 commit ba9c52f

File tree

15 files changed

+602
-594
lines changed

15 files changed

+602
-594
lines changed

src/databricks/labs/ucx/assessment/azure.py

Lines changed: 1 addition & 305 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,8 @@
44
from collections.abc import Iterable
55
from dataclasses import dataclass
66

7-
from databricks.labs.blueprint.installation import Installation
87
from databricks.sdk import WorkspaceClient
9-
from databricks.sdk.core import (
10-
ApiClient,
11-
AzureCliTokenSource,
12-
Config,
13-
credentials_provider,
14-
)
158
from databricks.sdk.errors import NotFound
16-
from databricks.sdk.service.catalog import Privilege
179
from databricks.sdk.service.compute import ClusterSource, Policy
1810

1911
from databricks.labs.ucx.assessment.crawlers import (
@@ -25,13 +17,7 @@
2517
logger,
2618
)
2719
from databricks.labs.ucx.assessment.jobs import JobsMixin
28-
from databricks.labs.ucx.config import WorkspaceConfig
29-
from databricks.labs.ucx.framework.crawlers import (
30-
CrawlerBase,
31-
SqlBackend,
32-
StatementExecutionBackend,
33-
)
34-
from databricks.labs.ucx.hive_metastore.locations import ExternalLocations
20+
from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend
3521

3622

3723
@dataclass
@@ -244,293 +230,3 @@ def snapshot(self) -> Iterable[AzureServicePrincipalInfo]:
244230
def _try_fetch(self) -> Iterable[AzureServicePrincipalInfo]:
    """Yield one ``AzureServicePrincipalInfo`` per row of the crawler's table.

    Every row returned by ``self._fetch`` for a ``SELECT *`` over
    ``{schema}.{table}`` is unpacked positionally into the record type.
    """
    query = f"SELECT * FROM {self._schema}.{self._table}"
    yield from (AzureServicePrincipalInfo(*record) for record in self._fetch(query))
247-
248-
249-
@dataclass
class AzureSubscription:
    """Plain record describing a single Azure subscription."""

    # Human-readable name of the subscription.
    name: str
    # Identifier of the subscription.
    subscription_id: str
    # Identifier of the tenant the subscription belongs to.
    tenant_id: str
254-
255-
256-
class AzureResource:
257-
def __init__(self, resource_id: str):
258-
self._pairs = {}
259-
self._resource_id = resource_id
260-
split = resource_id.lstrip("/").split("/")
261-
if len(split) % 2 != 0:
262-
msg = f"not a list of pairs: {resource_id}"
263-
raise ValueError(msg)
264-
i = 0
265-
while i < len(split):
266-
k = split[i]
267-
v = split[i + 1]
268-
i += 2
269-
self._pairs[k] = v
270-
271-
@property
272-
def subscription_id(self):
273-
return self._pairs.get("subscriptions")
274-
275-
@property
276-
def resource_group(self):
277-
return self._pairs.get("resourceGroups")
278-
279-
@property
280-
def storage_account(self):
281-
return self._pairs.get("storageAccounts")
282-
283-
@property
284-
def container(self):
285-
return self._pairs.get("containers")
286-
287-
def __eq__(self, other):
288-
if not isinstance(other, AzureResource):
289-
return NotImplemented
290-
return self._resource_id == other._resource_id
291-
292-
def __repr__(self):
293-
properties = ["subscription_id", "resource_group", "storage_account", "container"]
294-
pairs = [f"{_}={getattr(self, _)}" for _ in properties]
295-
return f'AzureResource<{", ".join(pairs)}>'
296-
297-
def __str__(self):
298-
return self._resource_id
299-
300-
301-
@dataclass
class Principal:
    """Plain record identifying a directory principal."""

    # Application (client) ID of the principal.
    client_id: str
    # Display name of the principal.
    display_name: str
    # Directory object ID of the principal.
    object_id: str
306-
307-
308-
@dataclass
class AzureRoleAssignment:
    """Plain record tying a principal to a named role on a resource."""

    # Resource the role assignments were listed for.
    resource: AzureResource
    # Scope the assignment applies at.
    scope: AzureResource
    # Principal that holds the role.
    principal: Principal
    # Name of the assigned role.
    role_name: str
314-
315-
316-
class AzureResources:
    """Client for the Azure Resource Manager and Microsoft Graph APIs.

    Lists subscriptions, storage accounts, containers and role assignments,
    authenticating with tokens obtained through ``AzureCliTokenSource``.
    Role definition names and resolved principals are cached per instance.
    """

    def __init__(self, ws: WorkspaceClient, *, include_subscriptions=None):
        """Build the Resource Manager and Graph API clients.

        Args:
            ws: workspace client; its ``config.arm_environment`` supplies the
                Resource Manager and service management endpoints.
            include_subscriptions: subscription IDs that ``subscriptions()``
                is allowed to yield. A falsy value is stored as an empty list.
        """
        if not include_subscriptions:
            include_subscriptions = []
        rm_host = ws.config.arm_environment.resource_manager_endpoint
        self._resource_manager = ApiClient(
            Config(
                host=rm_host,
                credentials_provider=self._provider_for(ws.config.arm_environment.service_management_endpoint),
            )
        )
        self._graph = ApiClient(
            Config(
                host="https://graph.microsoft.com",
                credentials_provider=self._provider_for("https://graph.microsoft.com"),
            )
        )
        self._token_source = AzureCliTokenSource(rm_host)
        self._include_subscriptions = include_subscriptions
        # role definition ID -> role name
        self._role_definitions = {}  # type: dict[str, str]
        # principal ID -> resolved principal, or None when Graph reported NotFound
        self._principals: dict[str, Principal | None] = {}

    def _provider_for(self, endpoint: str):
        """Return an "azure-cli" credentials provider issuing tokens for `endpoint`."""

        @credentials_provider("azure-cli", ["host"])
        def _credentials(_: Config):
            token_source = AzureCliTokenSource(endpoint)

            def inner() -> dict[str, str]:
                # A token is requested from the token source on every call.
                token = token_source.token()
                return {"Authorization": f"{token.token_type} {token.access_token}"}

            return inner

        return _credentials

    def _get_subscriptions(self) -> Iterable[AzureSubscription]:
        """Yield every subscription returned by the ``/subscriptions`` endpoint."""
        for subscription in self._get_resource("/subscriptions", api_version="2022-12-01").get("value", []):
            yield AzureSubscription(
                name=subscription["displayName"],
                subscription_id=subscription["subscriptionId"],
                tenant_id=subscription["tenantId"],
            )

    def _tenant_id(self):
        """Return the ``tid`` claim of the current Resource Manager token."""
        token = self._token_source.token()
        return token.jwt_claims().get("tid")

    def subscriptions(self):
        """Yield subscriptions of the current tenant that are explicitly included.

        NOTE(review): with an empty ``include_subscriptions`` nothing is
        yielded at all - confirm that callers always pass the IDs to scan.
        """
        tenant_id = self._tenant_id()
        for subscription in self._get_subscriptions():
            if subscription.tenant_id != tenant_id:
                continue
            if subscription.subscription_id not in self._include_subscriptions:
                continue
            yield subscription

    def _get_resource(self, path: str, api_version: str):
        """GET `path` from the Resource Manager API with the given API version."""
        headers = {"Accept": "application/json"}
        query = {"api-version": api_version}
        return self._resource_manager.do("GET", path, query=query, headers=headers)

    def storage_accounts(self) -> Iterable[AzureResource]:
        """Yield every storage account found in the included subscriptions."""
        for subscription in self.subscriptions():
            logger.info("Checking in subscription %s for storage accounts", subscription.name)
            path = f"/subscriptions/{subscription.subscription_id}/providers/Microsoft.Storage/storageAccounts"
            for storage in self._get_resource(path, "2023-01-01").get("value", []):
                resource_id = storage.get("id")
                if not resource_id:
                    continue
                yield AzureResource(resource_id)

    def containers(self, storage: AzureResource):
        """Yield the blob containers of the given storage account."""
        for raw in self._get_resource(f"{storage}/blobServices/default/containers", "2023-01-01").get("value", []):
            resource_id = raw.get("id")
            if not resource_id:
                continue
            yield AzureResource(resource_id)

    def _get_principal(self, principal_id: str) -> Principal | None:
        """Resolve a principal through Microsoft Graph, caching the outcome.

        Returns:
            The resolved principal, or None when Graph reports ``NotFound``.

        Raises:
            ValueError: if the directory object lacks ``appId``,
                ``displayName`` or ``id``.
        """
        if principal_id in self._principals:
            return self._principals[principal_id]
        try:
            path = f"/v1.0/directoryObjects/{principal_id}"
            raw: dict[str, str] = self._graph.do("GET", path)  # type: ignore[assignment]
        except NotFound:
            # don't load principals from external directories twice
            self._principals[principal_id] = None
            return None
        client_id = raw.get("appId")
        display_name = raw.get("displayName")
        object_id = raw.get("id")
        # Explicit check rather than `assert`: assertions are stripped under
        # `python -O`, which would let a half-populated Principal through.
        if client_id is None or display_name is None or object_id is None:
            msg = f"incomplete directory object for principal {principal_id}: appId, displayName and id are required"
            raise ValueError(msg)
        principal = Principal(client_id, display_name, object_id)
        self._principals[principal_id] = principal
        return principal

    def _get_role_name(self, role_definition_id: str) -> str | None:
        """Return the cached role name for a role definition, fetching it on a miss."""
        if role_definition_id not in self._role_definitions:
            role_definition = self._get_resource(role_definition_id, "2022-04-01")
            role_name = role_definition.get("properties", {}).get("roleName")
            if not role_name:
                # Nameless definitions are not cached.
                return None
            self._role_definitions[role_definition_id] = role_name
        return self._role_definitions[role_definition_id]

    def role_assignments(
        self, resource_id: str, *, principal_types: list[str] | None = None
    ) -> Iterable[AzureRoleAssignment]:
        """See https://learn.microsoft.com/en-us/rest/api/authorization/role-assignments/list-for-resource

        Yields one ``AzureRoleAssignment`` per assignment on `resource_id`
        whose principal type is in `principal_types` (``["ServicePrincipal"]``
        when not given). Assignments with missing properties, a nameless role
        definition or an unresolvable principal are skipped.
        """
        if not principal_types:
            principal_types = ["ServicePrincipal"]
        result = self._get_resource(f"{resource_id}/providers/Microsoft.Authorization/roleAssignments", "2022-04-01")
        for role_assignment in result.get("value", []):
            assignment_properties = role_assignment.get("properties", {})
            principal_type = assignment_properties.get("principalType")
            if not principal_type or principal_type not in principal_types:
                continue
            principal_id = assignment_properties.get("principalId")
            role_definition_id = assignment_properties.get("roleDefinitionId")
            scope = assignment_properties.get("scope")
            if not (principal_id and role_definition_id and scope):
                continue
            role_name = self._get_role_name(role_definition_id)
            if not role_name:
                continue
            principal = self._get_principal(principal_id)
            if not principal:
                continue
            if scope == "/":
                # "/" cannot be parsed as key/value pairs; use the resource itself.
                scope = resource_id
            yield AzureRoleAssignment(
                resource=AzureResource(resource_id),
                scope=AzureResource(scope),
                principal=principal,
                role_name=role_name,
            )
455-
456-
457-
@dataclass
class StoragePermissionMapping:
    """Plain record mapping a storage prefix to a principal's privilege."""

    # Storage URL prefix the privilege applies to.
    prefix: str
    # Application (client) ID of the principal.
    client_id: str
    # Display name of the principal.
    principal: str
    # Privilege granted on the prefix, as a string.
    privilege: str
463-
464-
465-
class AzureResourcePermissions:
    """Maps Azure storage role assignments to file privileges and saves them.

    For every storage account referenced by an ``abfss://`` external location,
    the role assignments on its containers are translated into
    ``StoragePermissionMapping`` rows and saved through the installation.
    """

    def __init__(self, installation: Installation, ws: WorkspaceClient, azurerm: AzureResources, lc: ExternalLocations):
        self._filename = 'azure_storage_account_info.csv'
        self._installation = installation
        self._locations = lc
        self._azurerm = azurerm
        self._ws = ws
        # Azure role name -> privilege; roles not listed here are ignored.
        self._levels = {
            "Storage Blob Data Contributor": Privilege.WRITE_FILES,
            "Storage Blob Data Owner": Privilege.WRITE_FILES,
            "Storage Blob Data Reader": Privilege.READ_FILES,
        }

    @classmethod
    def for_cli(cls, ws: WorkspaceClient, product='ucx'):
        """Build an instance from the current installation of `product`."""
        installation = Installation.current(ws, product)
        config = installation.load(WorkspaceConfig)
        sql_backend = StatementExecutionBackend(ws, config.warehouse_id)
        azurerm = AzureResources(ws)
        locations = ExternalLocations(ws, sql_backend, config.inventory_database)
        return cls(installation, ws, azurerm, locations)

    def _map_storage(self, storage: AzureResource) -> list[StoragePermissionMapping]:
        """Return one mapping per recognised role assignment on each container of `storage`."""
        logger.info("Fetching role assignment for %s", storage.storage_account)
        out = []
        for container in self._azurerm.containers(storage):
            for role_assignment in self._azurerm.role_assignments(str(container)):
                # one principal may be assigned multiple roles with overlapping dataActions, hence appearing
                # here in duplicates. hence, role name -> permission level is not enough for the perfect scenario.
                level = self._levels.get(role_assignment.role_name)
                if level is None:
                    continue
                out.append(
                    StoragePermissionMapping(
                        prefix=f"abfss://{container.container}@{container.storage_account}.dfs.core.windows.net/",
                        client_id=role_assignment.principal.client_id,
                        principal=role_assignment.principal.display_name,
                        privilege=level.value,
                    )
                )
        return out

    def save_spn_permissions(self) -> str | None:
        """Collect and save the permission mappings of all used storage accounts.

        Returns:
            Whatever ``installation.save`` returns, or None when no external
            location uses Azure storage or no mapping could be found.
        """
        used_storage_accounts = self._get_storage_accounts()
        if not used_storage_accounts:
            logger.warning(
                "There are no external table present with azure storage account. "
                "Please check if assessment job is run"
            )
            return None
        storage_account_infos = []
        for storage in self._azurerm.storage_accounts():
            if storage.storage_account not in used_storage_accounts:
                continue
            storage_account_infos.extend(self._map_storage(storage))
        if not storage_account_infos:
            logger.error("No storage account found in current tenant with spn permission")
            return None
        return self._installation.save(storage_account_infos, filename=self._filename)

    def _get_storage_accounts(self) -> list[str]:
        """Return the distinct storage account names used by ``abfss://`` external locations.

        Names are returned in first-seen order. Locations that do not have the
        ``<container>@<account>.dfs.core.windows.net`` shape are skipped
        instead of aborting the whole scan.
        """
        suffix = ".dfs.core.windows.net"
        # A dict serves as an insertion-ordered set.
        storage_accounts: dict[str, None] = {}
        for location in self._locations.snapshot():
            url = location.location
            if not url.startswith("abfss://"):
                continue
            _, at_sign, remainder = url.partition("@")
            storage_acct, found_suffix, _ = remainder.partition(suffix)
            if not (at_sign and found_suffix and storage_acct):
                continue
            storage_accounts[storage_acct] = None
        return list(storage_accounts)

src/databricks/labs/ucx/azure/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)