Skip to content

Commit cf9d1cf

Browse files
authored
Group migration: improve robustness while deleting workspace groups (#2247)
## Changes This PR updates the group manager so that deleting workspace groups is more reliable. Changes include: - We no longer skip deletion of groups that don't appear to be present. Due to eventual consistency issues this was occurring with groups that had recently been renamed (to their temporary name). - Deletion now waits for the effects of deletion to be visible by double-checking that a group can no longer be directly retrieved from the API, and that it no longer appears in the list of groups during enumeration. (Due to API limitations this is not a guarantee that the groups are no longer visible, but it does decrease the likelihood of anything noticed afterwards.) - Improved logging. A subsequent PR will update group renaming to use a similar approach to that here. ### Linked issues Resolves #2227. ### Functionality - modified existing workflow: `remove-workspace-local-backup-groups` ### Tests - updated unit tests - updated integration tests
1 parent c242fb4 commit cf9d1cf

File tree

5 files changed

+198
-57
lines changed

5 files changed

+198
-57
lines changed

src/databricks/labs/ucx/workspace_access/groups.py

Lines changed: 116 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,16 @@ def generate_migrated_groups(self) -> Iterable[MigratedGroup]:
380380
)
381381

382382

383+
class GroupDeletionIncompleteError(RuntimeError):
384+
__slots__ = ("group_id", "display_name")
385+
386+
def __init__(self, group_id: str, display_name: str | None) -> None:
387+
msg = f"Group deletion incomplete: {display_name if display_name else '<name-missing>'} (id={group_id})"
388+
super().__init__(msg)
389+
self.group_id = group_id
390+
self.display_name = display_name
391+
392+
383393
class GroupRenameIncompleteError(RuntimeError):
384394
__slots__ = ("group_id", "old_name", "new_name")
385395

@@ -530,27 +540,49 @@ def get_migration_state(self) -> MigrationState:
530540
return MigrationState(self.snapshot())
531541

532542
def delete_original_workspace_groups(self):
533-
tasks = []
534-
workspace_groups_in_workspace = self._workspace_groups_in_workspace()
535543
account_groups_in_workspace = self._account_groups_in_workspace()
536544
migrated_groups = self.snapshot()
537545
logger.info(f"Starting to remove {len(migrated_groups)} migrated workspace groups...")
546+
# Group deletion is eventually consistent, and not monotonically consistent, with a rather long time to
547+
# converge: internally API caches some things for up to 60s. To avoid excessive wait times when large numbers of
548+
# groups need to be deleted (some deployments have >10K groups) we use the following steps:
549+
# 1. Delete the groups.
550+
# 2. Confirm that direct GETs no longer see the group.
551+
# 3. Confirm that account enumeration no longer includes the deleted groups.
552+
deletion_tasks = []
553+
waiting_tasks = []
554+
deleted_groups = []
538555
for migrated_group in migrated_groups:
539-
if migrated_group.temporary_name not in workspace_groups_in_workspace:
540-
logger.info(f"Skipping {migrated_group.name_in_workspace}: no longer in workspace")
541-
continue
542556
if migrated_group.name_in_account not in account_groups_in_workspace:
543-
logger.info(f"Skipping {migrated_group.name_in_account}: not reflected in workspace")
557+
logger.warning(
558+
f"Not deleting group {migrated_group.temporary_name}(id={migrated_group.id_in_workspace}) (originally {migrated_group.name_in_workspace}): its migrated account group ({migrated_group.name_in_account}) cannot be found."
559+
)
544560
continue
545-
tasks.append(
561+
deletion_tasks.append(
546562
functools.partial(
547563
self._delete_workspace_group, migrated_group.id_in_workspace, migrated_group.temporary_name
548564
)
549565
)
550-
_, errors = Threads.gather("removing original workspace groups", tasks)
566+
waiting_tasks.append(
567+
functools.partial(
568+
self._wait_for_workspace_group_deletion,
569+
migrated_group.id_in_workspace,
570+
migrated_group.temporary_name,
571+
)
572+
)
573+
deleted_groups.append(migrated_group)
574+
# Step 1: Delete the groups.
575+
_, errors = Threads.gather("removing original workspace groups", deletion_tasks)
576+
if len(errors) > 0:
577+
logger.error(f"During deletion of workspace groups got {len(errors)} errors. See debug logs.")
578+
raise ManyError(errors)
579+
# Step 2: Confirm that direct gets no longer return the deleted group.
580+
_, errors = Threads.gather("waiting for removal of original workspace groups", waiting_tasks)
551581
if len(errors) > 0:
552-
logger.error(f"During account-to-workspace reflection got {len(errors)} errors. See debug logs")
582+
logger.error(f"Waiting for deletion of workspace groups got {len(errors)} errors. See debug logs.")
553583
raise ManyError(errors)
584+
# Step 3: Confirm that enumeration no longer returns the deleted groups.
585+
self._wait_for_deleted_workspace_groups(deleted_groups)
554586

555587
def _fetcher(self) -> Iterable[MigratedGroup]:
556588
state = []
@@ -672,7 +704,6 @@ def _list_workspace_groups(self, resource_type: str, scim_attributes: str) -> li
672704
# a strategy of enumerating the bare minimum and request full attributes for each group individually.
673705
attributes = scim_attributes.split(",")
674706
if "members" in attributes:
675-
attributes.remove("members")
676707
retry_on_internal_error = retried(on=[InternalError], timeout=self._verify_timeout)
677708
get_group = retry_on_internal_error(self._get_group)
678709
# Limit to the attributes we need for determining if the group is out of scope; the rest are fetched later.
@@ -726,16 +757,86 @@ def _list_account_groups(self, scim_attributes: str) -> list[iam.Group]:
726757
sorted_groups: list[iam.Group] = sorted(account_groups, key=lambda _: _.display_name) # type: ignore[arg-type,return-value]
727758
return sorted_groups
728759

760+
def _delete_workspace_group_and_wait_for_deletion(self, group_id: str, display_name: str) -> str:
761+
logger.debug(f"Deleting workspace group: {display_name} (id={group_id})")
762+
self._delete_workspace_group(group_id, display_name)
763+
logger.debug(f"Waiting for workspace group deletion to take effect: {display_name} (id={group_id})")
764+
self._wait_for_workspace_group_deletion(group_id, display_name)
765+
return group_id
766+
729767
@retried(on=[InternalError, ResourceConflict, DeadlineExceeded])
730768
@rate_limited(max_requests=35, burst_period_seconds=60)
731-
def _delete_workspace_group(self, group_id: str, display_name: str) -> None:
769+
def _rate_limited_group_delete_with_retry(self, group_id: str) -> None:
732770
try:
733-
logger.info(f"Deleting the workspace-level group {display_name} with id {group_id}")
734771
self._ws.groups.delete(id=group_id)
735-
logger.info(f"Workspace-level group {display_name} with id {group_id} was deleted")
736-
return None
737772
except NotFound:
738-
return None
773+
pass
774+
775+
def _delete_workspace_group(self, group_id: str, display_name: str) -> None:
776+
logger.debug(f"Deleting workspace group: {display_name} (id={group_id})")
777+
self._rate_limited_group_delete_with_retry(group_id)
778+
779+
@retried(on=[GroupDeletionIncompleteError], timeout=timedelta(seconds=90))
780+
def _wait_for_workspace_group_deletion(self, group_id: str, display_name: str) -> None:
781+
# The groups API is eventually consistent, but not monotonically consistent. Here we verify that the group
782+
# has been deleted, and try to compensate for the lack of monotonic consistency by requiring two subsequent
783+
# calls to confirm deletion. REST API internals cache things for up to 60s, and we see times close to this
784+
# during testing. The retry timeout reflects this: if it's taking much longer then something else is wrong.
785+
self._check_workspace_group_deletion(group_id, display_name, logging.DEBUG)
786+
self._check_workspace_group_deletion(group_id, display_name, logging.WARNING)
787+
logger.debug(f"Workspace group is assumed deleted: {display_name} (id={group_id})")
788+
789+
def _check_workspace_group_deletion(self, group_id: str, display_name: str, still_present_level_level: int) -> None:
790+
try:
791+
_ = self._ws.groups.get(id=group_id)
792+
logger.log(
793+
still_present_level_level,
794+
f"Deleted group is still present; still waiting for deletion to take effect: {display_name} (id={group_id})",
795+
)
796+
# Deletion is still pending.
797+
raise GroupDeletionIncompleteError(group_id, display_name)
798+
except NotFound:
799+
logger.debug(f"Workspace group not found; possibly deleted: {display_name} (id={group_id})")
800+
801+
@retried(on=[ManyError], timeout=timedelta(minutes=5))
802+
def _wait_for_deleted_workspace_groups(self, deleted_workspace_groups: list[MigratedGroup]) -> None:
803+
# The groups API is eventually consistent, but not monotonically consistent. Here we verify that enumerating
804+
# all groups no longer includes the deleted groups. We try to compensate for the lack of monotonic consistency
805+
# by requiring two subsequent enumerations to omit all deleted groups. REST API internals cache things for up
806+
# to 60s. The retry timeout reflects this, and the fact that enumeration can take a long time for large numbers
807+
# of groups. (Currently there is no way to configure the retry handler to retry at least once, so the timeout
808+
# needs to be high enough to allow at least one retry.)
809+
self._check_for_deleted_workspace_groups(deleted_workspace_groups, logging.DEBUG)
810+
self._check_for_deleted_workspace_groups(deleted_workspace_groups, logging.WARNING)
811+
logger.debug(
812+
f"Group enumeration omitted all {len(deleted_workspace_groups)} workspace groups; assuming deleted."
813+
)
814+
815+
def _check_for_deleted_workspace_groups(
816+
self, deleted_workspace_groups: list[MigratedGroup], still_present_log_level: int
817+
) -> None:
818+
attributes = "id,displayName"
819+
expected_deletions = {group.id_in_workspace for group in deleted_workspace_groups}
820+
pending_deletions = [
821+
GroupDeletionIncompleteError(group.id, group.display_name)
822+
for group in self._list_workspace_groups("WorkspaceGroup", attributes)
823+
if group.id in expected_deletions
824+
]
825+
if pending_deletions:
826+
if logger.isEnabledFor(still_present_log_level):
827+
logger.log(
828+
still_present_log_level,
829+
f"Group enumeration still contains {len(pending_deletions)}/{len(expected_deletions)} deleted workspace groups.",
830+
)
831+
for pending_deletion in pending_deletions:
832+
logger.log(
833+
still_present_log_level,
834+
f"Group enumeration still contains deleted group: {pending_deletion.display_name}(id={pending_deletion.group_id})",
835+
)
836+
raise ManyError(pending_deletions)
837+
logger.debug(
838+
f"Group enumeration does not contain any of the {len(expected_deletions)} deleted workspace groups; possibly deleted."
839+
)
739840

740841
@retried(on=[InternalError, ResourceConflict, DeadlineExceeded])
741842
@rate_limited(max_requests=5)

tests/integration/conftest.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,15 +100,20 @@ def sql_fetch_all(sql_backend):
100100

101101
@pytest.fixture
102102
def make_ucx_group(make_random, make_group, make_acc_group, make_user):
103-
def inner(workspace_group_name=None, account_group_name=None):
103+
def inner(workspace_group_name=None, account_group_name=None, **kwargs):
104104
if not workspace_group_name:
105105
workspace_group_name = f"ucx_G{make_random(4)}"
106106
if not account_group_name:
107107
account_group_name = workspace_group_name
108108
user = make_user()
109109
members = [user.id]
110-
ws_group = make_group(display_name=workspace_group_name, members=members, entitlements=["allow-cluster-create"])
111-
acc_group = make_acc_group(display_name=account_group_name, members=members)
110+
ws_group = make_group(
111+
display_name=workspace_group_name,
112+
members=members,
113+
entitlements=["allow-cluster-create"],
114+
**kwargs,
115+
)
116+
acc_group = make_acc_group(display_name=account_group_name, members=members, **kwargs)
112117
return ws_group, acc_group
113118

114119
return inner

tests/integration/install/test_installation.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import logging
44
from datetime import timedelta
5+
from typing import NoReturn
56

67
import pytest
78

@@ -10,7 +11,7 @@
1011
from databricks.labs.blueprint.parallel import ManyError
1112
from databricks.labs.blueprint.tui import MockPrompts
1213
from databricks.labs.blueprint.wheels import ProductInfo
13-
from databricks.sdk import AccountClient
14+
from databricks.sdk import AccountClient, WorkspaceClient
1415
from databricks.labs.lsql.backends import StatementExecutionBackend
1516
from databricks.sdk.errors import (
1617
AlreadyExists,
@@ -27,6 +28,7 @@
2728
from databricks.labs.ucx.install import WorkspaceInstaller
2829
from databricks.labs.ucx.workspace_access.groups import MigratedGroup
2930

31+
from ..conftest import MockInstallationContext
3032

3133
logger = logging.getLogger(__name__)
3234

@@ -168,8 +170,8 @@ def test_job_cluster_policy(ws, installation_ctx):
168170

169171

170172
@retried(on=[NotFound, InvalidParameterValue])
171-
def test_running_real_remove_backup_groups_job(ws, installation_ctx):
172-
ws_group_a, _ = installation_ctx.make_ucx_group()
173+
def test_running_real_remove_backup_groups_job(ws: WorkspaceClient, installation_ctx: MockInstallationContext) -> None:
174+
ws_group_a, _ = installation_ctx.make_ucx_group(wait_for_provisioning=True)
173175

174176
installation_ctx.__dict__['include_group_names'] = [ws_group_a.display_name]
175177
installation_ctx.workspace_installation.run()
@@ -180,10 +182,13 @@ def test_running_real_remove_backup_groups_job(ws, installation_ctx):
180182

181183
installation_ctx.deployed_workflows.run_workflow("remove-workspace-local-backup-groups")
182184

183-
# The API needs a moment to delete a group, i.e. until the group is not found anymore
184-
@retried(on=[KeyError], timeout=timedelta(minutes=6))
185-
def get_group(group_id: str):
186-
ws.groups.get(group_id)
185+
# Group deletion is eventually consistent. Although the group manager tries to wait for convergence, parts of the
186+
# API internals have a 60s timeout. As such we should wait at least that long before concluding deletion has not
187+
# happened.
188+
# Note: If you are adjusting this, also look at: test_running_real_remove_backup_groups_job
189+
@retried(on=[KeyError], timeout=timedelta(seconds=90))
190+
def get_group(group_id: str) -> NoReturn:
191+
_ = ws.groups.get(group_id)
187192
raise KeyError(f"Group is not deleted: {group_id}")
188193

189194
with pytest.raises(NotFound, match=f"Group with id {ws_group_a.id} not found."):

tests/integration/workspace_access/test_groups.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import logging
33
from datetime import timedelta
4+
from typing import NoReturn
45

56
import pytest
67
from databricks.sdk.errors import NotFound, ResourceConflict
@@ -92,7 +93,7 @@ def test_reflect_account_groups_on_workspace(ws, make_ucx_group, sql_backend, in
9293
def test_delete_ws_groups_should_delete_renamed_and_reflected_groups_only(
9394
ws, make_ucx_group, sql_backend, inventory_schema
9495
):
95-
ws_group, _ = make_ucx_group()
96+
ws_group, _ = make_ucx_group(wait_for_provisioning=True)
9697

9798
group_manager = GroupManager(
9899
sql_backend,
@@ -105,10 +106,13 @@ def test_delete_ws_groups_should_delete_renamed_and_reflected_groups_only(
105106
group_manager.reflect_account_groups_on_workspace()
106107
group_manager.delete_original_workspace_groups()
107108

108-
# The API needs a moment to delete a group, i.e. until the group is not found anymore
109-
@retried(on=[KeyError], timeout=timedelta(minutes=2))
110-
def get_group(group_id: str):
111-
ws.groups.get(group_id)
109+
# Group deletion is eventually consistent. Although the group manager tries to wait for convergence, parts of the
110+
# API internals have a 60s timeout. As such we should wait at least that long before concluding deletion has not
111+
# happened.
112+
# Note: If you are adjusting this, also look at: test_running_real_remove_backup_groups_job
113+
@retried(on=[KeyError], timeout=timedelta(seconds=90))
114+
def get_group(group_id: str) -> NoReturn:
115+
_ = ws.groups.get(group_id)
112116
raise KeyError(f"Group is not deleted: {group_id}")
113117

114118
with pytest.raises(NotFound, match=f"Group with id {ws_group.id} not found."):

0 commit comments

Comments
 (0)