Group migration: improve robustness while deleting workspace groups (#2247)

asnare · web-flow · commit cf9d1cf7978f · 2024-07-25T17:57:08.000+02:00
## Changes This PR updates the group manager so that deleting workspace groups is more reliable. Changes include: - We no longer skip deletion of groups that don't appear to be present. Due to eventual consistency issues this was occurring with groups that had recently been renamed (to their temporary name). - Deletion now waits for the effects of deletion to be visible by double-checking that a group can no longer be directly retrieved from the API, and that it no longer appears in the list of groups during enumeration. (Due to API limitations this is not a guarantee that the groups are no longer visible, but it does decrease the likelihood of a deleted group still being noticed afterwards.) - Improved logging. A subsequent PR will update group renaming to use a similar approach to the one used here. ### Linked issues Resolves #2227. ### Functionality - modified existing workflow: `remove-workspace-local-backup-groups` ### Tests - updated unit tests - updated integration tests
diff --git a/src/databricks/labs/ucx/workspace_access/groups.py b/src/databricks/labs/ucx/workspace_access/groups.py
@@ -380,6 +380,16 @@ def generate_migrated_groups(self) -> Iterable[MigratedGroup]:
             )
 
 
+class GroupDeletionIncompleteError(RuntimeError):
+    __slots__ = ("group_id", "display_name")
+
+    def __init__(self, group_id: str, display_name: str | None) -> None:
+        msg = f"Group deletion incomplete: {display_name if display_name else '<name-missing>'} (id={group_id})"
+        super().__init__(msg)
+        self.group_id = group_id
+        self.display_name = display_name
+
+
 class GroupRenameIncompleteError(RuntimeError):
     __slots__ = ("group_id", "old_name", "new_name")
 
@@ -530,27 +540,49 @@ def get_migration_state(self) -> MigrationState:
         return MigrationState(self.snapshot())
 
     def delete_original_workspace_groups(self):
-        tasks = []
-        workspace_groups_in_workspace = self._workspace_groups_in_workspace()
         account_groups_in_workspace = self._account_groups_in_workspace()
         migrated_groups = self.snapshot()
         logger.info(f"Starting to remove {len(migrated_groups)} migrated workspace groups...")
+        # Group deletion is eventually consistent, and not monotonically consistent, with a rather long time to
+        # converge: internally the API caches some things for up to 60s. To avoid excessive wait times when large numbers of
+        # groups need to be deleted (some deployments have >10K groups) we use the following steps:
+        #  1. Delete the groups.
+        #  2. Confirm that direct GETs no longer see the group.
+        #  3. Confirm that group enumeration no longer includes the deleted groups.
+        deletion_tasks = []
+        waiting_tasks = []
+        deleted_groups = []
         for migrated_group in migrated_groups:
-            if migrated_group.temporary_name not in workspace_groups_in_workspace:
-                logger.info(f"Skipping {migrated_group.name_in_workspace}: no longer in workspace")
-                continue
             if migrated_group.name_in_account not in account_groups_in_workspace:
-                logger.info(f"Skipping {migrated_group.name_in_account}: not reflected in workspace")
+                logger.warning(
+                    f"Not deleting group {migrated_group.temporary_name}(id={migrated_group.id_in_workspace}) (originally {migrated_group.name_in_workspace}): its migrated account group ({migrated_group.name_in_account}) cannot be found."
+                )
                 continue
-            tasks.append(
+            deletion_tasks.append(
                 functools.partial(
                     self._delete_workspace_group, migrated_group.id_in_workspace, migrated_group.temporary_name
                 )
             )
-        _, errors = Threads.gather("removing original workspace groups", tasks)
+            waiting_tasks.append(
+                functools.partial(
+                    self._wait_for_workspace_group_deletion,
+                    migrated_group.id_in_workspace,
+                    migrated_group.temporary_name,
+                )
+            )
+            deleted_groups.append(migrated_group)
+        # Step 1: Delete the groups.
+        _, errors = Threads.gather("removing original workspace groups", deletion_tasks)
+        if len(errors) > 0:
+            logger.error(f"During deletion of workspace groups got {len(errors)} errors. See debug logs.")
+            raise ManyError(errors)
+        # Step 2: Confirm that direct gets no longer return the deleted group.
+        _, errors = Threads.gather("waiting for removal of original workspace groups", waiting_tasks)
         if len(errors) > 0:
-            logger.error(f"During account-to-workspace reflection got {len(errors)} errors. See debug logs")
+            logger.error(f"Waiting for deletion of workspace groups got {len(errors)} errors. See debug logs.")
             raise ManyError(errors)
+        # Step 3: Confirm that enumeration no longer returns the deleted groups.
+        self._wait_for_deleted_workspace_groups(deleted_groups)
 
     def _fetcher(self) -> Iterable[MigratedGroup]:
         state = []
@@ -672,7 +704,6 @@ def _list_workspace_groups(self, resource_type: str, scim_attributes: str) -> li
         # a strategy of enumerating the bare minimum and request full attributes for each group individually.
         attributes = scim_attributes.split(",")
         if "members" in attributes:
-            attributes.remove("members")
             retry_on_internal_error = retried(on=[InternalError], timeout=self._verify_timeout)
             get_group = retry_on_internal_error(self._get_group)
             # Limit to the attributes we need for determining if the group is out of scope; the rest are fetched later.
@@ -726,16 +757,86 @@ def _list_account_groups(self, scim_attributes: str) -> list[iam.Group]:
         sorted_groups: list[iam.Group] = sorted(account_groups, key=lambda _: _.display_name)  # type: ignore[arg-type,return-value]
         return sorted_groups
 
+    def _delete_workspace_group_and_wait_for_deletion(self, group_id: str, display_name: str) -> str:
+        logger.debug(f"Deleting workspace group: {display_name} (id={group_id})")
+        self._delete_workspace_group(group_id, display_name)
+        logger.debug(f"Waiting for workspace group deletion to take effect: {display_name} (id={group_id})")
+        self._wait_for_workspace_group_deletion(group_id, display_name)
+        return group_id
+
     @retried(on=[InternalError, ResourceConflict, DeadlineExceeded])
     @rate_limited(max_requests=35, burst_period_seconds=60)
-    def _delete_workspace_group(self, group_id: str, display_name: str) -> None:
+    def _rate_limited_group_delete_with_retry(self, group_id: str) -> None:
         try:
-            logger.info(f"Deleting the workspace-level group {display_name} with id {group_id}")
             self._ws.groups.delete(id=group_id)
-            logger.info(f"Workspace-level group {display_name} with id {group_id} was deleted")
-            return None
         except NotFound:
-            return None
+            pass
+
+    def _delete_workspace_group(self, group_id: str, display_name: str) -> None:
+        logger.debug(f"Deleting workspace group: {display_name} (id={group_id})")
+        self._rate_limited_group_delete_with_retry(group_id)
+
+    @retried(on=[GroupDeletionIncompleteError], timeout=timedelta(seconds=90))
+    def _wait_for_workspace_group_deletion(self, group_id: str, display_name: str) -> None:
+        # The groups API is eventually consistent, but not monotonically consistent. Here we verify that the group
+        # has been deleted, and try to compensate for the lack of monotonic consistency by requiring two subsequent
+        # calls to confirm deletion. REST API internals cache things for up to 60s, and we see times close to this
+        # during testing. The retry timeout reflects this: if it's taking much longer then something else is wrong.
+        self._check_workspace_group_deletion(group_id, display_name, logging.DEBUG)
+        self._check_workspace_group_deletion(group_id, display_name, logging.WARNING)
+        logger.debug(f"Workspace group is assumed deleted: {display_name} (id={group_id})")
+
+    def _check_workspace_group_deletion(self, group_id: str, display_name: str, still_present_level_level: int) -> None:
+        try:
+            _ = self._ws.groups.get(id=group_id)
+            logger.log(
+                still_present_level_level,
+                f"Deleted group is still present; still waiting for deletion to take effect: {display_name} (id={group_id})",
+            )
+            # Deletion is still pending.
+            raise GroupDeletionIncompleteError(group_id, display_name)
+        except NotFound:
+            logger.debug(f"Workspace group not found; possibly deleted: {display_name} (id={group_id})")
+
+    @retried(on=[ManyError], timeout=timedelta(minutes=5))
+    def _wait_for_deleted_workspace_groups(self, deleted_workspace_groups: list[MigratedGroup]) -> None:
+        # The groups API is eventually consistent, but not monotonically consistent. Here we verify that enumerating
+        # all groups no longer includes the deleted groups. We try to compensate for the lack of monotonic consistency
+        # by requiring two subsequent enumerations to omit all deleted groups. REST API internals cache things for up
+        # to 60s. The retry timeout reflects this, and the fact that enumeration can take a long time for large numbers
+        # of groups. (Currently there is no way to configure the retry handler to retry at least once, so the timeout
+        # needs to be high enough to allow at least one retry.)
+        self._check_for_deleted_workspace_groups(deleted_workspace_groups, logging.DEBUG)
+        self._check_for_deleted_workspace_groups(deleted_workspace_groups, logging.WARNING)
+        logger.debug(
+            f"Group enumeration omitted all {len(deleted_workspace_groups)} workspace groups; assuming deleted."
+        )
+
+    def _check_for_deleted_workspace_groups(
+        self, deleted_workspace_groups: list[MigratedGroup], still_present_log_level: int
+    ) -> None:
+        attributes = "id,displayName"
+        expected_deletions = {group.id_in_workspace for group in deleted_workspace_groups}
+        pending_deletions = [
+            GroupDeletionIncompleteError(group.id, group.display_name)
+            for group in self._list_workspace_groups("WorkspaceGroup", attributes)
+            if group.id in expected_deletions
+        ]
+        if pending_deletions:
+            if logger.isEnabledFor(still_present_log_level):
+                logger.log(
+                    still_present_log_level,
+                    f"Group enumeration still contains {len(pending_deletions)}/{len(expected_deletions)} deleted workspace groups.",
+                )
+                for pending_deletion in pending_deletions:
+                    logger.log(
+                        still_present_log_level,
+                        f"Group enumeration still contains deleted group: {pending_deletion.display_name}(id={pending_deletion.group_id})",
+                    )
+            raise ManyError(pending_deletions)
+        logger.debug(
+            f"Group enumeration does not contain any of the {len(expected_deletions)} deleted workspace groups; possibly deleted."
+        )
 
     @retried(on=[InternalError, ResourceConflict, DeadlineExceeded])
     @rate_limited(max_requests=5)
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -100,15 +100,20 @@ def sql_fetch_all(sql_backend):
 
 @pytest.fixture
 def make_ucx_group(make_random, make_group, make_acc_group, make_user):
-    def inner(workspace_group_name=None, account_group_name=None):
+    def inner(workspace_group_name=None, account_group_name=None, **kwargs):
         if not workspace_group_name:
             workspace_group_name = f"ucx_G{make_random(4)}"
         if not account_group_name:
             account_group_name = workspace_group_name
         user = make_user()
         members = [user.id]
-        ws_group = make_group(display_name=workspace_group_name, members=members, entitlements=["allow-cluster-create"])
-        acc_group = make_acc_group(display_name=account_group_name, members=members)
+        ws_group = make_group(
+            display_name=workspace_group_name,
+            members=members,
+            entitlements=["allow-cluster-create"],
+            **kwargs,
+        )
+        acc_group = make_acc_group(display_name=account_group_name, members=members, **kwargs)
         return ws_group, acc_group
 
     return inner
diff --git a/tests/integration/install/test_installation.py b/tests/integration/install/test_installation.py
@@ -2,6 +2,7 @@
 import json
 import logging
 from datetime import timedelta
+from typing import NoReturn
 
 import pytest
 
@@ -10,7 +11,7 @@
 from databricks.labs.blueprint.parallel import ManyError
 from databricks.labs.blueprint.tui import MockPrompts
 from databricks.labs.blueprint.wheels import ProductInfo
-from databricks.sdk import AccountClient
+from databricks.sdk import AccountClient, WorkspaceClient
 from databricks.labs.lsql.backends import StatementExecutionBackend
 from databricks.sdk.errors import (
     AlreadyExists,
@@ -27,6 +28,7 @@
 from databricks.labs.ucx.install import WorkspaceInstaller
 from databricks.labs.ucx.workspace_access.groups import MigratedGroup
 
+from ..conftest import MockInstallationContext
 
 logger = logging.getLogger(__name__)
 
@@ -168,8 +170,8 @@ def test_job_cluster_policy(ws, installation_ctx):
 
 
 @retried(on=[NotFound, InvalidParameterValue])
-def test_running_real_remove_backup_groups_job(ws, installation_ctx):
-    ws_group_a, _ = installation_ctx.make_ucx_group()
+def test_running_real_remove_backup_groups_job(ws: WorkspaceClient, installation_ctx: MockInstallationContext) -> None:
+    ws_group_a, _ = installation_ctx.make_ucx_group(wait_for_provisioning=True)
 
     installation_ctx.__dict__['include_group_names'] = [ws_group_a.display_name]
     installation_ctx.workspace_installation.run()
@@ -180,10 +182,13 @@ def test_running_real_remove_backup_groups_job(ws, installation_ctx):
 
     installation_ctx.deployed_workflows.run_workflow("remove-workspace-local-backup-groups")
 
-    # The API needs a moment to delete a group, i.e. until the group is not found anymore
-    @retried(on=[KeyError], timeout=timedelta(minutes=6))
-    def get_group(group_id: str):
-        ws.groups.get(group_id)
+    # Group deletion is eventually consistent. Although the group manager tries to wait for convergence, parts of the
+    # API internals have a 60s timeout. As such we should wait at least that long before concluding deletion has not
+    # happened.
+    # Note: If adjusting this, also look at: test_delete_ws_groups_should_delete_renamed_and_reflected_groups_only
+    @retried(on=[KeyError], timeout=timedelta(seconds=90))
+    def get_group(group_id: str) -> NoReturn:
+        _ = ws.groups.get(group_id)
         raise KeyError(f"Group is not deleted: {group_id}")
 
     with pytest.raises(NotFound, match=f"Group with id {ws_group_a.id} not found."):
diff --git a/tests/integration/workspace_access/test_groups.py b/tests/integration/workspace_access/test_groups.py
@@ -1,6 +1,7 @@
 import json
 import logging
 from datetime import timedelta
+from typing import NoReturn
 
 import pytest
 from databricks.sdk.errors import NotFound, ResourceConflict
@@ -92,7 +93,7 @@ def test_reflect_account_groups_on_workspace(ws, make_ucx_group, sql_backend, in
 def test_delete_ws_groups_should_delete_renamed_and_reflected_groups_only(
     ws, make_ucx_group, sql_backend, inventory_schema
 ):
-    ws_group, _ = make_ucx_group()
+    ws_group, _ = make_ucx_group(wait_for_provisioning=True)
 
     group_manager = GroupManager(
         sql_backend,
@@ -105,10 +106,13 @@ def test_delete_ws_groups_should_delete_renamed_and_reflected_groups_only(
     group_manager.reflect_account_groups_on_workspace()
     group_manager.delete_original_workspace_groups()
 
-    # The API needs a moment to delete a group, i.e. until the group is not found anymore
-    @retried(on=[KeyError], timeout=timedelta(minutes=2))
-    def get_group(group_id: str):
-        ws.groups.get(group_id)
+    # Group deletion is eventually consistent. Although the group manager tries to wait for convergence, parts of the
+    # API internals have a 60s timeout. As such we should wait at least that long before concluding deletion has not
+    # happened.
+    # Note: If you are adjusting this, also look at: test_running_real_remove_backup_groups_job
+    @retried(on=[KeyError], timeout=timedelta(seconds=90))
+    def get_group(group_id: str) -> NoReturn:
+        _ = ws.groups.get(group_id)
         raise KeyError(f"Group is not deleted: {group_id}")
 
     with pytest.raises(NotFound, match=f"Group with id {ws_group.id} not found."):
diff --git a/tests/unit/workspace_access/test_groups.py b/tests/unit/workspace_access/test_groups.py