Skip to content

Commit 5860709

Browse files
authored
Group migration: improve robustness when renaming groups (#2263)
## Changes This PR updates the way we check for the completion of group renames; similar to deletion we now double-check before assuming the rename has taken effect. ### Linked issues Follows on from #2247. ### Functionality - modified existing workflows: - `migrate-groups` - `migrate-groups-experimental` ### Tests - updated unit tests - existing integration tests
1 parent f3b3847 commit 5860709

File tree

2 files changed

+105
-47
lines changed

2 files changed

+105
-47
lines changed

src/databricks/labs/ucx/workspace_access/groups.py

Lines changed: 76 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -437,11 +437,20 @@ def has_groups(self) -> bool:
437437
return len(self.snapshot()) > 0
438438

439439
def rename_groups(self):
440-
tasks = []
441440
account_groups_in_workspace = self._account_groups_in_workspace()
442441
workspace_groups_in_workspace = self._workspace_groups_in_workspace()
442+
# Renaming a group is eventually consistent, and not monotonically consistent, with a rather long time to
443+
# converge: internally the Databricks API caches some things for up to 60s. To avoid excessive wait times when
444+
# large numbers of groups need to be deleted (some deployments have >10K groups) we use the following steps:
445+
# 1. Rename all the groups.
446+
# 2. Confirm for each group that direct GETs yield the new name.
447+
# 3. Confirm that group enumeration no longer includes the deleted groups.
448+
# This caution is necessary because otherwise downstream tasks (like reflect_account_groups_on_workspace()) may
449+
# skip a renamed group because it doesn't appear to be present.
443450
groups_to_migrate = self.get_migration_state().groups
444-
451+
rename_tasks = []
452+
waiting_tasks = []
453+
renamed_groups = []
445454
logger.info(f"Starting to rename {len(groups_to_migrate)} groups for migration...")
446455
for migrated_group in groups_to_migrate:
447456
if migrated_group.name_in_account in account_groups_in_workspace:
@@ -450,51 +459,82 @@ def rename_groups(self):
450459
if migrated_group.temporary_name in workspace_groups_in_workspace:
451460
logger.info(f"Skipping {migrated_group.name_in_workspace}: already renamed")
452461
continue
453-
logger.info(f"Renaming: {migrated_group.name_in_workspace} -> {migrated_group.temporary_name}")
454-
tasks.append(
462+
rename_tasks.append(
455463
functools.partial(
456-
self._rename_group_and_wait_for_rename,
464+
self._rename_group,
457465
migrated_group.id_in_workspace,
458466
migrated_group.name_in_workspace,
459467
migrated_group.temporary_name,
460468
)
461469
)
462-
renamed_groups = Threads.strict("rename groups in the workspace", tasks)
463-
# Renaming is eventually consistent, and the tasks above have each polled to verify their rename completed.
464-
# Here we also check that enumeration yields the updated names; this is necessary because otherwise downstream
465-
# tasks (like reflect_account_groups_on_workspace()) may skip a renamed group because it doesn't appear to be
466-
# present.
470+
waiting_tasks.append(
471+
functools.partial(
472+
self._wait_for_group_rename,
473+
migrated_group.id_in_workspace,
474+
migrated_group.name_in_workspace,
475+
migrated_group.temporary_name,
476+
)
477+
)
478+
renamed_groups.append((migrated_group.id_in_workspace, migrated_group.temporary_name))
479+
# Step 1: Rename all the groups.
480+
_, errors = Threads.gather("rename groups in the workspace", rename_tasks)
481+
if errors:
482+
logger.error(f"During renaming of workspace groups {len(errors)} errors occurred. See debug logs.")
483+
raise ManyError(errors)
484+
# Step 2: Confirm that direct GETs yield the updated information.
485+
_, errors = Threads.gather("waiting for renamed groups in the workspace", waiting_tasks)
486+
if errors:
487+
logger.error(f"While waiting for renamed workspace groups {len(errors)} errors occurred. See debug logs.")
488+
raise ManyError(errors)
489+
# Step 3: Wait for enumeration to also reflect the updated information.
467490
self._wait_for_renamed_groups(renamed_groups)
468491

469-
def _rename_group_and_wait_for_rename(self, group_id: str, old_group_name, new_group_name: str) -> tuple[str, str]:
470-
logger.debug(f"Renaming group {group_id}: {old_group_name} -> {new_group_name}")
471-
self._rename_group(group_id, new_group_name)
472-
logger.debug(f"Waiting for group {group_id} rename to take effect: {old_group_name} -> {new_group_name}")
473-
self._wait_for_rename(group_id, old_group_name, new_group_name)
474-
return group_id, new_group_name
492+
def _rename_group(self, group_id: str, old_group_name: str, new_group_name: str) -> None:
493+
logger.debug(f"Renaming group: {old_group_name} (id={group_id}) -> {new_group_name}")
494+
self._rate_limited_rename_group_with_retry(group_id, new_group_name)
475495

476496
@retried(on=[InternalError, ResourceConflict, DeadlineExceeded])
477497
@rate_limited(max_requests=10, burst_period_seconds=60)
478-
def _rename_group(self, group_id: str, new_group_name: str) -> None:
498+
def _rate_limited_rename_group_with_retry(self, group_id: str, new_group_name: str) -> None:
479499
ops = [iam.Patch(iam.PatchOp.REPLACE, "displayName", new_group_name)]
480500
self._ws.groups.patch(group_id, operations=ops)
481501

482-
@retried(on=[GroupRenameIncompleteError], timeout=timedelta(minutes=2))
483-
def _wait_for_rename(self, group_id: str, old_group_name: str, new_group_name: str) -> None:
502+
@retried(on=[GroupRenameIncompleteError], timeout=timedelta(seconds=90))
503+
def _wait_for_group_rename(self, group_id: str, old_group_name: str, new_group_name: str) -> None:
504+
# The groups API is eventually consistent, but not monotonically consistent. Here we verify that the effects of
505+
# the rename have taken effect, and try to compensate for the lack of monotonic consistency by requiring two
506+
# subsequent calls to confirm the rename. REST API internals cache things for up to 60s, and we see times close
507+
# to this during testing. The retry timeout reflects this: if it's taking much longer then something else is
508+
# wrong.
509+
self._check_group_rename(group_id, old_group_name, new_group_name, logging.DEBUG)
510+
self._check_group_rename(group_id, old_group_name, new_group_name, logging.WARNING)
511+
logger.debug(f"Group rename is assumed complete: {old_group_name} (id={group_id}) -> {new_group_name}")
512+
513+
def _check_group_rename(self, group_id, old_group_name: str, new_group_name: str, pending_log_level: int) -> None:
484514
group = self._ws.groups.get(group_id)
485515
if group.display_name == old_group_name:
486-
logger.debug(
487-
f"Group {group_id} still has old name; still waiting for rename to take effect: {old_group_name} -> {new_group_name}"
516+
logger.log(
517+
pending_log_level,
518+
f"Group still has old name; still waiting for rename to take effect: {old_group_name} (id={group_id}) -> {new_group_name}",
488519
)
489520
raise GroupRenameIncompleteError(group_id, old_group_name, new_group_name)
490521
if group.display_name != new_group_name:
491522
# Group has an entirely unexpected name; something else is interfering.
492-
msg = f"While waiting for group {group_id} rename ({old_group_name} -> {new_group_name}) an unexpected name was observed: {group.display_name}"
523+
msg = f"While waiting for group rename ({old_group_name} (id={group_id}) -> {new_group_name}) an unexpected name was observed: {group.display_name}"
493524
raise RuntimeError(msg)
494-
logger.debug(f"Group {group_id} rename has taken effect: {old_group_name} -> {new_group_name}")
525+
logger.debug(f"Group rename has possibly taken effect: {old_group_name} (id={group_id}) -> {new_group_name}")
495526

496527
@retried(on=[ManyError], timeout=timedelta(minutes=2))
497528
def _wait_for_renamed_groups(self, expected_groups: Collection[tuple[str, str]]) -> None:
529+
# The groups API is eventually consistent, but not monotonically consistent. Here we verify that the group
530+
# has been deleted, and try to compensate for the lack of monotonic consistency by requiring two subsequent
531+
# calls to confirm deletion. REST API internals cache things for up to 60s, and we see times close to this
532+
# during testing. The retry timeout reflects this: if it's taking much longer then something else is wrong.
533+
self._check_for_renamed_groups(expected_groups, logging.DEBUG)
534+
self._check_for_renamed_groups(expected_groups, logging.WARNING)
535+
logger.debug(f"Group enumeration showed all {len(expected_groups)} renamed groups; assuming complete.")
536+
537+
def _check_for_renamed_groups(self, expected_groups: Collection[tuple[str, str]], pending_log_level: int) -> None:
498538
attributes = "id,displayName"
499539
found_groups = {
500540
group.id: group.display_name
@@ -508,8 +548,9 @@ def _wait_for_renamed_groups(self, expected_groups: Collection[tuple[str, str]])
508548
logger.warning(f"Group enumeration omits renamed group: {group_id} (renamed to {expected_name})")
509549
pending_renames.append(RuntimeError(f"Missing group with id: {group_id} (renamed to {expected_name})"))
510550
elif found_name != expected_name:
511-
logger.debug(
512-
f"Group enumeration does not yet reflect rename: {group_id} (renamed to {expected_name} but currently {found_name})"
551+
logger.log(
552+
pending_log_level,
553+
f"Group enumeration does not yet reflect rename: {group_id} (renamed to {expected_name} but currently {found_name})",
513554
)
514555
pending_renames.append(GroupRenameIncompleteError(group_id, found_name, expected_name))
515556
else:
@@ -544,11 +585,11 @@ def delete_original_workspace_groups(self):
544585
migrated_groups = self.snapshot()
545586
logger.info(f"Starting to remove {len(migrated_groups)} migrated workspace groups...")
546587
# Group deletion is eventually consistent, and not monotonically consistent, with a rather long time to
547-
# converge: internally API caches some things for up to 60s. To avoid excessive wait times when large numbers of
548-
# groups need to be deleted (some deployments have >10K groups) we use the following steps:
549-
# 1. Delete the groups.
550-
# 2. Confirm that direct GETs no longer see the group.
551-
# 3. Confirm that account enumeration no longer includes the deleted groups.
588+
# converge: internally the Databricks API caches some things for up to 60s. To avoid excessive wait times when
589+
# large numbers of groups need to be deleted (some deployments have >10K groups) we use the following steps:
590+
# 1. Delete all the groups.
591+
# 2. Confirm for each group that direct GETs no longer see the group.
592+
# 3. Confirm that group enumeration no longer includes the deleted groups.
552593
deletion_tasks = []
553594
waiting_tasks = []
554595
deleted_groups = []
@@ -573,12 +614,12 @@ def delete_original_workspace_groups(self):
573614
deleted_groups.append(migrated_group)
574615
# Step 1: Delete the groups.
575616
_, errors = Threads.gather("removing original workspace groups", deletion_tasks)
576-
if len(errors) > 0:
617+
if errors:
577618
logger.error(f"During deletion of workspace groups got {len(errors)} errors. See debug logs.")
578619
raise ManyError(errors)
579-
# Step 2: Confirm that direct gets no longer return the deleted group.
620+
# Step 2: Confirm that direct GETs no longer return the deleted group.
580621
_, errors = Threads.gather("waiting for removal of original workspace groups", waiting_tasks)
581-
if len(errors) > 0:
622+
if errors:
582623
logger.error(f"Waiting for deletion of workspace groups got {len(errors)} errors. See debug logs.")
583624
raise ManyError(errors)
584625
# Step 3: Confirm that enumeration no longer returns the deleted groups.
@@ -786,11 +827,11 @@ def _wait_for_workspace_group_deletion(self, group_id: str, display_name: str) -
786827
self._check_workspace_group_deletion(group_id, display_name, logging.WARNING)
787828
logger.debug(f"Workspace group is assumed deleted: {display_name} (id={group_id})")
788829

789-
def _check_workspace_group_deletion(self, group_id: str, display_name: str, still_present_level_level: int) -> None:
830+
def _check_workspace_group_deletion(self, group_id: str, display_name: str, pending_log_level: int) -> None:
790831
try:
791832
_ = self._ws.groups.get(id=group_id)
792833
logger.log(
793-
still_present_level_level,
834+
pending_log_level,
794835
f"Deleted group is still present; still waiting for deletion to take effect: {display_name} (id={group_id})",
795836
)
796837
# Deletion is still pending.

tests/unit/workspace_access/test_groups.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -269,24 +269,30 @@ def test_snapshot_should_rename_groups_defined_in_conf():
269269
]
270270

271271

272-
def test_rename_groups_should_patch_eligible_groups():
272+
def test_rename_groups_should_patch_eligible_groups(fake_sleep: Mock) -> None:
273273
backend = MockBackend()
274274
wsclient = create_autospec(WorkspaceClient)
275275
group1 = Group(id="1", display_name="de", meta=ResourceMeta(resource_type="WorkspaceGroup"))
276276
updated_group1 = dataclasses.replace(group1, display_name="test-group-de")
277-
wsclient.groups.list.side_effect = (
277+
list_side_effects = (
278278
# Preparing for the rename.
279279
*[[group1]] * 3,
280-
# Checking the rename completed
280+
# Checking (twice) that enumeration reflects the rename.
281+
[updated_group1],
281282
[updated_group1],
282283
)
283-
wsclient.groups.get.side_effect = (
284+
wsclient.groups.list.side_effect = list_side_effects
285+
get_side_effects = (
284286
# Preparing for the rename.
285287
*[group1] * 2,
286-
# Checking the rename completed.
288+
# Checking that the rename completed: emulate "not yet".
289+
updated_group1,
290+
group1,
291+
# Checking (twice) that the rename completed.
292+
updated_group1,
287293
updated_group1,
288294
)
289-
wsclient.groups.get.return_value = group1
295+
wsclient.groups.get.side_effect = get_side_effects
290296
account_admins_group_1 = Group(id="11", display_name="de")
291297
wsclient.api_client.do.return_value = {
292298
"Resources": [g.as_dict() for g in (account_admins_group_1,)],
@@ -296,6 +302,9 @@ def test_rename_groups_should_patch_eligible_groups():
296302
"1",
297303
operations=[iam.Patch(iam.PatchOp.REPLACE, "displayName", "test-group-de")],
298304
)
305+
assert wsclient.groups.list.call_count == len(list_side_effects)
306+
assert wsclient.groups.get.call_count == len(get_side_effects)
307+
fake_sleep.assert_called()
299308

300309

301310
def test_rename_groups_should_filter_account_groups_in_workspace():
@@ -337,10 +346,14 @@ def test_rename_groups_should_wait_for_renames_to_complete(fake_sleep: Mock) ->
337346
*[[original_group]] * 3,
338347
# Checking the rename completed; simulate:
339348
# 1. Rename incomplete: original group still visible.
340-
# 2. Still incomplete: group missing entirely.
341-
# 3. Rename has finished: updated group is now visible.
342349
[original_group],
350+
# 2. Still incomplete: group missing entirely.
343351
[],
352+
# 3. Maybe complete: update visible, but not twice in a row.
353+
[updated_group],
354+
[original_group],
355+
# 4. Rename has finished: updated group is now visible (twice).
356+
[updated_group],
344357
[updated_group],
345358
)
346359
wsclient.groups.list.side_effect = list_side_effect_values
@@ -355,9 +368,11 @@ def test_rename_groups_should_wait_for_renames_to_complete(fake_sleep: Mock) ->
355368
get_return_values = (
356369
# Enumerating the workspace groups and loading the migration state.
357370
*[original_group] * 2,
358-
# First call after rename: simulate the rename not yet being visible.
371+
# First calls after the rename: simulate the rename not yet being visible.
372+
updated_group,
359373
original_group,
360-
# Second call, simulate things now being complete.
374+
# Final calls after the rename: simulate things now being complete.
375+
updated_group,
361376
updated_group,
362377
)
363378
wsclient.groups.get.side_effect = get_return_values
@@ -380,7 +395,8 @@ def test_rename_groups_should_retry_on_internal_error(fake_sleep: Mock) -> None:
380395
wsclient.groups.list.side_effect = (
381396
# Preparing for the rename.
382397
*[[original_group]] * 3,
383-
# Checking the rename completed.
398+
# Checking (twice) that the rename completed.
399+
[updated_group],
384400
[updated_group],
385401
)
386402
matching_account_admin_group = dataclasses.replace(
@@ -392,7 +408,8 @@ def test_rename_groups_should_retry_on_internal_error(fake_sleep: Mock) -> None:
392408
wsclient.groups.get.side_effect = (
393409
# Preparing for the rename.
394410
*[original_group] * 2,
395-
# Checking the rename completed.
411+
# Checking (twice) that the rename completed.
412+
updated_group,
396413
updated_group,
397414
)
398415
# The response to the PATCH call is ignored; set things up to fail on the first call and succeed on the second.

0 commit comments

Comments
 (0)