@@ -437,11 +437,20 @@ def has_groups(self) -> bool:
437437 return len (self .snapshot ()) > 0
438438
439439 def rename_groups (self ):
440- tasks = []
441440 account_groups_in_workspace = self ._account_groups_in_workspace ()
442441 workspace_groups_in_workspace = self ._workspace_groups_in_workspace ()
442+ # Renaming a group is eventually consistent, and not monotonically consistent, with a rather long time to
443+ # converge: internally the Databricks API caches some things for up to 60s. To avoid excessive wait times when
444+ # large numbers of groups need to be deleted (some deployments have >10K groups) we use the following steps:
445+ # 1. Rename all the groups.
446+ # 2. Confirm for each group that direct GETs yield the new name.
447+ # 3. Confirm that group enumeration no longer includes the deleted groups.
448+ # This caution is necessary because otherwise downstream tasks (like reflect_account_groups_on_workspace()) may
449+ # skip a renamed group because it doesn't appear to be present.
443450 groups_to_migrate = self .get_migration_state ().groups
444-
451+ rename_tasks = []
452+ waiting_tasks = []
453+ renamed_groups = []
445454 logger .info (f"Starting to rename { len (groups_to_migrate )} groups for migration..." )
446455 for migrated_group in groups_to_migrate :
447456 if migrated_group .name_in_account in account_groups_in_workspace :
@@ -450,51 +459,82 @@ def rename_groups(self):
450459 if migrated_group .temporary_name in workspace_groups_in_workspace :
451460 logger .info (f"Skipping { migrated_group .name_in_workspace } : already renamed" )
452461 continue
453- logger .info (f"Renaming: { migrated_group .name_in_workspace } -> { migrated_group .temporary_name } " )
454- tasks .append (
462+ rename_tasks .append (
455463 functools .partial (
456- self ._rename_group_and_wait_for_rename ,
464+ self ._rename_group ,
457465 migrated_group .id_in_workspace ,
458466 migrated_group .name_in_workspace ,
459467 migrated_group .temporary_name ,
460468 )
461469 )
462- renamed_groups = Threads .strict ("rename groups in the workspace" , tasks )
463- # Renaming is eventually consistent, and the tasks above have each polled to verify their rename completed.
464- # Here we also check that enumeration yields the updated names; this is necessary because otherwise downstream
465- # tasks (like reflect_account_groups_on_workspace()) may skip a renamed group because it doesn't appear to be
466- # present.
470+ waiting_tasks .append (
471+ functools .partial (
472+ self ._wait_for_group_rename ,
473+ migrated_group .id_in_workspace ,
474+ migrated_group .name_in_workspace ,
475+ migrated_group .temporary_name ,
476+ )
477+ )
478+ renamed_groups .append ((migrated_group .id_in_workspace , migrated_group .temporary_name ))
479+ # Step 1: Rename all the groups.
480+ _ , errors = Threads .gather ("rename groups in the workspace" , rename_tasks )
481+ if errors :
482+ logger .error (f"During renaming of workspace groups { len (errors )} errors occurred. See debug logs." )
483+ raise ManyError (errors )
484+ # Step 2: Confirm that direct GETs yield the updated information.
485+ _ , errors = Threads .gather ("waiting for renamed groups in the workspace" , waiting_tasks )
486+ if errors :
487+ logger .error (f"While waiting for renamed workspace groups { len (errors )} errors occurred. See debug logs." )
488+ raise ManyError (errors )
489+ # Step 3: Wait for enumeration to also reflect the updated information.
467490 self ._wait_for_renamed_groups (renamed_groups )
468491
469- def _rename_group_and_wait_for_rename (self , group_id : str , old_group_name , new_group_name : str ) -> tuple [str , str ]:
470- logger .debug (f"Renaming group { group_id } : { old_group_name } -> { new_group_name } " )
471- self ._rename_group (group_id , new_group_name )
472- logger .debug (f"Waiting for group { group_id } rename to take effect: { old_group_name } -> { new_group_name } " )
473- self ._wait_for_rename (group_id , old_group_name , new_group_name )
474- return group_id , new_group_name
492+ def _rename_group (self , group_id : str , old_group_name : str , new_group_name : str ) -> None :
493+ logger .debug (f"Renaming group: { old_group_name } (id={ group_id } ) -> { new_group_name } " )
494+ self ._rate_limited_rename_group_with_retry (group_id , new_group_name )
475495
476496 @retried (on = [InternalError , ResourceConflict , DeadlineExceeded ])
477497 @rate_limited (max_requests = 10 , burst_period_seconds = 60 )
478- def _rename_group (self , group_id : str , new_group_name : str ) -> None :
498+ def _rate_limited_rename_group_with_retry (self , group_id : str , new_group_name : str ) -> None :
479499 ops = [iam .Patch (iam .PatchOp .REPLACE , "displayName" , new_group_name )]
480500 self ._ws .groups .patch (group_id , operations = ops )
481501
482- @retried (on = [GroupRenameIncompleteError ], timeout = timedelta (minutes = 2 ))
483- def _wait_for_rename (self , group_id : str , old_group_name : str , new_group_name : str ) -> None :
502+ @retried (on = [GroupRenameIncompleteError ], timeout = timedelta (seconds = 90 ))
503+ def _wait_for_group_rename (self , group_id : str , old_group_name : str , new_group_name : str ) -> None :
504+ # The groups API is eventually consistent, but not monotonically consistent. Here we verify that the effects of
505+ # the rename have taken effect, and try to compensate for the lack of monotonic consistency by requiring two
506+ # subsequent calls to confirm the rename. REST API internals cache things for up to 60s, and we see times close
507+ # to this during testing. The retry timeout reflects this: if it's taking much longer then something else is
508+ # wrong.
509+ self ._check_group_rename (group_id , old_group_name , new_group_name , logging .DEBUG )
510+ self ._check_group_rename (group_id , old_group_name , new_group_name , logging .WARNING )
511+ logger .debug (f"Group rename is assumed complete: { old_group_name } (id={ group_id } ) -> { new_group_name } " )
512+
513+ def _check_group_rename (self , group_id , old_group_name : str , new_group_name : str , pending_log_level : int ) -> None :
484514 group = self ._ws .groups .get (group_id )
485515 if group .display_name == old_group_name :
486- logger .debug (
487- f"Group { group_id } still has old name; still waiting for rename to take effect: { old_group_name } -> { new_group_name } "
516+ logger .log (
517+ pending_log_level ,
518+ f"Group still has old name; still waiting for rename to take effect: { old_group_name } (id={ group_id } ) -> { new_group_name } " ,
488519 )
489520 raise GroupRenameIncompleteError (group_id , old_group_name , new_group_name )
490521 if group .display_name != new_group_name :
491522 # Group has an entirely unexpected name; something else is interfering.
492- msg = f"While waiting for group { group_id } rename ({ old_group_name } -> { new_group_name } ) an unexpected name was observed: { group .display_name } "
523+ msg = f"While waiting for group rename ({ old_group_name } (id= { group_id } ) -> { new_group_name } ) an unexpected name was observed: { group .display_name } "
493524 raise RuntimeError (msg )
494- logger .debug (f"Group { group_id } rename has taken effect: { old_group_name } -> { new_group_name } " )
525+ logger .debug (f"Group rename has possibly taken effect: { old_group_name } (id= { group_id } ) -> { new_group_name } " )
495526
496527 @retried (on = [ManyError ], timeout = timedelta (minutes = 2 ))
497528 def _wait_for_renamed_groups (self , expected_groups : Collection [tuple [str , str ]]) -> None :
529+ # The groups API is eventually consistent, but not monotonically consistent. Here we verify that the group
530+ # has been deleted, and try to compensate for the lack of monotonic consistency by requiring two subsequent
531+ # calls to confirm deletion. REST API internals cache things for up to 60s, and we see times close to this
532+ # during testing. The retry timeout reflects this: if it's taking much longer then something else is wrong.
533+ self ._check_for_renamed_groups (expected_groups , logging .DEBUG )
534+ self ._check_for_renamed_groups (expected_groups , logging .WARNING )
535+ logger .debug (f"Group enumeration showed all { len (expected_groups )} renamed groups; assuming complete." )
536+
537+ def _check_for_renamed_groups (self , expected_groups : Collection [tuple [str , str ]], pending_log_level : int ) -> None :
498538 attributes = "id,displayName"
499539 found_groups = {
500540 group .id : group .display_name
@@ -508,8 +548,9 @@ def _wait_for_renamed_groups(self, expected_groups: Collection[tuple[str, str]])
508548 logger .warning (f"Group enumeration omits renamed group: { group_id } (renamed to { expected_name } )" )
509549 pending_renames .append (RuntimeError (f"Missing group with id: { group_id } (renamed to { expected_name } )" ))
510550 elif found_name != expected_name :
511- logger .debug (
512- f"Group enumeration does not yet reflect rename: { group_id } (renamed to { expected_name } but currently { found_name } )"
551+ logger .log (
552+ pending_log_level ,
553+ f"Group enumeration does not yet reflect rename: { group_id } (renamed to { expected_name } but currently { found_name } )" ,
513554 )
514555 pending_renames .append (GroupRenameIncompleteError (group_id , found_name , expected_name ))
515556 else :
@@ -544,11 +585,11 @@ def delete_original_workspace_groups(self):
544585 migrated_groups = self .snapshot ()
545586 logger .info (f"Starting to remove { len (migrated_groups )} migrated workspace groups..." )
546587 # Group deletion is eventually consistent, and not monotonically consistent, with a rather long time to
547- # converge: internally API caches some things for up to 60s. To avoid excessive wait times when large numbers of
548- # groups need to be deleted (some deployments have >10K groups) we use the following steps:
549- # 1. Delete the groups.
550- # 2. Confirm that direct GETs no longer see the group.
551- # 3. Confirm that account enumeration no longer includes the deleted groups.
588+ # converge: internally the Databricks API caches some things for up to 60s. To avoid excessive wait times when
589+ # large numbers of groups need to be deleted (some deployments have >10K groups) we use the following steps:
590+ # 1. Delete all the groups.
591+ # 2. Confirm for each group that direct GETs no longer see the group.
592+ # 3. Confirm that group enumeration no longer includes the deleted groups.
552593 deletion_tasks = []
553594 waiting_tasks = []
554595 deleted_groups = []
@@ -573,12 +614,12 @@ def delete_original_workspace_groups(self):
573614 deleted_groups .append (migrated_group )
574615 # Step 1: Delete the groups.
575616 _ , errors = Threads .gather ("removing original workspace groups" , deletion_tasks )
576- if len ( errors ) > 0 :
617+ if errors :
577618 logger .error (f"During deletion of workspace groups got { len (errors )} errors. See debug logs." )
578619 raise ManyError (errors )
579- # Step 2: Confirm that direct gets no longer return the deleted group.
620+ # Step 2: Confirm that direct GETs no longer return the deleted group.
580621 _ , errors = Threads .gather ("waiting for removal of original workspace groups" , waiting_tasks )
581- if len ( errors ) > 0 :
622+ if errors :
582623 logger .error (f"Waiting for deletion of workspace groups got { len (errors )} errors. See debug logs." )
583624 raise ManyError (errors )
584625 # Step 3: Confirm that enumeration no longer returns the deleted groups.
@@ -786,11 +827,11 @@ def _wait_for_workspace_group_deletion(self, group_id: str, display_name: str) -
786827 self ._check_workspace_group_deletion (group_id , display_name , logging .WARNING )
787828 logger .debug (f"Workspace group is assumed deleted: { display_name } (id={ group_id } )" )
788829
789- def _check_workspace_group_deletion (self , group_id : str , display_name : str , still_present_level_level : int ) -> None :
830+ def _check_workspace_group_deletion (self , group_id : str , display_name : str , pending_log_level : int ) -> None :
790831 try :
791832 _ = self ._ws .groups .get (id = group_id )
792833 logger .log (
793- still_present_level_level ,
834+ pending_log_level ,
794835 f"Deleted group is still present; still waiting for deletion to take effect: { display_name } (id={ group_id } )" ,
795836 )
796837 # Deletion is still pending.
0 commit comments