Skip to content

Commit 2193127

Browse files
authored
Enable joining an existing installation to a collection (#1799)
1 parent 06854a4 commit 2193127

File tree

9 files changed

+355
-83
lines changed

9 files changed

+355
-83
lines changed

labs.yml

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ commands:
4545

4646
- name: report-account-compatibility
4747
is_account_level: true
48-
description: aggregation of UCX output of multiple workspaces in the account.
48+
description: aggregation of UCX output of multiple workspaces in the account.
4949
If --workspace-ids is not provided, it will use all workspaces present in the account.
5050
flags:
5151
- name: workspace-ids
@@ -95,7 +95,7 @@ commands:
9595

9696
- name: alias
9797
description: |
98-
alias tables across schema/catalog withing a UC metastore
98+
alias tables across schema/catalog withing a UC metastore
9999
create a view pointing to the "from" table
100100
if a view is aliased, recreates the same view in the target schema/catalog
101101
flags:
@@ -111,9 +111,9 @@ commands:
111111
description: target schema to migrate tables to
112112

113113
- name: principal-prefix-access
114-
description: For azure cloud, identifies all storage account used by tables in the workspace, identify spn and its
115-
permission on each storage accounts. For aws, identifies all the Instance Profiles configured in the workspace and
116-
its access to all the S3 buckets, along with AWS roles that are set with UC access and its access to S3 buckets.
114+
description: For azure cloud, identifies all storage account used by tables in the workspace, identify spn and its
115+
permission on each storage accounts. For aws, identifies all the Instance Profiles configured in the workspace and
116+
its access to all the S3 buckets, along with AWS roles that are set with UC access and its access to S3 buckets.
117117
The output is stored in the workspace install folder.
118118
flags:
119119
- name: subscription-id
@@ -122,8 +122,8 @@ commands:
122122
description: AWS Profile to use for authentication
123123

124124
- name: create-missing-principals
125-
description: For AWS, this command identifies all the S3 locations that are missing a UC compatible role and
126-
creates them. It accepts a number of optional parameters, i.e. KMS Key, Role Name, Policy Name, and whether to
125+
description: For AWS, this command identifies all the S3 locations that are missing a UC compatible role and
126+
creates them. It accepts a number of optional parameters, i.e. KMS Key, Role Name, Policy Name, and whether to
127127
create a single role for all the S3 locations.
128128
flags:
129129
- name: aws-profile
@@ -138,7 +138,7 @@ commands:
138138
description: (Optional) Create a single role for all S3 locations. (default:False)
139139

140140
- name: create-uber-principal
141-
description: For azure cloud, creates a service principal and gives STORAGE BLOB READER access on all the storage account
141+
description: For azure cloud, creates a service principal and gives STORAGE BLOB READER access on all the storage account
142142
used by tables in the workspace and stores the spn info in the UCX cluster policy. For aws,
143143
it identifies all s3 buckets used by the Instance Profiles configured in the workspace.
144144
flags:
@@ -215,7 +215,7 @@ commands:
215215

216216
- name: migrate-tables
217217
description: |
218-
Trigger the migrate-tables workflow and, optionally, migrate-external-hiveserde-tables-in-place-experimental
218+
Trigger the migrate-tables workflow and, optionally, migrate-external-hiveserde-tables-in-place-experimental
219219
workflow and migrate-external-tables-ctas workflow.
220220
221221
- name: migrate-dbsql-dashboards
@@ -229,3 +229,12 @@ commands:
229229
flags:
230230
- name: dashboard-id
231231
description: (Optional) DBSQL dashboard ID to revert. If no dashboard ID is provided, all migrated DBSQL dashboards in the workspace will be reverted.
232+
233+
- name: join-collection
234+
is_account_level: true
235+
description: workspace_id to join a collection.
236+
flags:
237+
- name: workspace-id
238+
description: workspace_id which should join a collection.
239+
- name: target_workspace-id
240+
description: (Optional) id of a workspace in the target collection. If not specified, ucx will prompt to select from a list

src/databricks/labs/ucx/account/workspaces.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,11 @@ def get_accessible_workspaces(self) -> list[Workspace]:
9393
"""
9494
accessible_workspaces = []
9595
for workspace in self._ac.workspaces.list():
96-
if self._can_administer(workspace):
96+
if self.can_administer(workspace):
9797
accessible_workspaces.append(workspace)
9898
return accessible_workspaces
9999

100-
def _can_administer(self, workspace: Workspace) -> bool:
100+
def can_administer(self, workspace: Workspace) -> bool:
101101
try:
102102
# check if user has access to workspace
103103
ws = self.client_for(workspace)
@@ -252,7 +252,7 @@ def __init__(self, installation: Installation, ws: WorkspaceClient):
252252
self._installation = installation
253253
self._ws = ws
254254

255-
def _load_workspace_info(self) -> dict[int, Workspace]:
255+
def load_workspace_info(self) -> dict[int, Workspace]:
256256
try:
257257
id_to_workspace = {}
258258
for workspace in self._installation.load(list[Workspace], filename=AccountWorkspaces.SYNC_FILE_NAME):
@@ -265,7 +265,7 @@ def _load_workspace_info(self) -> dict[int, Workspace]:
265265

266266
def current(self) -> str:
267267
workspace_id = self._ws.get_workspace_id()
268-
workspaces = self._load_workspace_info()
268+
workspaces = self.load_workspace_info()
269269
if workspace_id not in workspaces:
270270
msg = f"Current workspace is not known: {workspace_id}"
271271
raise KeyError(msg) from None

src/databricks/labs/ucx/cli.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from databricks.labs.ucx.contexts.account_cli import AccountContext
1414
from databricks.labs.ucx.contexts.workspace_cli import WorkspaceContext, LocalCheckoutContext
1515
from databricks.labs.ucx.hive_metastore.tables import What
16+
from databricks.labs.ucx.install import AccountInstaller
1617

1718
ucx = App(__file__)
1819
logger = get_logger(__file__)
@@ -471,6 +472,13 @@ def revert_dbsql_dashboards(w: WorkspaceClient, dashboard_id: str | None = None)
471472
ctx.redash.revert_dashboards(dashboard_id)
472473

473474

475+
@ucx.command(is_account=True)
476+
def join_collection(a: AccountClient, workspace_id: int, join_workspace_id: int):
477+
"""joins the workspace to an existing collection"""
478+
account_installer = AccountInstaller(a)
479+
account_installer.join_collection(workspace_id, join_workspace_id)
480+
481+
474482
@ucx.command
475483
def lint_local_code(
476484
w: WorkspaceClient, prompts: Prompts, path: str | None = None, ctx: LocalCheckoutContext | None = None

src/databricks/labs/ucx/install.py

Lines changed: 105 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
)
4141

4242
from databricks.labs.ucx.__about__ import __version__
43+
from databricks.labs.ucx.account.workspaces import WorkspaceInfo
4344
from databricks.labs.ucx.assessment.azure import AzureServicePrincipalInfo
4445
from databricks.labs.ucx.assessment.clusters import ClusterInfo, PolicyInfo
4546
from databricks.labs.ucx.assessment.init_scripts import GlobalInitScriptInfo
@@ -663,34 +664,94 @@ def install_on_account(self):
663664
# upload the json dump of workspace info in the .ucx folder
664665
ctx.account_workspaces.sync_workspace_info(installed_workspaces)
665666

667+
@staticmethod
668+
def _get_workspace(workspace_id: int, ids_to_workspace: dict[int, Workspace]) -> Workspace:
669+
try:
670+
workspace = ids_to_workspace[workspace_id]
671+
return workspace
672+
except KeyError:
673+
msg = "Current workspace is not known, Please run as account-admin: databricks labs ucx sync-workspace-info"
674+
raise KeyError(msg) from None
675+
676+
def _get_workspace_info(self, current_workspace_id: int):
677+
account_client = self._get_safe_account_client()
678+
workspace = account_client.workspaces.get(current_workspace_id)
679+
current_workspace_client = account_client.get_workspace_client(workspace)
680+
installation = Installation.current(current_workspace_client, self.product_info.product_name())
681+
workspace_info = WorkspaceInfo(installation, current_workspace_client)
682+
return workspace_info.load_workspace_info()
683+
666684
def join_collection(
667685
self,
668686
current_workspace_id: int,
687+
target_workspace_id: int | None = None,
669688
):
670-
if not self.is_account_install and self.prompts.confirm(
671-
"Do you want to join the current installation to an existing collection?"
672-
):
673-
674-
installed_workspaces: list[Workspace] | None = []
675-
accessible_workspaces: list[Workspace] = []
676-
account_client = self._get_safe_account_client()
677-
ctx = AccountContext(account_client)
678-
try:
679-
accessible_workspaces = ctx.account_workspaces.get_accessible_workspaces()
680-
except PermissionDenied:
681-
logger.warning("User doesnt have account admin permission, cant join a collection, skipping...")
682-
collection_workspace = self._get_collection_workspace(accessible_workspaces, account_client)
683-
if collection_workspace is not None:
684-
installed_workspaces = self._sync_collection(collection_workspace, current_workspace_id, account_client)
685-
if installed_workspaces is not None:
686-
ctx.account_workspaces.sync_workspace_info(installed_workspaces)
689+
if self.is_account_install:
690+
# skip joining collection when installer is running for all account workspaces
691+
return None
692+
collection_workspace: Workspace
693+
account_client = self._get_safe_account_client()
694+
ctx = AccountContext(account_client)
695+
ids_to_workspace = self._get_workspace_info(current_workspace_id)
696+
if target_workspace_id is None:
697+
if self.prompts.confirm("Do you want to join the current installation to an existing collection?"):
698+
# If joining a collection as part of the installation then collection_workspace_id would be empty
699+
try:
700+
# if user is account admin list and show available workspaces to select from
701+
accessible_workspaces = ctx.account_workspaces.get_accessible_workspaces()
702+
target_workspace = self._get_collection_workspace(
703+
accessible_workspaces,
704+
account_client,
705+
)
706+
assert target_workspace is not None
707+
target_workspace_id = target_workspace.workspace_id
708+
709+
except PermissionDenied:
710+
# if the user is not account admin, allow user to enter the workspace_id to join as collection.
711+
# if no workspace_id is entered, then exit
712+
logger.warning("User doesnt have account admin permission, cant list workspaces")
713+
target_workspace_id = int(
714+
self.prompts.question(
715+
"Please enter, the workspace id to join as a collection (enter 0 to skip it)",
716+
valid_number=True,
717+
default="0",
718+
)
719+
)
720+
else:
721+
return None
722+
if target_workspace_id == 0 or target_workspace_id is None:
723+
# if user didn't enter workspace id
724+
logger.info("Skipping joining collection...")
725+
return None
726+
# below code is executed if either joining an existing collection (through the cli)
727+
# or selecting one while installing
728+
collection_workspace = AccountInstaller._get_workspace(
729+
target_workspace_id,
730+
ids_to_workspace,
731+
)
732+
if not ctx.account_workspaces.can_administer(collection_workspace):
733+
# if user is not workspace admin on the workspace to join as collection then exit
734+
logger.error(
735+
f"User doesnt have admin access on the workspace {target_workspace_id}, " f"cant join collection."
736+
)
737+
return None
738+
if collection_workspace is not None:
739+
self._sync_collection(
740+
collection_workspace,
741+
current_workspace_id,
742+
ids_to_workspace,
743+
)
744+
return None
687745

688746
def _sync_collection(
689747
self,
690748
collection_workspace: Workspace,
691749
current_workspace_id: int,
692-
account_client: AccountClient,
750+
ids_to_workspace: dict[int, Workspace],
693751
) -> list[Workspace] | None:
752+
# gets the list of existing collection of workspace from the config
753+
# checks if user is workspace admin on all the workspace id, if yes
754+
# then joins the collection and syncs the config with all the affected workspaces
694755
installer = self._get_installer(collection_workspace)
695756
installed_workspace_ids = installer.config.installed_workspace_ids
696757
if installed_workspace_ids is None:
@@ -701,9 +762,20 @@ def _sync_collection(
701762
)
702763
installed_workspace_ids.append(current_workspace_id)
703764
installed_workspaces = []
704-
for account_workspace in account_client.workspaces.list():
705-
if account_workspace.workspace_id in installed_workspace_ids:
706-
installed_workspaces.append(account_workspace)
765+
ctx = AccountContext(self._get_safe_account_client())
766+
for workspace_id in installed_workspace_ids:
767+
workspace = AccountInstaller._get_workspace(
768+
workspace_id,
769+
ids_to_workspace,
770+
)
771+
installed_workspaces.append(workspace)
772+
if not ctx.account_workspaces.can_administer(workspace):
773+
774+
logger.error(
775+
f"User doesnt have admin access on the workspace {workspace_id} in the collection, "
776+
f"cant join collection."
777+
)
778+
return None
707779

708780
for installed_workspace in installed_workspaces:
709781
installer = self._get_installer(installed_workspace)
@@ -715,10 +787,15 @@ def _get_collection_workspace(
715787
accessible_workspaces: list[Workspace],
716788
account_client: AccountClient,
717789
) -> Workspace | None:
790+
# iterate through each workspace and select workspace which has existing ucx installation
791+
# allow user to select from the eligible workspace to join as collection
718792
installed_workspaces = []
719793
for workspace in accessible_workspaces:
720794
workspace_client = account_client.get_workspace_client(workspace)
721-
workspace_installation = Installation.existing(workspace_client, self.product_info.product_name())
795+
workspace_installation = Installation.existing(
796+
workspace_client,
797+
self.product_info.product_name(),
798+
)
722799
if len(workspace_installation) > 0:
723800
installed_workspaces.append(workspace)
724801

@@ -730,12 +807,12 @@ def _get_collection_workspace(
730807
for workspace in installed_workspaces
731808
if workspace.deployment_name is not None
732809
}
733-
workspace = self.prompts.choice_from_dict(
810+
target_workspace = self.prompts.choice_from_dict(
734811
"Please select a workspace, the current installation of ucx will be grouped as a "
735812
"collection with the selected workspace",
736813
workspaces,
737814
)
738-
return workspace
815+
return target_workspace
739816

740817

741818
if __name__ == "__main__":
@@ -749,5 +826,8 @@ def _get_collection_workspace(
749826
account_installer.install_on_account()
750827
else:
751828
workspace_installer = WorkspaceInstaller(WorkspaceClient(product="ucx", product_version=__version__))
829+
752830
workspace_installer.run()
753-
account_installer.join_collection(workspace_installer.workspace_client.get_workspace_id())
831+
account_installer.join_collection(
832+
workspace_installer.workspace_client.get_workspace_id(),
833+
)

tests/integration/conftest.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from databricks.sdk.service.iam import Group
2020

2121
from databricks.labs.ucx.__about__ import __version__
22+
from databricks.labs.ucx.account.workspaces import AccountWorkspaces
2223
from databricks.labs.ucx.assessment.aws import AWSRoleAction, run_command
2324
from databricks.labs.ucx.assessment.azure import (
2425
AzureServicePrincipalCrawler,
@@ -273,6 +274,9 @@ def __init__(
273274
def with_table_mapping_rules(self, rules):
274275
self.installation.save(rules, filename=TableMapping.FILENAME)
275276

277+
def with_workspace_info(self, workspace_info):
278+
self.installation.save(workspace_info, filename=AccountWorkspaces.SYNC_FILE_NAME)
279+
276280
def make_schema(self, **kwargs):
277281
schema_info = self._make_schema(**kwargs)
278282
self._schemas.append(schema_info)
@@ -764,18 +768,18 @@ def prepare_hiveserde_tables(context, random, schema, table_base_dir) -> dict[st
764768
)
765769

766770
avro_table_name = f"avro_serde_{random}"
767-
avro_ddl = f"""CREATE TABLE hive_metastore.{schema.name}.{avro_table_name} (id INT, region STRING)
771+
avro_ddl = f"""CREATE TABLE hive_metastore.{schema.name}.{avro_table_name} (id INT, region STRING)
768772
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
769773
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
770774
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
771775
TBLPROPERTIES ('avro.schema.literal'='{{
772-
"namespace": "org.apache.hive",
773-
"name": "first_schema",
776+
"namespace": "org.apache.hive",
777+
"name": "first_schema",
774778
"type": "record",
775779
"fields": [
776780
{{ "name":"id", "type":"int" }},
777781
{{ "name":"region", "type":"string" }}
778-
] }}')
782+
] }}')
779783
LOCATION '{table_base_dir}/{avro_table_name}'
780784
"""
781785
tables[avro_table_name] = context.make_table(

tests/integration/test_installation.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,21 @@
77
from datetime import timedelta
88

99
import pytest # pylint: disable=wrong-import-order
10+
from databricks.labs.ucx.__about__ import __version__
11+
1012
from databricks.labs.blueprint.installation import Installation
1113
from databricks.labs.blueprint.installer import RawState
1214
from databricks.labs.blueprint.parallel import ManyError
1315
from databricks.labs.blueprint.tui import MockPrompts
1416
from databricks.labs.blueprint.wheels import ProductInfo
17+
from databricks.sdk import AccountClient
1518
from databricks.labs.lsql.backends import StatementExecutionBackend
1619
from databricks.sdk.errors import (
1720
AlreadyExists,
1821
InvalidParameterValue,
1922
NotFound,
2023
)
24+
2125
from databricks.sdk.retries import retried
2226
from databricks.sdk.service import compute
2327
from databricks.sdk.service.iam import PermissionLevel
@@ -453,7 +457,13 @@ def test_compare_remote_local_install_versions(ws, installation_ctx):
453457

454458

455459
def test_new_collection(ws, sql_backend, installation_ctx, env_or_skip):
456-
installation_ctx.workspace_installation.run()
460+
host = ws.config.environment.deployment_url("accounts")
461+
acc_client = AccountClient(
462+
host=host, account_id=env_or_skip("DATABRICKS_ACCOUNT_ID"), product='ucx', product_version=__version__
463+
)
464+
workspace = acc_client.workspaces.get(ws.get_workspace_id())
465+
installation_ctx.with_workspace_info([workspace])
466+
installation_ctx.workspace_installer.run()
457467
workspace_id = installation_ctx.workspace_installer.workspace_client.get_workspace_id()
458468
acc_installer = installation_ctx.account_installer
459469
prompts = MockPrompts(

0 commit comments

Comments
 (0)