Commit f6bd734

Estimate migration effort based on assessment database (#1008)
## Changes
Adds functionality to broadly estimate the migration effort, in days, for each asset crawled into the assessment database.

### Linked issues
Resolves #877

### Tasks
- [x] Add a summary widget for a global estimate per object type
- [x] Add an assumption and scope for each object type
- [x] Add a new estimates dashboard

### Tests
- [x] manually tested
1 parent dbdebb8 commit f6bd734

17 files changed (+203, -7 lines)
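For a rough intuition of what "estimate migration effort in days per object type" means here, a minimal sketch follows. It is a hypothetical illustration only: the commit itself implements the estimates as SQL views and dashboard widgets (for example the `table_estimates` view deployed in install.py below), and the object types and day weights in this sketch are made-up assumptions, not the shipped values.

```python
# Hypothetical sketch, not the commit's actual logic: the real estimates are SQL
# views/widgets over the assessment inventory. The weights below are illustrative.
EFFORT_DAYS_PER_OBJECT_TYPE = {"tables": 1.0, "jobs": 0.5, "clusters": 0.25}

def estimate_migration_days(inventory_counts: dict[str, int]) -> float:
    """Broad estimate: per-object-type count times an assumed effort weight in days."""
    return sum(
        EFFORT_DAYS_PER_OBJECT_TYPE.get(object_type, 1.0) * count
        for object_type, count in inventory_counts.items()
    )

print(estimate_migration_days({"tables": 120, "jobs": 40}))  # 140.0
```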

src/databricks/labs/ucx/framework/dashboards.py

Lines changed: 33 additions & 5 deletions
@@ -26,9 +26,10 @@
 class SimpleQuery:
     dashboard_ref: str
     name: str
-    query: str
-    viz: dict[str, str]
     widget: dict[str, str]
+    viz: dict[str, str]
+    query: str | None = None
+    text: str | None = None

     @property
     def key(self):
@@ -131,6 +132,8 @@ def validate(self):
     def _validate_folder(self, dashboard_folder, step_folder):
         dashboard_ref = f"{step_folder.stem}_{dashboard_folder.stem}".lower()
         for query in self._desired_queries(dashboard_folder, dashboard_ref):
+            if query.text:
+                continue
             try:
                 self._get_viz_options(query)
                 self._get_widget_options(query)
@@ -142,9 +145,15 @@ def _install_widget(self, query: SimpleQuery, dashboard_ref: str):
         dashboard_id = self._state.dashboards[dashboard_ref]
         widget_options = self._get_widget_options(query)
         # widgets are cleaned up every dashboard redeploy
-        widget = self._ws.dashboard_widgets.create(
-            dashboard_id, widget_options, 1, visualization_id=self._state.viz[query.key]
-        )
+        if query.query:
+            widget = self._ws.dashboard_widgets.create(
+                dashboard_id, widget_options, 1, visualization_id=self._state.viz[query.key]
+            )
+        elif query.text:
+            text = query.text[query.text.index("\n") + 1 :]
+            widget = self._ws.dashboard_widgets.create(dashboard_id, widget_options, 1, text=text)
+        else:
+            raise ValueError("Query or Text should be set")
         assert widget.id is not None
         self._state.widgets[query.key] = widget.id

@@ -245,9 +254,25 @@ def _desired_queries(self, local_folder: Path, dashboard_ref: str) -> list[SimpleQuery]:
                     widget=self._parse_magic_comment(f, "-- widget ", text),
                 )
             )
+        for f in local_folder.glob("*.md"):
+            if f.name == "README.md":
+                continue
+            text = f.read_text("utf8")
+            desired_queries.append(
+                SimpleQuery(
+                    dashboard_ref=dashboard_ref,
+                    name=f.name,
+                    text=text,
+                    widget=self._parse_magic_comment(f, "-- widget ", text),
+                    viz={},
+                )
+            )
         return desired_queries

     def _install_viz(self, query: SimpleQuery):
+        if query.text:
+            logger.debug(f"Skipping viz {query.name} because it's a text widget")
+            return None
         viz_args = self._get_viz_options(query)
         if query.key in self._state.viz:
             return self._ws.query_visualizations.update(self._state.viz[query.key], **viz_args)
@@ -265,6 +290,9 @@ def _get_viz_options(self, query: SimpleQuery):
         return viz_args

     def _install_query(self, query: SimpleQuery, dashboard_name: str, data_source_id: str, parent: str):
+        if query.text:
+            logger.debug(f"Skipping query {query.name} because it's a text widget")
+            return None
         query_meta = {
             "data_source_id": data_source_id,
             "name": f"{dashboard_name} - {query.name}",

src/databricks/labs/ucx/install.py

Lines changed: 1 addition & 0 deletions
@@ -167,6 +167,7 @@ def deploy_schema(sql_backend: SqlBackend, inventory_schema: str):
     )
     deployer.deploy_view("objects", "queries/views/objects.sql")
     deployer.deploy_view("grant_detail", "queries/views/grant_detail.sql")
+    deployer.deploy_view("table_estimates", "queries/views/table_estimates.sql")


 class WorkspaceInstaller:
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+-- widget title=Metastore assignment, row=0, col=0, size_x=2, size_y=8
+
+## 1 - Metastore assignment
+
+The first step of adopting UC is attaching your current workspace to a UC metastore.
+
+This section automatically detects whether your workspace has been attached to a UC metastore, and also detects jobs that can potentially fail when attaching the workspace to the metastore.
+
+Follow the docs below to attach your workspace to the metastore:
+
+[[AWS]](https://docs.databricks.com/en/data-governance/unity-catalog/enable-workspaces.html)
+[[Azure]](https://learn.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/enable-workspaces)
+[[GCP]](https://docs.gcp.databricks.com/data-governance/unity-catalog/enable-workspaces.html)
+
+If any incompatible submit runs have been detected, please follow the steps highlighted below:
+
+1. Find the incompatible jobs in your local orchestrator based on the object_id identified by UCX.
+2. Change the job configuration to include the following in the ClusterInfo: "data_security_mode": "NONE"
+3. Alternatively:
+   1. Create a Cluster Policy for external orchestrators and set "data_security_mode": "NONE"
+   2. Assign the Cluster Policy to Service Principals
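The widget text above asks users to set "data_security_mode": "NONE" in the ClusterInfo of externally submitted runs. As a rough illustration of where that field sits in a one-time submit-run payload, here is a sketch; the Spark version, node type, and notebook path are placeholder assumptions, not values from this commit:

```python
# Placeholder submit-run payload; the only point here is the placement of
# "data_security_mode": "NONE" - all other values are assumptions.
new_cluster = {
    "spark_version": "13.3.x-scala2.12",
    "node_type_id": "i3.xlarge",
    "num_workers": 2,
    "data_security_mode": "NONE",  # avoids the "no data security mode specified" failure flagged by UCX
}

submit_run_payload = {
    "run_name": "external-orchestrator-task",
    "tasks": [
        {
            "task_key": "main",
            "new_cluster": new_cluster,
            "notebook_task": {"notebook_path": "/Repos/example/job_notebook"},
        }
    ],
}
```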
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+-- viz type=counter, name=Metastore assigned, value_column=uc_metastore_assigned
+-- widget row=0, col=5, size_x=1, size_y=8
+SELECT case when CURRENT_METASTORE() is not null then "Metastore already assigned" else "Metastore not assigned" end as uc_metastore_assigned
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+-- viz type=table, name=Incompatible submit runs detected, columns=object_type,object_id,failure
+-- widget title=Incompatible submit runs, row=0, col=2, size_x=3, size_y=8
+SELECT * FROM
+(SELECT object_type, object_id, EXPLODE(from_json(failures, 'array<string>')) AS failure
+FROM $inventory.objects) WHERE failure = "no data security mode specified" AND object_type = "submit_runs"
+
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+-- widget title=Group migration, row=1, col=0, size_x=2, size_y=8
+
+## 2 - Group migration
+
+The second step of successfully adopting UC is migrating your workspace-local groups to the account.
+This step is relatively low risk: it is an additive operation and won't disturb your currently running pipelines.
+
+Follow these steps to successfully migrate your groups to the account:
+
+If you're not using an Identity Provider (Okta, Azure Entra, etc.):
+1. Create the groups at the account level; consider using the [create-account-groups](https://github.com/databrickslabs/ucx/blob/main/README.md#create-account-groups-command) command.
+   1. For extra safety, consider running the [validate-group-membership](https://github.com/databrickslabs/ucx/blob/main/README.md#validate-groups-membership-command) command to validate that you have the same number of groups/users in the workspace and the account.
+2. Enable SCIM at the account level.
+
+If you're using an Identity Provider:
+1. Enable SCIM at the account level.
+2. Disable SCIM at the workspace level if not done already.
+3. Trigger a sync from your IdP to the account.
+   1. To validate that all groups are properly set up for the group migration, run [validate-group-membership](https://github.com/databrickslabs/ucx/blob/main/README.md#validate-groups-membership-command).
+
+Once the account groups are set up, perform the group migration using the Group migration workflow; more information in the [docs](https://github.com/databrickslabs/ucx/blob/main/README.md#group-migration-workflow).
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+-- viz type=table, name=Workspace local groups, columns=id_in_workspace,name_in_workspace,name_in_account,temporary_name,members,entitlements,external_id,roles
+-- widget title=Workspace local groups to migrate, row=1, col=2, size_x=3, size_y=8
+SELECT * FROM $inventory.groups
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+-- viz type=counter, name=Group migration complexity, counter_label=Group migration complexity, value_column=group_migration_complexity
+-- widget row=1, col=5, size_x=1, size_y=8
+select
+case when total_groups = 0 then NULL
+when total_groups between 1 and 50 then "S"
+when total_groups between 51 and 200 then "M"
+when total_groups > 200 then "L"
+ELSE NULL
+end as group_migration_complexity from
+(SELECT count(*) as total_groups FROM $inventory.groups)
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+-- widget title=Table estimates, row=2, col=0, size_x=2, size_y=8
+## 3 - UC Data modeling
+
+The third step of a successful UC migration is defining your target data model on UC.
+This step is required in order to choose in which catalogs your existing data in the Hive Metastore will land.
+
+As a starting point, consider creating a catalog that has the same name as your workspace.
+For example, a table `database.table1` would then land in the `workspace_name.database.table1` table.
+
+The complexity factor is relative to the number of databases and tables identified during the assessment.
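As an illustration of the suggested starting point above (one catalog named after the workspace), the target objects could be created roughly like this; the catalog, schema, and table names are examples, and `spark` is assumed to be an active Spark session on a UC-enabled workspace:

```python
# Illustrative only: map hive_metastore.database.table1 to workspace_name.database.table1.
spark.sql("CREATE CATALOG IF NOT EXISTS workspace_name")
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace_name.database")
spark.sql(
    "CREATE TABLE IF NOT EXISTS workspace_name.database.table1 "
    "AS SELECT * FROM hive_metastore.database.table1"
)
```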
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+-- viz type=table, name=Tables to migrate, columns=catalog,database,name,object_type,table_format,location,view_text,upgraded_to,storage_properties
+-- widget title=Tables to migrate, row=2, col=2, size_x=3, size_y=8
+select * from $inventory.tables;
