
Commit e06fd44

Added used tables in assessment dashboard (#2836)
## Changes

Adds a widget in the assessment dashboard for displaying used tables.

### Linked issues

None

### Functionality

- [x] added some UI

### Tests

- [x] verified on staging environment: https://github.com/user-attachments/assets/64de0164-26dd-4470-b51d-b564266c8604

Co-authored-by: Eric Vergnaud <[email protected]>
1 parent 444d474 commit e06fd44

File tree

5 files changed, +128 -12 lines changed


src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md

Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,6 @@ As a reminder, `dbfs:/` is not supported in Unity Catalog, and more generally di
 Rather, data should be accessed via Unity tables.
 
 Each row:
-- Points to a direct filesystem access detected in the code using the code path, query or workflow & task reference and start/end line & column;
+- Points to a direct filesystem access detected in the code using the code path, query or workflow & task reference;
 - Provides the _lineage_ i.e. which `workflow -> task -> notebook...` execution sequence leads to that access.

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+---
+height: 4
+---
+
+# Used tables
+
+The table below assists with verifying if workflows and dashboards utilize legacy tables.
+
+Each row:
+- Points to a legacy table detected in the code using the code path, query or workflow & task reference;
+- Provides the _lineage_ i.e. which `workflow -> task -> notebook...` execution sequence leads to that usage.
+
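For orientation, a single row of this widget might carry values like the following; the field names follow the widget query in the next file, and the values echo the test fixtures added later in this commit.

```python
# Hypothetical example of one "Used tables" row (field names from the widget query, values from the test fixtures).
example_row = {
    "fullname": "hive_metastore.staff_db.employees",  # legacy table as catalog.schema.table
    "is_read": False,
    "is_write": True,
    "source": "xyz.py",                                # code path or query that references the table
    "lineage": "Workflow: my_workflow",                # first hop of the workflow -> task -> notebook... chain
}
```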
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+/*
+--title 'Used tables'
+--width 6
+--overrides '{"spec":{
+"encodings":{
+"columns": [
+{"fieldName": "fullname", "title": "table", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
+{"fieldName": "is_read", "title": "is_read", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]},
+{"fieldName": "is_write", "title": "is_write", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]},
+{"fieldName": "source", "title": "source", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ source_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]},
+{"fieldName": "timestamp", "title": "last_modified", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]},
+{"fieldName": "lineage", "title": "lineage", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ lineage_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]},
+{"fieldName": "lineage_data", "title": "lineage_data", "type": "complex", "displayAs": "json", "booleanValues": ["false", "true"]},
+{"fieldName": "assessment_start", "title": "assessment_start", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]},
+{"fieldName": "assessment_end", "title": "assessment_end", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]}
+]},
+"invisibleColumns": [
+{"fieldName": "source_link", "title": "source_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
+{"fieldName": "lineage_type", "title": "lineage_type", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
+{"fieldName": "lineage_id", "title": "lineage_id", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
+{"fieldName": "lineage_link", "title": "lineage_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}
+]
+}}'
+*/
+SELECT
+fullname,
+is_read,
+is_write,
+if( startswith(source_id, '/'), substring_index(source_id, '@databricks.com/', -1), split_part(source_id, '/', 2)) as source,
+if( startswith(source_id, '/'), concat('/#workspace/', source_id), concat('/sql/editor/', split_part(source_id, '/', 2))) as source_link,
+source_timestamp as `timestamp`,
+case
+when lineage.object_type = 'WORKFLOW' then concat('Workflow: ', lineage.other.name)
+when lineage.object_type = 'TASK' then concat('Task: ', split_part(lineage.object_id, '/', 2))
+when lineage.object_type = 'NOTEBOOK' then concat('Notebook: ', substring_index(lineage.object_id, '@databricks.com/', -1))
+when lineage.object_type = 'FILE' then concat('File: ', substring_index(lineage.object_id, '@databricks.com/', -1))
+when lineage.object_type = 'DASHBOARD' then concat('Dashboard: ', lineage.other.name)
+when lineage.object_type = 'QUERY' then concat('Query: ', lineage.other.name)
+end as lineage,
+lineage.object_type as lineage_type,
+lineage.object_id as lineage_id,
+case
+when lineage.object_type = 'WORKFLOW' then concat('/jobs/', lineage.object_id)
+when lineage.object_type = 'TASK' then concat('/jobs/', split_part(lineage.object_id, '/', 1), '/tasks/', split_part(lineage.object_id, '/', 2))
+when lineage.object_type = 'NOTEBOOK' then concat('/#workspace/', lineage.object_id)
+when lineage.object_type = 'FILE' then concat('/#workspace/', lineage.object_id)
+when lineage.object_type = 'DASHBOARD' then concat('/sql/dashboards/', lineage.object_id)
+when lineage.object_type = 'QUERY' then concat('/sql/editor/', split_part(lineage.object_id, '/', 2))
+end as lineage_link,
+lineage.other as lineage_data,
+assessment_start,
+assessment_end
+from (SELECT
+concat( catalog_name, '.', schema_name, '.', table_name) as fullname,
+is_read,
+is_write,
+source_id,
+source_timestamp,
+explode(source_lineage) as lineage,
+assessment_start_timestamp as assessment_start,
+assessment_end_timestamp as assessment_end
+FROM inventory.used_tables)
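The `source` and `source_link` expressions branch on the shape of `source_id`: workspace paths start with `/`, while legacy query ids have the form `<dashboard_id>/<query_id>`. A minimal Python sketch of the same branching, with invented inputs (the widget itself does this in SQL via `startswith`, `substring_index` and `split_part`):

```python
def widget_source_fields(source_id: str) -> tuple[str, str]:
    """Sketch of the widget's `source` / `source_link` derivation (illustrative, not the shipped SQL)."""
    if source_id.startswith("/"):
        # Workspace file: display the part after the user's e-mail prefix, link into the workspace browser.
        display = source_id.rsplit("@databricks.com/", 1)[-1]
        link = f"/#workspace/{source_id}"
    else:
        # Legacy query: display the query id (second '/'-separated segment), link to the SQL editor.
        parts = source_id.split("/")
        query_id = parts[1] if len(parts) > 1 else ""
        display, link = query_id, f"/sql/editor/{query_id}"
    return display, link


# Invented examples of the two id shapes the query distinguishes:
print(widget_source_fields("/Users/jane.doe@databricks.com/notebooks/etl.py"))
print(widget_source_fields("my_dashboard_id/my_query_id"))
```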

src/databricks/labs/ucx/source_code/queries.py

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 import dataclasses
 import logging
 from collections.abc import Iterable
-from dataclasses import asdict, dataclass, field
+from dataclasses import dataclass, field
 from datetime import datetime, timezone
 
 from databricks.sdk import WorkspaceClient
@@ -208,7 +208,7 @@ def collect_dfsas_from_query(self, dashboard_id: str, query: LegacyQuery) -> Ite
         source_timestamp = self._read_timestamp(query.updated_at)
         source_lineage = [LineageAtom(object_type="QUERY", object_id=source_id, other={"name": source_name})]
         for dfsa in collector.collect_dfsas(query.query):
-            yield DirectFsAccess(**asdict(dfsa)).replace_source(
+            yield dfsa.replace_source(
                 source_id=source_id, source_timestamp=source_timestamp, source_lineage=source_lineage
             )
 
@@ -222,7 +222,7 @@ def collect_used_tables_from_query(self, dashboard_id: str, query: LegacyQuery)
         source_timestamp = self._read_timestamp(query.updated_at)
         source_lineage = [LineageAtom(object_type="QUERY", object_id=source_id, other={"name": source_name})]
         for table in collector.collect_tables(query.query):
-            yield UsedTable(**asdict(table)).replace_source(
+            yield table.replace_source(
                 source_id=source_id, source_timestamp=source_timestamp, source_lineage=source_lineage
             )
 
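Both hunks drop the `DirectFsAccess(**asdict(dfsa))` / `UsedTable(**asdict(table))` copy and call `replace_source` on the collected record directly, which is also why the `asdict` import could be removed above. As a rough illustration of the copy-with-overrides pattern behind such a method (a sketch only, not the actual UCX classes), assuming the records are dataclasses:

```python
from dataclasses import dataclass, field, replace
from datetime import datetime


@dataclass
class SourcedRecord:
    """Hypothetical stand-in for a collected record that carries source metadata."""

    source_id: str = "UNKNOWN"
    source_timestamp: datetime | None = None
    source_lineage: list = field(default_factory=list)

    def replace_source(self, *, source_id: str, source_timestamp: datetime, source_lineage: list):
        # Return a copy with the source fields overridden; all other fields are preserved.
        return replace(
            self,
            source_id=source_id,
            source_timestamp=source_timestamp,
            source_lineage=source_lineage,
        )
```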

tests/integration/assessment/test_dashboards.py

Lines changed: 50 additions & 8 deletions
@@ -2,7 +2,7 @@
 
 import pytest
 
-from databricks.labs.ucx.source_code.base import LineageAtom
+from databricks.labs.ucx.source_code.base import LineageAtom, UsedTable
 from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess
 from databricks.labs.ucx.source_code.jobs import JobProblem
 from databricks.sdk.service.iam import PermissionLevel
@@ -63,10 +63,10 @@ def _populate_directfs_problems(installation_ctx):
             source_id="xyz.py",
             source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0),
             source_lineage=[
-                LineageAtom(object_type="WORKFLOW", object_id="my_workflow"),
-                LineageAtom(object_type="TASK", object_id="my_workflow/my_task"),
-                LineageAtom(object_type="NOTEBOOK", object_id="my_notebook"),
-                LineageAtom(object_type="FILE", object_id="my file"),
+                LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}),
+                LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"),
+                LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
+                LineageAtom(object_type="FILE", object_id="my file_path"),
             ],
             assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0),
             assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0),
@@ -81,8 +81,8 @@ def _populate_directfs_problems(installation_ctx):
             source_id="xyz.py",
             source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0),
             source_lineage=[
-                LineageAtom(object_type="DASHBOARD", object_id="my_dashboard"),
-                LineageAtom(object_type="QUERY", object_id="my_dashboard/my_query"),
+                LineageAtom(object_type="DASHBOARD", object_id="my_dashboard_id", other={"name": "my_dashboard"}),
+                LineageAtom(object_type="QUERY", object_id="my_dashboard_id/my_query_id", other={"name": "my_query"}),
             ],
             assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0),
             assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0),
@@ -91,6 +91,47 @@ def _populate_directfs_problems(installation_ctx):
     installation_ctx.directfs_access_crawler_for_queries.dump_all(dfsas)
 
 
+def _populate_used_tables(installation_ctx):
+    tables = [
+        UsedTable(
+            catalog_name="hive_metastore",
+            schema_name="staff_db",
+            table_name="employees",
+            is_read=False,
+            is_write=True,
+            source_id="xyz.py",
+            source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0),
+            source_lineage=[
+                LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}),
+                LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"),
+                LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
+                LineageAtom(object_type="FILE", object_id="my file_path"),
+            ],
+            assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0),
+            assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0),
+        )
+    ]
+    installation_ctx.used_tables_crawler_for_paths.dump_all(tables)
+    tables = [
+        UsedTable(
+            catalog_name="hive_metastore",
+            schema_name="customers_db",
+            table_name="customers",
+            is_read=False,
+            is_write=True,
+            source_id="xyz.py",
+            source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0),
+            source_lineage=[
+                LineageAtom(object_type="DASHBOARD", object_id="my_dashboard_id", other={"name": "my_dashboard"}),
+                LineageAtom(object_type="QUERY", object_id="my_dashboard_id/my_query_id", other={"name": "my_query"}),
+            ],
+            assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0),
+            assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0),
+        )
+    ]
+    installation_ctx.used_tables_crawler_for_queries.dump_all(tables)
+
+
 @pytest.mark.skip("Development tool")
 def test_dashboard_with_prepopulated_data(installation_ctx, make_cluster_policy, make_cluster_policy_permissions):
     """the purpose of this test is to prepopulate data used by the dashboard without running an actual -lengthy- assessment"""
@@ -103,10 +144,11 @@ def test_dashboard_with_prepopulated_data(installation_ctx, make_cluster_policy,
     )
     installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"]
     installation_ctx.workspace_installation.run()
+    print(f"\nInventory database is {installation_ctx.inventory_database}\n")
     # populate data
     _populate_workflow_problems(installation_ctx)
     _populate_dashboard_problems(installation_ctx)
     _populate_directfs_problems(installation_ctx)
-    print(f"\nInventory database is {installation_ctx.inventory_database}\n")
+    _populate_used_tables(installation_ctx)
     # put a breakpoint here
     print("Put a breakpoint here! Then go check the dashboard in your workspace ;-)\n")
