
Commit e06fd44

Added used tables in assessment dashboard (#2836)
## Changes

Adds a widget in the assessment dashboard for displaying used tables.

### Linked issues

None

### Functionality

- [x] added some UI

### Tests

- [x] verified on staging environment: https://github.com/user-attachments/assets/64de0164-26dd-4470-b51d-b564266c8604

Co-authored-by: Eric Vergnaud <[email protected]>
1 parent 444d474 commit e06fd44

File tree

5 files changed, +128 -12 lines changed


src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md

Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,6 @@ As a reminder, `dbfs:/` is not supported in Unity Catalog, and more generally di
 Rather, data should be accessed via Unity tables.
 
 Each row:
-- Points to a direct filesystem access detected in the code using the code path, query or workflow & task reference and start/end line & column;
+- Points to a direct filesystem access detected in the code using the code path, query or workflow & task reference;
 - Provides the _lineage_ i.e. which `workflow -> task -> notebook...` execution sequence leads to that access.

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+---
+height: 4
+---
+
+# Used tables
+
+The table below assists with verifying if workflows and dashboards utilize legacy tables.
+
+Each row:
+- Points to a legacy table detected in the code using the code path, query or workflow & task reference;
+- Provides the _lineage_ i.e. which `workflow -> task -> notebook...` execution sequence leads to that usage.
+
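For orientation, a single row of this widget might carry values like the following; the field names follow the widget query in the next file, and the values echo the test fixtures added later in this commit.

```python
# Hypothetical example of one "Used tables" row (field names from the widget query, values from the test fixtures).
example_row = {
    "fullname": "hive_metastore.staff_db.employees",  # legacy table as catalog.schema.table
    "is_read": False,
    "is_write": True,
    "source": "xyz.py",                                # code path or query that references the table
    "lineage": "Workflow: my_workflow",                # first hop of the workflow -> task -> notebook... chain
}
```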
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+/*
+--title 'Used tables'
+--width 6
+--overrides '{"spec":{
+"encodings":{
+"columns": [
+{"fieldName": "fullname", "title": "table", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
+{"fieldName": "is_read", "title": "is_read", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]},
+{"fieldName": "is_write", "title": "is_write", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]},
+{"fieldName": "source", "title": "source", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ source_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]},
+{"fieldName": "timestamp", "title": "last_modified", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]},
+{"fieldName": "lineage", "title": "lineage", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ lineage_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]},
+{"fieldName": "lineage_data", "title": "lineage_data", "type": "complex", "displayAs": "json", "booleanValues": ["false", "true"]},
+{"fieldName": "assessment_start", "title": "assessment_start", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]},
+{"fieldName": "assessment_end", "title": "assessment_end", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]}
+]},
+"invisibleColumns": [
+{"fieldName": "source_link", "title": "source_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
+{"fieldName": "lineage_type", "title": "lineage_type", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
+{"fieldName": "lineage_id", "title": "lineage_id", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
+{"fieldName": "lineage_link", "title": "lineage_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}
+]
+}}'
+*/
+SELECT
+fullname,
+is_read,
+is_write,
+if( startswith(source_id, '/'), substring_index(source_id, '@databricks.com/', -1), split_part(source_id, '/', 2)) as source,
+if( startswith(source_id, '/'), concat('/#workspace/', source_id), concat('/sql/editor/', split_part(source_id, '/', 2))) as source_link,
+source_timestamp as `timestamp`,
+case
+when lineage.object_type = 'WORKFLOW' then concat('Workflow: ', lineage.other.name)
+when lineage.object_type = 'TASK' then concat('Task: ', split_part(lineage.object_id, '/', 2))
+when lineage.object_type = 'NOTEBOOK' then concat('Notebook: ', substring_index(lineage.object_id, '@databricks.com/', -1))
+when lineage.object_type = 'FILE' then concat('File: ', substring_index(lineage.object_id, '@databricks.com/', -1))
+when lineage.object_type = 'DASHBOARD' then concat('Dashboard: ', lineage.other.name)
+when lineage.object_type = 'QUERY' then concat('Query: ', lineage.other.name)
+end as lineage,
+lineage.object_type as lineage_type,
+lineage.object_id as lineage_id,
+case
+when lineage.object_type = 'WORKFLOW' then concat('/jobs/', lineage.object_id)
+when lineage.object_type = 'TASK' then concat('/jobs/', split_part(lineage.object_id, '/', 1), '/tasks/', split_part(lineage.object_id, '/', 2))
+when lineage.object_type = 'NOTEBOOK' then concat('/#workspace/', lineage.object_id)
+when lineage.object_type = 'FILE' then concat('/#workspace/', lineage.object_id)
+when lineage.object_type = 'DASHBOARD' then concat('/sql/dashboards/', lineage.object_id)
+when lineage.object_type = 'QUERY' then concat('/sql/editor/', split_part(lineage.object_id, '/', 2))
+end as lineage_link,
+lineage.other as lineage_data,
+assessment_start,
+assessment_end
+from (SELECT
+concat( catalog_name, '.', schema_name, '.', table_name) as fullname,
+is_read,
+is_write,
+source_id,
+source_timestamp,
+explode(source_lineage) as lineage,
+assessment_start_timestamp as assessment_start,
+assessment_end_timestamp as assessment_end
+FROM inventory.used_tables)
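The `source` and `source_link` expressions branch on the shape of `source_id`: workspace paths start with `/`, while legacy query ids have the form `<dashboard_id>/<query_id>`. A minimal Python sketch of the same branching, with invented inputs (the widget itself does this in SQL via `startswith`, `substring_index` and `split_part`):

```python
def widget_source_fields(source_id: str) -> tuple[str, str]:
    """Sketch of the widget's `source` / `source_link` derivation (illustrative, not the shipped SQL)."""
    if source_id.startswith("/"):
        # Workspace file: display the part after the user's e-mail prefix, link into the workspace browser.
        display = source_id.rsplit("@databricks.com/", 1)[-1]
        link = f"/#workspace/{source_id}"
    else:
        # Legacy query: display the query id (second '/'-separated segment), link to the SQL editor.
        parts = source_id.split("/")
        query_id = parts[1] if len(parts) > 1 else ""
        display, link = query_id, f"/sql/editor/{query_id}"
    return display, link


# Invented examples of the two id shapes the query distinguishes:
print(widget_source_fields("/Users/jane.doe@databricks.com/notebooks/etl.py"))
print(widget_source_fields("my_dashboard_id/my_query_id"))
```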

src/databricks/labs/ucx/source_code/queries.py

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 import dataclasses
 import logging
 from collections.abc import Iterable
-from dataclasses import asdict, dataclass, field
+from dataclasses import dataclass, field
 from datetime import datetime, timezone
 
 from databricks.sdk import WorkspaceClient
@@ -208,7 +208,7 @@ def collect_dfsas_from_query(self, dashboard_id: str, query: LegacyQuery) -> Ite
         source_timestamp = self._read_timestamp(query.updated_at)
         source_lineage = [LineageAtom(object_type="QUERY", object_id=source_id, other={"name": source_name})]
         for dfsa in collector.collect_dfsas(query.query):
-            yield DirectFsAccess(**asdict(dfsa)).replace_source(
+            yield dfsa.replace_source(
                 source_id=source_id, source_timestamp=source_timestamp, source_lineage=source_lineage
             )
 
@@ -222,7 +222,7 @@ def collect_used_tables_from_query(self, dashboard_id: str, query: LegacyQuery)
         source_timestamp = self._read_timestamp(query.updated_at)
         source_lineage = [LineageAtom(object_type="QUERY", object_id=source_id, other={"name": source_name})]
         for table in collector.collect_tables(query.query):
-            yield UsedTable(**asdict(table)).replace_source(
+            yield table.replace_source(
                 source_id=source_id, source_timestamp=source_timestamp, source_lineage=source_lineage
             )
 
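Both hunks drop the `DirectFsAccess(**asdict(dfsa))` / `UsedTable(**asdict(table))` copy and call `replace_source` on the collected record directly, which is also why the `asdict` import could be removed above. As a rough illustration of the copy-with-overrides pattern behind such a method (a sketch only, not the actual UCX classes), assuming the records are dataclasses:

```python
from dataclasses import dataclass, field, replace
from datetime import datetime


@dataclass
class SourcedRecord:
    """Hypothetical stand-in for a collected record that carries source metadata."""

    source_id: str = "UNKNOWN"
    source_timestamp: datetime | None = None
    source_lineage: list = field(default_factory=list)

    def replace_source(self, *, source_id: str, source_timestamp: datetime, source_lineage: list):
        # Return a copy with the source fields overridden; all other fields are preserved.
        return replace(
            self,
            source_id=source_id,
            source_timestamp=source_timestamp,
            source_lineage=source_lineage,
        )
```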

tests/integration/assessment/test_dashboards.py

Lines changed: 50 additions & 8 deletions
@@ -2,7 +2,7 @@
 
 import pytest
 
-from databricks.labs.ucx.source_code.base import LineageAtom
+from databricks.labs.ucx.source_code.base import LineageAtom, UsedTable
 from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess
 from databricks.labs.ucx.source_code.jobs import JobProblem
 from databricks.sdk.service.iam import PermissionLevel
@@ -63,10 +63,10 @@ def _populate_directfs_problems(installation_ctx):
             source_id="xyz.py",
             source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0),
             source_lineage=[
-                LineageAtom(object_type="WORKFLOW", object_id="my_workflow"),
-                LineageAtom(object_type="TASK", object_id="my_workflow/my_task"),
-                LineageAtom(object_type="NOTEBOOK", object_id="my_notebook"),
-                LineageAtom(object_type="FILE", object_id="my file"),
+                LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}),
+                LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"),
+                LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
+                LineageAtom(object_type="FILE", object_id="my file_path"),
             ],
             assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0),
             assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0),
@@ -81,8 +81,8 @@ def _populate_directfs_problems(installation_ctx):
             source_id="xyz.py",
             source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0),
             source_lineage=[
-                LineageAtom(object_type="DASHBOARD", object_id="my_dashboard"),
-                LineageAtom(object_type="QUERY", object_id="my_dashboard/my_query"),
+                LineageAtom(object_type="DASHBOARD", object_id="my_dashboard_id", other={"name": "my_dashboard"}),
+                LineageAtom(object_type="QUERY", object_id="my_dashboard_id/my_query_id", other={"name": "my_query"}),
             ],
             assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0),
             assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0),
@@ -91,6 +91,47 @@ def _populate_directfs_problems(installation_ctx):
     installation_ctx.directfs_access_crawler_for_queries.dump_all(dfsas)
 
 
+def _populate_used_tables(installation_ctx):
+    tables = [
+        UsedTable(
+            catalog_name="hive_metastore",
+            schema_name="staff_db",
+            table_name="employees",
+            is_read=False,
+            is_write=True,
+            source_id="xyz.py",
+            source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0),
+            source_lineage=[
+                LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}),
+                LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"),
+                LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
+                LineageAtom(object_type="FILE", object_id="my file_path"),
+            ],
+            assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0),
+            assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0),
+        )
+    ]
+    installation_ctx.used_tables_crawler_for_paths.dump_all(tables)
+    tables = [
+        UsedTable(
+            catalog_name="hive_metastore",
+            schema_name="customers_db",
+            table_name="customers",
+            is_read=False,
+            is_write=True,
+            source_id="xyz.py",
+            source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0),
+            source_lineage=[
+                LineageAtom(object_type="DASHBOARD", object_id="my_dashboard_id", other={"name": "my_dashboard"}),
+                LineageAtom(object_type="QUERY", object_id="my_dashboard_id/my_query_id", other={"name": "my_query"}),
+            ],
+            assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0),
+            assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0),
+        )
+    ]
+    installation_ctx.used_tables_crawler_for_queries.dump_all(tables)
+
+
 @pytest.mark.skip("Development tool")
 def test_dashboard_with_prepopulated_data(installation_ctx, make_cluster_policy, make_cluster_policy_permissions):
     """the purpose of this test is to prepopulate data used by the dashboard without running an actual -lengthy- assessment"""
@@ -103,10 +144,11 @@ def test_dashboard_with_prepopulated_data(installation_ctx, make_cluster_policy,
     )
     installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"]
     installation_ctx.workspace_installation.run()
+    print(f"\nInventory database is {installation_ctx.inventory_database}\n")
     # populate data
     _populate_workflow_problems(installation_ctx)
     _populate_dashboard_problems(installation_ctx)
     _populate_directfs_problems(installation_ctx)
-    print(f"\nInventory database is {installation_ctx.inventory_database}\n")
+    _populate_used_tables(installation_ctx)
     # put a breakpoint here
     print("Put a breakpoint here! Then go check the dashboard in your workspace ;-)\n")
