Skip to content

Commit abfe380

Browse files
ericvergnaud, nfx, asnare, and JCZuurmond
authored
Added task for linting queries (#2630)
## Changes Workspace SQL queries are not currently linted. This PR fixes that by introducing a QueryLinter which mimics for dashboard queries what WorkflowLinter does for jobs ### Linked issues Resolves #2350 ### Functionality - [x] modified existing workflow: `experimental-workflow-linter` - [x] added a new table `query_problems` ### Tests - [x] added unit tests - [x] added integration tests - [x] manually tested schema upgrade: ![Screenshot 2024-09-13 at 16 20 42](https://github.com/user-attachments/assets/a64bf2fa-d8c6-4346-818d-1be5526df24b) Please note that storage is simplified in PR #2662 --------- Co-authored-by: Eric Vergnaud <[email protected]> Co-authored-by: Serge Smertin <[email protected]> Co-authored-by: Andrew Snare <[email protected]> Co-authored-by: Cor <[email protected]> Co-authored-by: Serge Smertin <[email protected]>
1 parent 1d1eec9 commit abfe380

File tree

16 files changed

+368
-166
lines changed

16 files changed

+368
-166
lines changed

src/databricks/labs/ucx/contexts/application.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,11 @@ def workflow_linter(self):
431431

432432
@cached_property
433433
def query_linter(self):
434-
return QueryLinter(self.workspace_client, self.directfs_access_crawler_for_queries)
434+
return QueryLinter(
435+
self.workspace_client,
436+
TableMigrationIndex([]), # TODO: bring back self.tables_migrator.index()
437+
self.directfs_access_crawler_for_queries,
438+
)
435439

436440
@cached_property
437441
def directfs_access_crawler_for_paths(self):

src/databricks/labs/ucx/framework/crawlers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ def _snapshot(self, fetcher: ResultFn, loader: ResultFn, *, force_refresh: bool)
136136
cached_results = list(fetcher())
137137
if cached_results:
138138
return cached_results
139-
except NotFound:
140-
pass
139+
except NotFound as e:
140+
logger.debug("Inventory table not found", exc_info=e)
141141
logger.debug(f"[{self.full_name}] crawling new set of snapshot data for {self._table}")
142142
loaded_records = list(loader())
143143
self._update_snapshot(loaded_records, mode="overwrite")

src/databricks/labs/ucx/install.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,9 @@
7575
from databricks.labs.ucx.installer.workflows import WorkflowsDeployment
7676
from databricks.labs.ucx.recon.migration_recon import ReconResult
7777
from databricks.labs.ucx.runtime import Workflows
78-
from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessInPath, DirectFsAccessInQuery
78+
from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess
7979
from databricks.labs.ucx.source_code.jobs import JobProblem
80+
from databricks.labs.ucx.source_code.queries import QueryProblem
8081
from databricks.labs.ucx.workspace_access.base import Permissions
8182
from databricks.labs.ucx.workspace_access.generic import WorkspaceObjectInfo
8283
from databricks.labs.ucx.workspace_access.groups import ConfigureGroups, MigratedGroup
@@ -118,11 +119,12 @@ def deploy_schema(sql_backend: SqlBackend, inventory_schema: str):
118119
functools.partial(table, "policies", PolicyInfo),
119120
functools.partial(table, "migration_status", TableMigrationStatus),
120121
functools.partial(table, "workflow_problems", JobProblem),
122+
functools.partial(table, "query_problems", QueryProblem),
121123
functools.partial(table, "udfs", Udf),
122124
functools.partial(table, "logs", LogRecord),
123125
functools.partial(table, "recon_results", ReconResult),
124-
functools.partial(table, "directfs_in_paths", DirectFsAccessInPath),
125-
functools.partial(table, "directfs_in_queries", DirectFsAccessInQuery),
126+
functools.partial(table, "directfs_in_paths", DirectFsAccess),
127+
functools.partial(table, "directfs_in_queries", DirectFsAccess),
126128
],
127129
)
128130
deployer.deploy_view("grant_detail", "queries/views/grant_detail.sql")

src/databricks/labs/ucx/mixins/fixtures.py

Whitespace-only changes.

src/databricks/labs/ucx/queries/views/directfs.sql

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,7 @@ SELECT
66
source_timestamp,
77
source_lineage,
88
assessment_start_timestamp,
9-
assessment_end_timestamp,
10-
job_id,
11-
job_name,
12-
task_key
9+
assessment_end_timestamp
1310
FROM $inventory.directfs_in_paths
1411
UNION ALL
1512
SELECT
@@ -20,8 +17,5 @@ SELECT
2017
source_timestamp,
2118
source_lineage,
2219
assessment_start_timestamp,
23-
assessment_end_timestamp,
24-
NULL as job_id,
25-
NULL as job_name,
26-
null as task_key
20+
assessment_end_timestamp
2721
FROM $inventory.directfs_in_queries

src/databricks/labs/ucx/source_code/directfs_access.py

Lines changed: 14 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from collections.abc import Sequence, Iterable
77
from dataclasses import dataclass, field
88
from datetime import datetime
9-
from typing import Any, TypeVar
9+
from typing import Any
1010

1111
from databricks.labs.ucx.framework.crawlers import CrawlerBase
1212
from databricks.labs.lsql.backends import SqlBackend
@@ -39,7 +39,7 @@ class DirectFsAccess:
3939
def from_dict(cls, data: dict[str, Any]) -> Self:
4040
source_lineage = data.get("source_lineage", None)
4141
if isinstance(source_lineage, list) and len(source_lineage) > 0 and isinstance(source_lineage[0], dict):
42-
lineage_atoms = [LineageAtom(*lineage) for lineage in source_lineage]
42+
lineage_atoms = [LineageAtom(**lineage) for lineage in source_lineage]
4343
data["source_lineage"] = lineage_atoms
4444
return cls(**data)
4545

@@ -77,63 +77,27 @@ def replace_assessment_infos(
7777
)
7878

7979

80-
@dataclass
81-
class DirectFsAccessInQuery(DirectFsAccess):
82-
83-
pass
84-
85-
86-
@dataclass
87-
class DirectFsAccessInPath(DirectFsAccess):
88-
job_id: int = -1
89-
job_name: str = DirectFsAccess.UNKNOWN
90-
task_key: str = DirectFsAccess.UNKNOWN
91-
92-
def replace_job_infos(
93-
self,
94-
job_id: int | None = None,
95-
job_name: str | None = None,
96-
task_key: str | None = None,
97-
):
98-
return dataclasses.replace(
99-
self, job_id=job_id or self.job_id, job_name=job_name or self.job_name, task_key=task_key or self.task_key
100-
)
101-
102-
103-
T = TypeVar("T", bound=DirectFsAccess)
104-
105-
106-
class DirectFsAccessCrawler(CrawlerBase[T]):
80+
class DirectFsAccessCrawler(CrawlerBase[DirectFsAccess]):
10781

10882
@classmethod
10983
def for_paths(cls, backend: SqlBackend, schema) -> DirectFsAccessCrawler:
110-
return DirectFsAccessCrawler[DirectFsAccessInPath](
111-
backend,
112-
schema,
113-
"directfs_in_paths",
114-
DirectFsAccessInPath,
115-
)
84+
return DirectFsAccessCrawler(backend, schema, "directfs_in_paths")
11685

11786
@classmethod
11887
def for_queries(cls, backend: SqlBackend, schema) -> DirectFsAccessCrawler:
119-
return DirectFsAccessCrawler[DirectFsAccessInQuery](
120-
backend,
121-
schema,
122-
"directfs_in_queries",
123-
DirectFsAccessInQuery,
124-
)
88+
return DirectFsAccessCrawler(backend, schema, "directfs_in_queries")
12589

126-
def __init__(self, backend: SqlBackend, schema: str, table: str, klass: type[T]):
90+
def __init__(self, backend: SqlBackend, schema: str, table: str):
12791
"""
12892
Initializes a DFSACrawler instance.
12993
13094
Args:
13195
sql_backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark)
13296
schema: The schema name for the inventory persistence.
13397
"""
134-
super().__init__(backend, "hive_metastore", schema, table, klass)
98+
super().__init__(backend=backend, catalog="hive_metastore", schema=schema, table=table, klass=DirectFsAccess)
13599

136-
def dump_all(self, dfsas: Sequence[T]):
100+
def dump_all(self, dfsas: Sequence[DirectFsAccess]):
137101
"""This crawler doesn't follow the pull model because the fetcher fetches data for 2 crawlers, not just one
138102
It's not **bad** because all records are pushed at once.
139103
Providing a multi-entity crawler is out-of-scope of this PR
@@ -144,9 +108,11 @@ def dump_all(self, dfsas: Sequence[T]):
144108
except DatabricksError as e:
145109
logger.error("Failed to store DFSAs", exc_info=e)
146110

147-
def _try_fetch(self) -> Iterable[T]:
111+
def _try_fetch(self) -> Iterable[DirectFsAccess]:
148112
sql = f"SELECT * FROM {escape_sql_identifier(self.full_name)}"
149-
yield from self._backend.fetch(sql)
113+
for row in self._backend.fetch(sql):
114+
yield self._klass.from_dict(row.as_dict())
150115

151-
def _crawl(self) -> Iterable[T]:
152-
raise NotImplementedError()
116+
def _crawl(self) -> Iterable[DirectFsAccess]:
117+
return []
118+
# TODO raise NotImplementedError() once CrawlerBase supports empty snapshots

src/databricks/labs/ucx/source_code/graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ def __repr__(self):
337337

338338
@property
339339
def lineage(self) -> list[LineageAtom]:
340-
return [LineageAtom("path", str(self.path))]
340+
return [LineageAtom(object_type="PATH", object_id=str(self.path))]
341341

342342

343343
class SourceContainer(abc.ABC):

src/databricks/labs/ucx/source_code/jobs.py

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1+
import dataclasses
12
import functools
23
import logging
34
import shutil
45
import tempfile
56
from collections.abc import Generator, Iterable
67
from contextlib import contextmanager
7-
from dataclasses import dataclass, asdict
8+
from dataclasses import dataclass
89
from datetime import datetime, timezone
910
from importlib import metadata
1011
from pathlib import Path
@@ -28,10 +29,9 @@
2829
guess_encoding,
2930
)
3031
from databricks.labs.ucx.source_code.directfs_access import (
31-
DirectFsAccess,
3232
LineageAtom,
3333
DirectFsAccessCrawler,
34-
DirectFsAccessInPath,
34+
DirectFsAccess,
3535
)
3636
from databricks.labs.ucx.source_code.graph import (
3737
Dependency,
@@ -86,8 +86,8 @@ def __repr__(self):
8686
@property
8787
def lineage(self) -> list[LineageAtom]:
8888
job_name = (None if self._job.settings is None else self._job.settings.name) or "unknown job"
89-
job_lineage = LineageAtom("job", str(self._job.job_id), {"name": job_name})
90-
task_lineage = LineageAtom("task", self._task.task_key)
89+
job_lineage = LineageAtom("JOB", str(self._job.job_id), {"name": job_name})
90+
task_lineage = LineageAtom("TASK", self._task.task_key)
9191
return [job_lineage, task_lineage]
9292

9393

@@ -363,7 +363,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str):
363363
logger.info(f"Running {tasks} linting tasks in parallel...")
364364
job_results, errors = Threads.gather('linting workflows', tasks)
365365
job_problems: list[JobProblem] = []
366-
job_dfsas: list[DirectFsAccessInPath] = []
366+
job_dfsas: list[DirectFsAccess] = []
367367
for problems, dfsas in job_results:
368368
job_problems.extend(problems)
369369
job_dfsas.extend(dfsas)
@@ -378,7 +378,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str):
378378
if len(errors) > 0:
379379
raise ManyError(errors)
380380

381-
def lint_job(self, job_id: int) -> tuple[list[JobProblem], list[DirectFsAccessInPath]]:
381+
def lint_job(self, job_id: int) -> tuple[list[JobProblem], list[DirectFsAccess]]:
382382
try:
383383
job = self._ws.jobs.get(job_id)
384384
except NotFound:
@@ -393,9 +393,9 @@ def lint_job(self, job_id: int) -> tuple[list[JobProblem], list[DirectFsAccessIn
393393

394394
_UNKNOWN = Path('<UNKNOWN>')
395395

396-
def _lint_job(self, job: jobs.Job) -> tuple[list[JobProblem], list[DirectFsAccessInPath]]:
396+
def _lint_job(self, job: jobs.Job) -> tuple[list[JobProblem], list[DirectFsAccess]]:
397397
problems: list[JobProblem] = []
398-
dfsas: list[DirectFsAccessInPath] = []
398+
dfsas: list[DirectFsAccess] = []
399399
assert job.job_id is not None
400400
assert job.settings is not None
401401
assert job.settings.name is not None
@@ -420,9 +420,9 @@ def _lint_job(self, job: jobs.Job) -> tuple[list[JobProblem], list[DirectFsAcces
420420
end_col=advice.advice.end_col,
421421
)
422422
problems.append(job_problem)
423-
assessment_start = datetime.now()
424-
task_dfsas = self._collect_task_dfsas(task, job, graph, session_state)
425-
assessment_end = datetime.now()
423+
assessment_start = datetime.now(timezone.utc)
424+
task_dfsas = self._collect_task_dfsas(job, task, graph, session_state)
425+
assessment_end = datetime.now(timezone.utc)
426426
for dfsa in task_dfsas:
427427
dfsa = dfsa.replace_assessment_infos(assessment_start=assessment_start, assessment_end=assessment_end)
428428
dfsas.append(dfsa)
@@ -462,15 +462,17 @@ def _lint_task(
462462
yield from walker
463463

464464
def _collect_task_dfsas(
465-
self, task: jobs.Task, job: jobs.Job, graph: DependencyGraph, session_state: CurrentSessionState
466-
) -> Iterable[DirectFsAccessInPath]:
467-
collector = DfsaCollectorWalker(graph, set(), self._path_lookup, session_state)
468-
assert job.settings is not None # as already done in _lint_job
469-
job_name = job.settings.name
470-
for dfsa in collector:
471-
yield DirectFsAccessInPath(**asdict(dfsa)).replace_job_infos(
472-
job_id=job.job_id, job_name=job_name, task_key=task.task_key
473-
)
465+
self, job: jobs.Job, task: jobs.Task, graph: DependencyGraph, session_state: CurrentSessionState
466+
) -> Iterable[DirectFsAccess]:
467+
# walker doesn't register lineage for job/task
468+
job_id = str(job.job_id)
469+
job_name = job.settings.name if job.settings and job.settings.name else "<anonymous>"
470+
for dfsa in DfsaCollectorWalker(graph, set(), self._path_lookup, session_state):
471+
atoms = [
472+
LineageAtom(object_type="JOB", object_id=job_id, other={"name": job_name}),
473+
LineageAtom(object_type="TASK", object_id=task.task_key),
474+
]
475+
yield dataclasses.replace(dfsa, source_lineage=atoms + dfsa.source_lineage)
474476

475477

476478
class LintingWalker(DependencyGraphWalker[LocatedAdvice]):

src/databricks/labs/ucx/source_code/known.json

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1823,6 +1823,89 @@
18231823
"databricks-labs-pylint": {
18241824
"databricks.labs.pylint": []
18251825
},
1826+
"databricks-labs-pytester": {
1827+
"databricks": [],
1828+
"databricks.labs": [],
1829+
"databricks.labs.pytester": [],
1830+
"databricks.labs.pytester.fixtures": [],
1831+
"databricks.labs.pytester.fixtures.baseline": [
1832+
{
1833+
"code": "direct-filesystem-access",
1834+
"message": "The use of direct filesystem references is deprecated: /"
1835+
}
1836+
],
1837+
"databricks.labs.pytester.fixtures.catalog": [
1838+
{
1839+
"code": "direct-filesystem-access",
1840+
"message": "The use of direct filesystem references is deprecated: /"
1841+
},
1842+
{
1843+
"code": "direct-filesystem-access",
1844+
"message": "The use of direct filesystem references is deprecated: dbfs:/tmp/ucx_test_"
1845+
},
1846+
{
1847+
"code": "direct-filesystem-access",
1848+
"message": "The use of direct filesystem references is deprecated: dbfs:/user/hive/warehouse/"
1849+
},
1850+
{
1851+
"code": "direct-filesystem-access",
1852+
"message": "The use of direct filesystem references is deprecated: s3a://databricks-datasets-oregon/delta-sharing/share/open-datasets.share"
1853+
}
1854+
],
1855+
"databricks.labs.pytester.fixtures.compute": [],
1856+
"databricks.labs.pytester.fixtures.connect": [],
1857+
"databricks.labs.pytester.fixtures.environment": [],
1858+
"databricks.labs.pytester.fixtures.iam": [
1859+
{
1860+
"code": "direct-filesystem-access",
1861+
"message": "The use of direct filesystem references is deprecated: /members"
1862+
},
1863+
{
1864+
"code": "direct-filesystem-access",
1865+
"message": "The use of direct filesystem references is deprecated: /users/groups/"
1866+
}
1867+
],
1868+
"databricks.labs.pytester.fixtures.ml": [
1869+
{
1870+
"code": "direct-filesystem-access",
1871+
"message": "The use of direct filesystem references is deprecated: /"
1872+
},
1873+
{
1874+
"code": "direct-filesystem-access",
1875+
"message": "The use of direct filesystem references is deprecated: /api/2.0/feature-store/feature-tables/create"
1876+
},
1877+
{
1878+
"code": "direct-filesystem-access",
1879+
"message": "The use of direct filesystem references is deprecated: /api/2.0/feature-store/feature-tables/delete"
1880+
}
1881+
],
1882+
"databricks.labs.pytester.fixtures.notebooks": [
1883+
{
1884+
"code": "direct-filesystem-access",
1885+
"message": "The use of direct filesystem references is deprecated: /Repos/"
1886+
},
1887+
{
1888+
"code": "direct-filesystem-access",
1889+
"message": "The use of direct filesystem references is deprecated: /Users/"
1890+
},
1891+
{
1892+
"code": "direct-filesystem-access",
1893+
"message": "The use of direct filesystem references is deprecated: /dummy-"
1894+
},
1895+
{
1896+
"code": "direct-filesystem-access",
1897+
"message": "The use of direct filesystem references is deprecated: /sdk-"
1898+
}
1899+
],
1900+
"databricks.labs.pytester.fixtures.permissions": [],
1901+
"databricks.labs.pytester.fixtures.plugin": [],
1902+
"databricks.labs.pytester.fixtures.redash": [],
1903+
"databricks.labs.pytester.fixtures.secrets": [],
1904+
"databricks.labs.pytester.fixtures.sql": [],
1905+
"databricks.labs.pytester.fixtures.unwrap": [],
1906+
"databricks.labs.pytester.fixtures.watchdog": [],
1907+
"tests.resources.hatchling-whl.src.hatchling_whl": []
1908+
},
18261909
"databricks-labs-ucx": {
18271910
"databricks.labs.ucx": []
18281911
},

0 commit comments

Comments (0)