Skip to content

Commit e87ca95

Browse files
authored
Refactor linters (#2506)
## Changes Naming of current linters is inconsistent, making the existing code very hard to read. Also, SQL linters do not derive from a specialized base linter, leading to code duplication and suboptimal performance This PR: - introduces specialized SqlLinter and SqlSequentialLinter - renames existing linters such that their names are self-explanatory ### Linked issues None ### Functionality None ### Tests - [x] ran existing unit tests --------- Co-authored-by: Eric Vergnaud <[email protected]>
1 parent 3226727 commit e87ca95

File tree

19 files changed

+162
-160
lines changed

19 files changed

+162
-160
lines changed

src/databricks/labs/ucx/account/aggregate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from databricks.labs.ucx.hive_metastore.locations import LocationTrie
1717
from databricks.labs.ucx.hive_metastore.tables import Table
1818
from databricks.labs.ucx.source_code.base import CurrentSessionState
19-
from databricks.labs.ucx.source_code.queries import FromTable
19+
from databricks.labs.ucx.source_code.queries import FromTableSqlLinter
2020

2121
logger = logging.getLogger(__name__)
2222
logger.setLevel(logging.INFO)
@@ -81,7 +81,7 @@ def _federated_ucx_query(self, query: str, table_name='objects') -> Iterable[tup
8181
)
8282
]
8383
)
84-
from_table = FromTable(stub_index, CurrentSessionState(schema=inventory_database))
84+
from_table = FromTableSqlLinter(stub_index, CurrentSessionState(schema=inventory_database))
8585
logger.info(f"Querying Schema {inventory_database}")
8686

8787
workspace_specific_query = from_table.apply(query)

src/databricks/labs/ucx/hive_metastore/view_migrate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex, TableView
1010
from databricks.labs.ucx.hive_metastore.mapping import TableToMigrate
1111
from databricks.labs.ucx.source_code.base import CurrentSessionState
12-
from databricks.labs.ucx.source_code.queries import FromTable
12+
from databricks.labs.ucx.source_code.queries import FromTableSqlLinter
1313

1414
logger = logging.getLogger(__name__)
1515

@@ -53,7 +53,7 @@ def _read_aliases(self, statement: expressions.Expression):
5353
return aliases
5454

5555
def sql_migrate_view(self, index: MigrationIndex) -> str:
56-
from_table = FromTable(index, CurrentSessionState(self.src.database))
56+
from_table = FromTableSqlLinter(index, CurrentSessionState(self.src.database))
5757
assert self.src.view_text is not None, 'Expected a view text'
5858
migrated_select = from_table.apply(self.src.view_text)
5959
statements = sqlglot.parse(migrated_select, read='databricks')

src/databricks/labs/ucx/source_code/base.py

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@
99
from pathlib import Path
1010

1111
from astroid import AstroidSyntaxError, NodeNG # type: ignore
12-
from databricks.labs.blueprint.paths import WorkspacePath
12+
from sqlglot import Expression, parse as parse_sql, ParseError as SqlParseError
1313

1414
from databricks.sdk.service import compute
1515
from databricks.sdk.service.workspace import Language
1616

17+
from databricks.labs.blueprint.paths import WorkspacePath
1718
from databricks.labs.ucx.source_code.linters.python_ast import Tree
1819

1920
# Code mapping between LSP, PyLint, and our own diagnostics:
@@ -136,6 +137,35 @@ class Linter:
136137
def lint(self, code: str) -> Iterable[Advice]: ...
137138

138139

140+
class SqlLinter(Linter):
141+
142+
def lint(self, code: str) -> Iterable[Advice]:
143+
try:
144+
expressions = parse_sql(code, read='databricks')
145+
for expression in expressions:
146+
if not expression:
147+
continue
148+
yield from self.lint_expression(expression)
149+
except SqlParseError as e:
150+
logger.debug(f"Failed to parse SQL: {code}", exc_info=e)
151+
yield self.sql_parse_failure(code)
152+
153+
@staticmethod
154+
def sql_parse_failure(code: str):
155+
return Failure(
156+
code='sql-parse-error',
157+
message=f"SQL expression is not supported yet: {code}",
158+
# SQLGlot does not propagate tokens yet. See https://github.com/tobymao/sqlglot/issues/3159
159+
start_line=0,
160+
start_col=0,
161+
end_line=0,
162+
end_col=1024,
163+
)
164+
165+
@abstractmethod
166+
def lint_expression(self, expression: Expression) -> Iterable[Advice]: ...
167+
168+
139169
class PythonLinter(Linter):
140170

141171
def lint(self, code: str) -> Iterable[Advice]:
@@ -201,13 +231,14 @@ def parse_security_mode(mode_str: str | None) -> compute.DataSecurityMode | None
201231
return None
202232

203233

204-
class SequentialLinter(Linter):
205-
def __init__(self, linters: list[Linter]):
234+
class SqlSequentialLinter(SqlLinter):
235+
236+
def __init__(self, linters: list[SqlLinter]):
206237
self._linters = linters
207238

208-
def lint(self, code: str) -> Iterable[Advice]:
239+
def lint_expression(self, expression: Expression) -> Iterable[Advice]:
209240
for linter in self._linters:
210-
yield from linter.lint(code)
241+
yield from linter.lint_expression(expression)
211242

212243

213244
class PythonSequentialLinter(Linter):

src/databricks/labs/ucx/source_code/linters/context.py

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,19 @@
66
from databricks.labs.ucx.source_code.base import (
77
Fixer,
88
Linter,
9-
SequentialLinter,
9+
SqlSequentialLinter,
1010
CurrentSessionState,
1111
PythonSequentialLinter,
1212
PythonLinter,
13+
SqlLinter,
1314
)
14-
from databricks.labs.ucx.source_code.linters.dbfs import FromDbfsFolder, DBFSUsageLinter
15-
from databricks.labs.ucx.source_code.linters.imports import DbutilsLinter
15+
from databricks.labs.ucx.source_code.linters.dbfs import DbfsUsageSqlLinter, DBFSUsagePyLinter
16+
from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter
1617

17-
from databricks.labs.ucx.source_code.linters.pyspark import SparkSql
18-
from databricks.labs.ucx.source_code.linters.spark_connect import SparkConnectLinter
19-
from databricks.labs.ucx.source_code.linters.table_creation import DBRv8d0Linter
20-
from databricks.labs.ucx.source_code.queries import FromTable
18+
from databricks.labs.ucx.source_code.linters.pyspark import SparkSqlPyLinter
19+
from databricks.labs.ucx.source_code.linters.spark_connect import SparkConnectPyLinter
20+
from databricks.labs.ucx.source_code.linters.table_creation import DBRv8d0PyLinter
21+
from databricks.labs.ucx.source_code.queries import FromTableSqlLinter
2122

2223

2324
class LinterContext:
@@ -28,26 +29,25 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe
2829
python_linters: list[PythonLinter] = []
2930
python_fixers: list[Fixer] = []
3031

31-
sql_linters: list[Linter] = []
32+
sql_linters: list[SqlLinter] = []
3233
sql_fixers: list[Fixer] = []
3334

3435
if index is not None:
35-
from_table = FromTable(index, session_state=session_state)
36-
python_linters.append(SparkSql(from_table, index, session_state))
37-
python_fixers.append(SparkSql(from_table, index, session_state))
38-
36+
from_table = FromTableSqlLinter(index, session_state=session_state)
3937
sql_linters.append(from_table)
4038
sql_fixers.append(from_table)
39+
python_linters.append(SparkSqlPyLinter(from_table, index, session_state))
40+
python_fixers.append(SparkSqlPyLinter(from_table, index, session_state))
4141

4242
python_linters += [
43-
DBFSUsageLinter(session_state),
44-
DBRv8d0Linter(dbr_version=session_state.dbr_version),
45-
SparkConnectLinter(session_state),
46-
DbutilsLinter(session_state),
43+
DBFSUsagePyLinter(session_state),
44+
DBRv8d0PyLinter(dbr_version=session_state.dbr_version),
45+
SparkConnectPyLinter(session_state),
46+
DbutilsPyLinter(session_state),
4747
]
48-
sql_linters.append(FromDbfsFolder())
48+
sql_linters.append(DbfsUsageSqlLinter())
4949

50-
self._linters: dict[Language, list[Linter] | list[PythonLinter]] = {
50+
self._linters: dict[Language, list[SqlLinter] | list[PythonLinter]] = {
5151
Language.PYTHON: python_linters,
5252
Language.SQL: sql_linters,
5353
}
@@ -64,7 +64,9 @@ def linter(self, language: Language) -> Linter:
6464
raise ValueError(f"Unsupported language: {language}")
6565
if language is Language.PYTHON:
6666
return PythonSequentialLinter(cast(list[PythonLinter], self._linters[language]))
67-
return SequentialLinter(cast(list[Linter], self._linters[language]))
67+
if language is Language.SQL:
68+
return SqlSequentialLinter(cast(list[SqlLinter], self._linters[language]))
69+
raise ValueError(f"Unsupported language: {language}")
6870

6971
def fixer(self, language: Language, diagnostic_code: str) -> Fixer | None:
7072
if language not in self._fixers:

src/databricks/labs/ucx/source_code/linters/dbfs.py

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,18 @@
22
from collections.abc import Iterable
33

44
from astroid import Call, Const, InferenceError, NodeNG # type: ignore
5-
from sqlglot import Expression, parse as parse_sql, ParseError as SqlParseError
5+
from sqlglot import Expression
66
from sqlglot.expressions import Table
77

8-
from databricks.labs.ucx.source_code.base import Advice, Linter, Deprecation, CurrentSessionState, PythonLinter
8+
from databricks.labs.ucx.source_code.base import (
9+
Advice,
10+
Deprecation,
11+
CurrentSessionState,
12+
PythonLinter,
13+
SqlLinter,
14+
)
915
from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor
1016
from databricks.labs.ucx.source_code.linters.python_infer import InferredValue
11-
from databricks.labs.ucx.source_code.queries import FromTable
1217

1318
logger = logging.getLogger(__name__)
1419

@@ -68,7 +73,7 @@ def get_advices(self) -> Iterable[Advice]:
6873
yield from self._advices
6974

7075

71-
class DBFSUsageLinter(PythonLinter):
76+
class DBFSUsagePyLinter(PythonLinter):
7277

7378
def __init__(self, session_state: CurrentSessionState):
7479
self._session_state = session_state
@@ -89,27 +94,16 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
8994
yield from visitor.get_advices()
9095

9196

92-
class FromDbfsFolder(Linter):
97+
class DbfsUsageSqlLinter(SqlLinter):
9398
def __init__(self):
9499
self._dbfs_prefixes = ["/dbfs/mnt", "dbfs:/", "/mnt/", "/dbfs/", "/"]
95100

96101
@staticmethod
97102
def name() -> str:
98103
return 'dbfs-query'
99104

100-
def lint(self, code: str) -> Iterable[Advice]:
101-
try:
102-
queries = parse_sql(code, read='databricks')
103-
for query in queries:
104-
if not query:
105-
continue
106-
yield from self._lint_query(query)
107-
except SqlParseError as e:
108-
logger.debug(f"Failed to parse SQL: {code}", exc_info=e)
109-
yield FromTable.sql_parse_failure(code)
110-
111-
def _lint_query(self, query: Expression):
112-
for table in query.find_all(Table):
105+
def lint_expression(self, expression: Expression):
106+
for table in expression.find_all(Table):
113107
# Check table names for deprecated DBFS table names
114108
yield from self._check_dbfs_folder(table)
115109

src/databricks/labs/ucx/source_code/linters/imports.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def get_notebook_paths(self, session_state: CurrentSessionState) -> tuple[bool,
9191
for path in paths:
9292
dbutils.notebook.run(path)
9393
"""
94-
arg = DbutilsLinter.get_dbutils_notebook_run_path_arg(self.node)
94+
arg = DbutilsPyLinter.get_dbutils_notebook_run_path_arg(self.node)
9595
try:
9696
all_inferred = InferredValue.infer_from_node(arg, session_state)
9797
return self._get_notebook_paths(all_inferred)
@@ -113,7 +113,7 @@ def _get_notebook_paths(cls, all_inferred: Iterable[InferredValue]) -> tuple[boo
113113
return has_unresolved, paths
114114

115115

116-
class DbutilsLinter(PythonLinter):
116+
class DbutilsPyLinter(PythonLinter):
117117

118118
def __init__(self, session_state: CurrentSessionState):
119119
self._session_state = session_state

src/databricks/labs/ucx/source_code/linters/pyspark.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
PythonLinter,
1414
)
1515
from databricks.labs.ucx.source_code.linters.python_infer import InferredValue
16-
from databricks.labs.ucx.source_code.queries import FromTable
16+
from databricks.labs.ucx.source_code.queries import FromTableSqlLinter
1717
from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper
1818

1919

@@ -37,12 +37,12 @@ def matches(self, node: NodeNG):
3737

3838
@abstractmethod
3939
def lint(
40-
self, from_table: FromTable, index: MigrationIndex, session_state: CurrentSessionState, node: Call
40+
self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state: CurrentSessionState, node: Call
4141
) -> Iterator[Advice]:
4242
"""raises Advices by linting the code"""
4343

4444
@abstractmethod
45-
def apply(self, from_table: FromTable, index: MigrationIndex, node: Call) -> None:
45+
def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Call) -> None:
4646
"""applies recommendations"""
4747

4848
def _get_table_arg(self, node: Call):
@@ -78,7 +78,7 @@ def _check_call_context(self, node: Call) -> bool:
7878
class QueryMatcher(Matcher):
7979

8080
def lint(
81-
self, from_table: FromTable, index: MigrationIndex, session_state: CurrentSessionState, node: Call
81+
self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state: CurrentSessionState, node: Call
8282
) -> Iterator[Advice]:
8383
table_arg = self._get_table_arg(node)
8484
if table_arg:
@@ -93,7 +93,7 @@ def lint(
9393
)
9494

9595
@classmethod
96-
def _lint_table_arg(cls, from_table: FromTable, call_node: NodeNG, inferred: InferredValue):
96+
def _lint_table_arg(cls, from_table: FromTableSqlLinter, call_node: NodeNG, inferred: InferredValue):
9797
if inferred.is_inferred():
9898
for advice in from_table.lint(inferred.as_string()):
9999
yield advice.replace_from_node(call_node)
@@ -104,7 +104,7 @@ def _lint_table_arg(cls, from_table: FromTable, call_node: NodeNG, inferred: Inf
104104
node=call_node,
105105
)
106106

107-
def apply(self, from_table: FromTable, index: MigrationIndex, node: Call) -> None:
107+
def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Call) -> None:
108108
table_arg = self._get_table_arg(node)
109109
assert isinstance(table_arg, Const)
110110
new_query = from_table.apply(table_arg.value)
@@ -115,7 +115,7 @@ def apply(self, from_table: FromTable, index: MigrationIndex, node: Call) -> Non
115115
class TableNameMatcher(Matcher):
116116

117117
def lint(
118-
self, from_table: FromTable, index: MigrationIndex, session_state: CurrentSessionState, node: Call
118+
self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state: CurrentSessionState, node: Call
119119
) -> Iterator[Advice]:
120120
table_arg = self._get_table_arg(node)
121121
table_name = table_arg.as_string().strip("'").strip('"')
@@ -137,7 +137,7 @@ def lint(
137137
node=node,
138138
)
139139

140-
def apply(self, from_table: FromTable, index: MigrationIndex, node: Call) -> None:
140+
def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Call) -> None:
141141
table_arg = self._get_table_arg(node)
142142
assert isinstance(table_arg, Const)
143143
dst = self._find_dest(index, table_arg.value, from_table.schema)
@@ -162,7 +162,7 @@ def matches(self, node: NodeNG):
162162
)
163163

164164
def lint(
165-
self, from_table: FromTable, index: MigrationIndex, session_state: CurrentSessionState, node: Call
165+
self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state: CurrentSessionState, node: Call
166166
) -> Iterator[Advice]:
167167
assert isinstance(node.func, Attribute) # always true, avoids a pylint warning
168168
yield Advisory.from_node(
@@ -171,7 +171,7 @@ def lint(
171171
node=node,
172172
)
173173

174-
def apply(self, from_table: FromTable, index: MigrationIndex, node: Call) -> None:
174+
def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Call) -> None:
175175
# No transformations to apply
176176
return
177177

@@ -200,7 +200,7 @@ def matches(self, node: NodeNG):
200200
)
201201

202202
def lint(
203-
self, from_table: FromTable, index: MigrationIndex, session_state: CurrentSessionState, node: NodeNG
203+
self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state: CurrentSessionState, node: NodeNG
204204
) -> Iterator[Advice]:
205205
table_arg = self._get_table_arg(node)
206206
if not isinstance(table_arg, Const):
@@ -223,7 +223,7 @@ def lint(
223223
node=node,
224224
)
225225

226-
def apply(self, from_table: FromTable, index: MigrationIndex, node: Call) -> None:
226+
def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Call) -> None:
227227
# No transformations to apply
228228
return
229229

@@ -327,11 +327,11 @@ def matchers(self):
327327
return self._matchers
328328

329329

330-
class SparkSql(PythonLinter, Fixer):
330+
class SparkSqlPyLinter(PythonLinter, Fixer):
331331

332332
_spark_matchers = SparkMatchers()
333333

334-
def __init__(self, from_table: FromTable, index: MigrationIndex, session_state):
334+
def __init__(self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state):
335335
self._from_table = from_table
336336
self._index = index
337337
self._session_state = session_state

src/databricks/labs/ucx/source_code/linters/spark_connect.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def lint(self, node: NodeNG) -> Iterator[Advice]:
242242
)
243243

244244

245-
class SparkConnectLinter(PythonLinter):
245+
class SparkConnectPyLinter(PythonLinter):
246246
def __init__(self, session_state: CurrentSessionState):
247247
if session_state.data_security_mode != DataSecurityMode.USER_ISOLATION:
248248
self._matchers = []

src/databricks/labs/ucx/source_code/linters/table_creation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def lint(self, node: NodeNG) -> Iterator[Advice]:
9696
)
9797

9898

99-
class DBRv8d0Linter(PythonLinter):
99+
class DBRv8d0PyLinter(PythonLinter):
100100
"""Performs Python linting for backwards incompatible changes in DBR version 8.0.
101101
Specifically, it yields advice for table-creation with implicit format.
102102
"""

0 commit comments

Comments
 (0)