Skip to content

Commit 8f6400f

Browse files
authored
Fixed SystemError: AST constructor recursion depth mismatch failing the entire job (#3000)
This PR adds more deterministic, Go-style error handling for parsing Python code. Fixes #2976.
1 parent 519df78 commit 8f6400f

File tree

21 files changed

+510
-267
lines changed

21 files changed

+510
-267
lines changed

src/databricks/labs/ucx/source_code/base.py

Lines changed: 5 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,15 @@
1212
from pathlib import Path
1313
from typing import Any, BinaryIO, TextIO
1414

15-
from astroid import AstroidSyntaxError, NodeNG # type: ignore
16-
from sqlglot import Expression, parse as parse_sql, ParseError as SqlParseError
15+
from astroid import NodeNG # type: ignore
16+
from sqlglot import Expression, parse as parse_sql
17+
from sqlglot.errors import SqlglotError
1718

1819
from databricks.sdk.service import compute
1920
from databricks.sdk.service.workspace import Language
2021

2122
from databricks.labs.blueprint.paths import WorkspacePath
2223

23-
from databricks.labs.ucx.source_code.python.python_ast import Tree
2424

2525
if sys.version_info >= (3, 11):
2626
from typing import Self
@@ -137,12 +137,13 @@ class SqlLinter(Linter):
137137

138138
def lint(self, code: str) -> Iterable[Advice]:
139139
try:
140+
# TODO: unify with SqlParser.walk_expressions(...)
140141
expressions = parse_sql(code, read='databricks')
141142
for expression in expressions:
142143
if not expression:
143144
continue
144145
yield from self.lint_expression(expression)
145-
except SqlParseError as e:
146+
except SqlglotError as e:
146147
logger.debug(f"Failed to parse SQL: {code}", exc_info=e)
147148
yield self.sql_parse_failure(code)
148149

@@ -162,16 +163,6 @@ def sql_parse_failure(code: str) -> Failure:
162163
def lint_expression(self, expression: Expression) -> Iterable[Advice]: ...
163164

164165

165-
class PythonLinter(Linter):
166-
167-
def lint(self, code: str) -> Iterable[Advice]:
168-
tree = Tree.normalize_and_parse(code)
169-
yield from self.lint_tree(tree)
170-
171-
@abstractmethod
172-
def lint_tree(self, tree: Tree) -> Iterable[Advice]: ...
173-
174-
175166
class Fixer(ABC):
176167

177168
@property
@@ -271,20 +262,6 @@ class UsedTableNode:
271262
node: NodeNG
272263

273264

274-
class TablePyCollector(TableCollector, ABC):
275-
276-
def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
277-
try:
278-
tree = Tree.normalize_and_parse(source_code)
279-
for table_node in self.collect_tables_from_tree(tree):
280-
yield table_node.table
281-
except AstroidSyntaxError as e:
282-
logger.warning('syntax-error', exc_info=e)
283-
284-
@abstractmethod
285-
def collect_tables_from_tree(self, tree: Tree) -> Iterable[UsedTableNode]: ...
286-
287-
288265
class TableSqlCollector(TableCollector, ABC): ...
289266

290267

@@ -309,17 +286,6 @@ class DfsaCollector(ABC):
309286
def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]: ...
310287

311288

312-
class DfsaPyCollector(DfsaCollector, ABC):
313-
314-
def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:
315-
tree = Tree.normalize_and_parse(source_code)
316-
for dfsa_node in self.collect_dfsas_from_tree(tree):
317-
yield dfsa_node.dfsa
318-
319-
@abstractmethod
320-
def collect_dfsas_from_tree(self, tree: Tree) -> Iterable[DirectFsAccessNode]: ...
321-
322-
323289
class DfsaSqlCollector(DfsaCollector, ABC): ...
324290

325291

@@ -395,83 +361,6 @@ def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
395361
yield from collector.collect_tables(source_code)
396362

397363

398-
class PythonSequentialLinter(Linter, DfsaCollector, TableCollector):
399-
400-
def __init__(
401-
self,
402-
linters: list[PythonLinter],
403-
dfsa_collectors: list[DfsaPyCollector],
404-
table_collectors: list[TablePyCollector],
405-
):
406-
self._linters = linters
407-
self._dfsa_collectors = dfsa_collectors
408-
self._table_collectors = table_collectors
409-
self._tree: Tree | None = None
410-
411-
def lint(self, code: str) -> Iterable[Advice]:
412-
try:
413-
tree = self._parse_and_append(code)
414-
yield from self.lint_tree(tree)
415-
except AstroidSyntaxError as e:
416-
yield Failure('syntax-error', str(e), 0, 0, 0, 0)
417-
418-
def lint_tree(self, tree: Tree) -> Iterable[Advice]:
419-
for linter in self._linters:
420-
yield from linter.lint_tree(tree)
421-
422-
def _parse_and_append(self, code: str) -> Tree:
423-
tree = Tree.normalize_and_parse(code)
424-
self.append_tree(tree)
425-
return tree
426-
427-
def append_tree(self, tree: Tree) -> None:
428-
self._make_tree().append_tree(tree)
429-
430-
def append_nodes(self, nodes: list[NodeNG]) -> None:
431-
self._make_tree().append_nodes(nodes)
432-
433-
def append_globals(self, globs: dict) -> None:
434-
self._make_tree().append_globals(globs)
435-
436-
def process_child_cell(self, code: str) -> None:
437-
try:
438-
this_tree = self._make_tree()
439-
tree = Tree.normalize_and_parse(code)
440-
this_tree.append_tree(tree)
441-
except AstroidSyntaxError as e:
442-
# error already reported when linting enclosing notebook
443-
logger.warning(f"Failed to parse Python cell: {code}", exc_info=e)
444-
445-
def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:
446-
try:
447-
tree = self._parse_and_append(source_code)
448-
for dfsa_node in self.collect_dfsas_from_tree(tree):
449-
yield dfsa_node.dfsa
450-
except AstroidSyntaxError as e:
451-
logger.warning('syntax-error', exc_info=e)
452-
453-
def collect_dfsas_from_tree(self, tree: Tree) -> Iterable[DirectFsAccessNode]:
454-
for collector in self._dfsa_collectors:
455-
yield from collector.collect_dfsas_from_tree(tree)
456-
457-
def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
458-
try:
459-
tree = self._parse_and_append(source_code)
460-
for table_node in self.collect_tables_from_tree(tree):
461-
yield table_node.table
462-
except AstroidSyntaxError as e:
463-
logger.warning('syntax-error', exc_info=e)
464-
465-
def collect_tables_from_tree(self, tree: Tree) -> Iterable[UsedTableNode]:
466-
for collector in self._table_collectors:
467-
yield from collector.collect_tables_from_tree(tree)
468-
469-
def _make_tree(self) -> Tree:
470-
if self._tree is None:
471-
self._tree = Tree.new_module()
472-
return self._tree
473-
474-
475364
SUPPORTED_EXTENSION_LANGUAGES = {
476365
'.py': Language.PYTHON,
477366
'.sql': Language.SQL,

src/databricks/labs/ucx/source_code/jobs.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
SourceInfo,
3535
UsedTable,
3636
LineageAtom,
37-
PythonSequentialLinter,
3837
read_text,
3938
)
4039
from databricks.labs.ucx.source_code.directfs_access import (
@@ -52,7 +51,7 @@
5251
)
5352
from databricks.labs.ucx.source_code.linters.context import LinterContext
5453
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage
55-
from databricks.labs.ucx.source_code.python.python_ast import Tree
54+
from databricks.labs.ucx.source_code.python.python_ast import Tree, PythonSequentialLinter
5655
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook
5756
from databricks.labs.ucx.source_code.path_lookup import PathLookup
5857
from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
@@ -638,8 +637,12 @@ def _collect_from_notebook(
638637
if cell.language is CellLanguage.PYTHON:
639638
if inherited_tree is None:
640639
inherited_tree = Tree.new_module()
641-
tree = Tree.normalize_and_parse(cell.original_code)
642-
inherited_tree.append_tree(tree)
640+
maybe_tree = Tree.maybe_normalized_parse(cell.original_code)
641+
if maybe_tree.failure:
642+
logger.warning(maybe_tree.failure.message)
643+
continue
644+
assert maybe_tree.tree is not None
645+
inherited_tree.append_tree(maybe_tree.tree)
643646

644647
def _collect_from_source(
645648
self,

src/databricks/labs/ucx/source_code/linters/context.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,18 @@
88
Linter,
99
SqlSequentialLinter,
1010
CurrentSessionState,
11-
PythonSequentialLinter,
12-
PythonLinter,
1311
SqlLinter,
14-
TablePyCollector,
1512
TableSqlCollector,
1613
TableCollector,
1714
DfsaCollector,
18-
DfsaPyCollector,
1915
DfsaSqlCollector,
2016
)
17+
from databricks.labs.ucx.source_code.python.python_ast import (
18+
PythonLinter,
19+
TablePyCollector,
20+
DfsaPyCollector,
21+
PythonSequentialLinter,
22+
)
2123
from databricks.labs.ucx.source_code.linters.directfs import DirectFsAccessPyLinter, DirectFsAccessSqlLinter
2224
from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter
2325

src/databricks/labs/ucx/source_code/linters/directfs.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,18 @@
99
Advice,
1010
Deprecation,
1111
CurrentSessionState,
12-
PythonLinter,
1312
SqlLinter,
14-
DfsaPyCollector,
1513
DirectFsAccessNode,
1614
DfsaSqlCollector,
1715
DirectFsAccess,
1816
)
19-
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor, TreeHelper
17+
from databricks.labs.ucx.source_code.python.python_ast import (
18+
Tree,
19+
TreeVisitor,
20+
TreeHelper,
21+
PythonLinter,
22+
DfsaPyCollector,
23+
)
2024
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
2125
from databricks.labs.ucx.source_code.sql.sql_parser import SqlParser, SqlExpression
2226

src/databricks/labs/ucx/source_code/linters/imports.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
NodeNG,
1818
)
1919

20-
from databricks.labs.ucx.source_code.base import Advice, Advisory, CurrentSessionState, PythonLinter
21-
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, TreeVisitor
20+
from databricks.labs.ucx.source_code.base import Advice, Advisory, CurrentSessionState
21+
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, TreeVisitor, PythonLinter
2222
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
2323
from databricks.labs.ucx.source_code.path_lookup import PathLookup
2424

src/databricks/labs/ucx/source_code/linters/pyspark.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,24 @@
1111
Advisory,
1212
Deprecation,
1313
CurrentSessionState,
14-
PythonLinter,
1514
SqlLinter,
1615
Fixer,
1716
UsedTable,
1817
UsedTableNode,
19-
TablePyCollector,
2018
TableSqlCollector,
21-
DfsaPyCollector,
2219
DfsaSqlCollector,
2320
)
2421
from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_ACCESS_PATTERNS, DirectFsAccessNode
2522
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
2623
from databricks.labs.ucx.source_code.linters.from_table import FromTableSqlLinter
27-
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper, MatchingVisitor
24+
from databricks.labs.ucx.source_code.python.python_ast import (
25+
Tree,
26+
TreeHelper,
27+
MatchingVisitor,
28+
PythonLinter,
29+
TablePyCollector,
30+
DfsaPyCollector,
31+
)
2832

2933
logger = logging.getLogger(__name__)
3034

@@ -408,7 +412,12 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
408412
yield from matcher.lint(self._from_table, self._index, self._session_state, node)
409413

410414
def apply(self, code: str) -> str:
411-
tree = Tree.parse(code)
415+
maybe_tree = Tree.maybe_parse(code)
416+
if not maybe_tree.tree:
417+
assert maybe_tree.failure is not None
418+
logger.warning(maybe_tree.failure.message)
419+
return code
420+
tree = maybe_tree.tree
412421
# we won't be doing it like this in production, but for the sake of the example
413422
for node in tree.walk():
414423
matcher = self._find_matcher(node)
@@ -477,7 +486,12 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
477486
def apply(self, code: str) -> str:
478487
if not self._sql_fixer:
479488
return code
480-
tree = Tree.normalize_and_parse(code)
489+
maybe_tree = Tree.maybe_normalized_parse(code)
490+
if maybe_tree.failure:
491+
logger.warning(maybe_tree.failure.message)
492+
return code
493+
assert maybe_tree.tree is not None
494+
tree = maybe_tree.tree
481495
for _call_node, query in self._visit_call_nodes(tree):
482496
if not isinstance(query, Const) or not isinstance(query.value, str):
483497
continue

src/databricks/labs/ucx/source_code/linters/spark_connect.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,11 @@
66
from databricks.labs.ucx.source_code.base import (
77
Advice,
88
Failure,
9-
PythonLinter,
109
CurrentSessionState,
1110
)
1211
from databricks.sdk.service.compute import DataSecurityMode
1312

14-
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper
13+
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper, PythonLinter
1514

1615

1716
@dataclass

src/databricks/labs/ucx/source_code/linters/table_creation.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@
77

88
from databricks.labs.ucx.source_code.base import (
99
Advice,
10-
PythonLinter,
1110
)
12-
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper
11+
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper, PythonLinter
1312

1413

1514
@dataclass

src/databricks/labs/ucx/source_code/notebooks/sources.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pathlib import Path
99
from typing import cast
1010

11-
from astroid import AstroidSyntaxError, Module, NodeNG # type: ignore
11+
from astroid import Module, NodeNG # type: ignore
1212

1313
from databricks.sdk.service.workspace import Language
1414

@@ -17,7 +17,6 @@
1717
Advice,
1818
Failure,
1919
Linter,
20-
PythonSequentialLinter,
2120
CurrentSessionState,
2221
Advisory,
2322
file_language,
@@ -37,7 +36,7 @@
3736
UnresolvedPath,
3837
)
3938
from databricks.labs.ucx.source_code.notebooks.magic import MagicLine
40-
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase
39+
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, PythonSequentialLinter
4140
from databricks.labs.ucx.source_code.notebooks.cells import (
4241
CellLanguage,
4342
Cell,
@@ -196,13 +195,14 @@ def _load_tree_from_notebook(self, notebook: Notebook, register_trees: bool) ->
196195
continue
197196

198197
def _load_tree_from_python_cell(self, python_cell: PythonCell, register_trees: bool) -> Iterable[Advice]:
199-
try:
200-
tree = Tree.normalize_and_parse(python_cell.original_code)
201-
if register_trees:
202-
self._python_trees[python_cell] = tree
203-
yield from self._load_children_from_tree(tree)
204-
except AstroidSyntaxError as e:
205-
yield Failure('syntax-error', str(e), 0, 0, 0, 0)
198+
maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code)
199+
if maybe_tree.failure:
200+
yield maybe_tree.failure
201+
assert maybe_tree.tree is not None
202+
tree = maybe_tree.tree
203+
if register_trees:
204+
self._python_trees[python_cell] = tree
205+
yield from self._load_children_from_tree(tree)
206206

207207
def _load_children_from_tree(self, tree: Tree) -> Iterable[Advice]:
208208
assert isinstance(tree.node, Module)

0 commit comments

Comments
 (0)