Skip to content

Commit 89f3830

Browse files
authored
Refactor python code analysis (#2521)
## Changes Python code analysis is under linter package which is misleading since these are general utilities. This PR reorganizes code in preparation of PR #2519 ### Linked issues Progresses #2350 ### Functionality None ### Tests - [x] ran unit tests --------- Co-authored-by: Eric Vergnaud <[email protected]>
1 parent 1a4fe3e commit 89f3830

File tree

25 files changed

+341
-272
lines changed

25 files changed

+341
-272
lines changed

src/databricks/labs/ucx/source_code/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from databricks.sdk.service.workspace import Language
1616

1717
from databricks.labs.blueprint.paths import WorkspacePath
18-
from databricks.labs.ucx.source_code.linters.python_ast import Tree
18+
from databricks.labs.ucx.source_code.python.python_ast import Tree
1919

2020
# Code mapping between LSP, PyLint, and our own diagnostics:
2121
# | LSP | PyLint | Our |

src/databricks/labs/ucx/source_code/graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
NodeNG,
1212
)
1313
from databricks.labs.ucx.source_code.base import Advisory, CurrentSessionState, is_a_notebook
14-
from databricks.labs.ucx.source_code.linters.python_ast import Tree
14+
from databricks.labs.ucx.source_code.python.python_ast import Tree
1515
from databricks.labs.ucx.source_code.path_lookup import PathLookup
1616

1717
logger = logging.Logger(__name__)

src/databricks/labs/ucx/source_code/jobs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
DependencyGraphWalker,
3232
)
3333
from databricks.labs.ucx.source_code.linters.context import LinterContext
34-
from databricks.labs.ucx.source_code.linters.python_ast import Tree
34+
from databricks.labs.ucx.source_code.python.python_ast import Tree
3535
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter
3636
from databricks.labs.ucx.source_code.path_lookup import PathLookup
3737

src/databricks/labs/ucx/source_code/linters/dbfs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
PythonLinter,
1313
SqlLinter,
1414
)
15-
from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor
16-
from databricks.labs.ucx.source_code.linters.python_infer import InferredValue
15+
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor
16+
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
1717

1818
logger = logging.getLogger(__name__)
1919

src/databricks/labs/ucx/source_code/linters/files.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from typing import TextIO
88

99
from databricks.labs.ucx.source_code.base import LocatedAdvice, CurrentSessionState, file_language, is_a_notebook
10-
from databricks.labs.ucx.source_code.linters.python_ast import Tree
10+
from databricks.labs.ucx.source_code.python.python_ast import Tree
1111
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
1212
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter
1313
from databricks.labs.ucx.source_code.path_lookup import PathLookup

src/databricks/labs/ucx/source_code/linters/imports.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
)
1919

2020
from databricks.labs.ucx.source_code.base import Advice, Advisory, CurrentSessionState, PythonLinter
21-
from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase, TreeVisitor
22-
from databricks.labs.ucx.source_code.linters.python_infer import InferredValue
21+
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, TreeVisitor
22+
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
2323
from databricks.labs.ucx.source_code.path_lookup import PathLookup
2424

2525
logger = logging.getLogger(__name__)

src/databricks/labs/ucx/source_code/linters/pyspark.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
CurrentSessionState,
1313
PythonLinter,
1414
)
15-
from databricks.labs.ucx.source_code.linters.python_infer import InferredValue
15+
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
1616
from databricks.labs.ucx.source_code.queries import FromTableSqlLinter
17-
from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper
17+
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper
1818

1919

2020
@dataclass

src/databricks/labs/ucx/source_code/linters/spark_connect.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
)
1212
from databricks.sdk.service.compute import DataSecurityMode
1313

14-
from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper
14+
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper
1515

1616

1717
@dataclass

src/databricks/labs/ucx/source_code/linters/table_creation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
Advice,
1010
PythonLinter,
1111
)
12-
from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper
12+
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper
1313

1414

1515
@dataclass

src/databricks/labs/ucx/source_code/notebooks/cells.py

Lines changed: 16 additions & 245 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,10 @@
55
import shlex
66
from abc import ABC, abstractmethod
77
from ast import parse as parse_python
8-
from collections.abc import Callable, Iterable
98
from enum import Enum
109
from pathlib import Path
11-
from typing import TypeVar, cast
1210

13-
from astroid import Call, Const, ImportFrom, Name, NodeNG, Try # type: ignore
14-
from astroid.exceptions import AstroidSyntaxError # type: ignore
11+
from astroid import NodeNG # type: ignore
1512
from sqlglot import parse as parse_sql, ParseError as SQLParseError
1613

1714
from databricks.sdk.service.workspace import Language
@@ -23,14 +20,8 @@
2320
DependencyGraphContext,
2421
InheritedContext,
2522
)
26-
from databricks.labs.ucx.source_code.linters.imports import (
27-
SysPathChange,
28-
DbutilsPyLinter,
29-
ImportSource,
30-
NotebookRunCall,
31-
UnresolvedPath,
32-
)
33-
from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase
23+
from databricks.labs.ucx.source_code.python.python_analyzer import PythonCodeAnalyzer
24+
from databricks.labs.ucx.source_code.notebooks.magic import MagicNode, MagicCommand, magic_command_factory
3425

3526
# use a specific logger for sqlglot warnings so we can disable them selectively
3627
sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot")
@@ -403,242 +394,15 @@ def wrap_with_magic(self, code: str, cell_language: CellLanguage) -> str:
403394
return "\n".join(lines)
404395

405396

406-
class PythonCodeAnalyzer:
407-
408-
def __init__(self, context: DependencyGraphContext, python_code: str):
409-
self._context = context
410-
self._python_code = python_code
411-
412-
def build_graph(self) -> list[DependencyProblem]:
413-
"""Check python code for dependency-related problems.
414-
415-
Returns:
416-
A list of dependency problems; position information is relative to the python code itself.
417-
"""
418-
problems: list[DependencyProblem] = []
419-
try:
420-
_, nodes, parse_problems = self._parse_and_extract_nodes()
421-
problems.extend(parse_problems)
422-
except AstroidSyntaxError as e:
423-
logger.debug(f"Could not parse Python code: {self._python_code}", exc_info=True)
424-
problems.append(DependencyProblem('parse-error', f"Could not parse Python code: {e}"))
425-
return problems
426-
for base_node in nodes:
427-
for problem in self._build_graph_from_node(base_node):
428-
# Astroid line numbers are 1-based.
429-
problem = problem.replace(
430-
start_line=base_node.node.lineno - 1,
431-
start_col=base_node.node.col_offset,
432-
end_line=(base_node.node.end_lineno or 1) - 1,
433-
end_col=base_node.node.end_col_offset or 0,
434-
)
435-
problems.append(problem)
436-
return problems
437-
438-
def build_inherited_context(self, child_path: Path) -> InheritedContext:
439-
try:
440-
tree, nodes, _ = self._parse_and_extract_nodes()
441-
except AstroidSyntaxError:
442-
logger.debug(f"Could not parse Python code: {self._python_code}", exc_info=True)
443-
return InheritedContext(None, False)
444-
if len(nodes) == 0:
445-
return InheritedContext(tree, False)
446-
context = InheritedContext(Tree.new_module(), False)
447-
last_line = -1
448-
for base_node in nodes:
449-
# append nodes
450-
node_line = base_node.node.lineno
451-
nodes = tree.nodes_between(last_line + 1, node_line - 1)
452-
context.tree.append_nodes(nodes)
453-
globs = tree.globals_between(last_line + 1, node_line - 1)
454-
context.tree.append_globals(globs)
455-
last_line = node_line
456-
# process node
457-
child_context = self._build_inherited_context_from_node(base_node, child_path)
458-
context = context.append(child_context, True)
459-
if context.found:
460-
return context
461-
line_count = tree.line_count()
462-
if last_line < line_count:
463-
nodes = tree.nodes_between(last_line + 1, line_count)
464-
context.tree.append_nodes(nodes)
465-
globs = tree.globals_between(last_line + 1, line_count)
466-
context.tree.append_globals(globs)
467-
return context
468-
469-
def _parse_and_extract_nodes(self) -> tuple[Tree, list[NodeBase], Iterable[DependencyProblem]]:
470-
problems: list[DependencyProblem] = []
471-
tree = Tree.normalize_and_parse(self._python_code)
472-
syspath_changes = SysPathChange.extract_from_tree(self._context.session_state, tree)
473-
run_calls = DbutilsPyLinter.list_dbutils_notebook_run_calls(tree)
474-
import_sources: list[ImportSource]
475-
import_problems: list[DependencyProblem]
476-
import_sources, import_problems = ImportSource.extract_from_tree(tree, DependencyProblem.from_node)
477-
problems.extend(import_problems)
478-
magic_lines, command_problems = MagicLine.extract_from_tree(tree, DependencyProblem.from_node)
479-
problems.extend(command_problems)
480-
# need to evaluate things in intertwined sequence so concat and sort them
481-
nodes: list[NodeBase] = cast(list[NodeBase], syspath_changes + run_calls + import_sources + magic_lines)
482-
nodes = sorted(nodes, key=lambda node: (node.node.lineno, node.node.col_offset))
483-
return tree, nodes, problems
484-
485-
def _build_graph_from_node(self, base_node: NodeBase) -> Iterable[DependencyProblem]:
486-
if isinstance(base_node, SysPathChange):
487-
yield from self._mutate_path_lookup(base_node)
488-
elif isinstance(base_node, NotebookRunCall):
489-
yield from self._register_notebook(base_node)
490-
elif isinstance(base_node, ImportSource):
491-
yield from self._register_import(base_node)
492-
elif isinstance(base_node, MagicLine):
493-
yield from base_node.build_dependency_graph(self._context.parent)
494-
else:
495-
logger.warning(f"Can't build graph for node {NodeBase.__name__} of type {type(base_node).__name__}")
496-
497-
def _build_inherited_context_from_node(self, base_node: NodeBase, child_path: Path) -> InheritedContext:
498-
if isinstance(base_node, SysPathChange):
499-
self._mutate_path_lookup(base_node)
500-
return InheritedContext(None, False)
501-
if isinstance(base_node, ImportSource):
502-
# nothing to do, Astroid takes care of imports
503-
return InheritedContext(None, False)
504-
if isinstance(base_node, NotebookRunCall):
505-
# nothing to do, dbutils.notebook.run uses a dedicated context
506-
return InheritedContext(None, False)
507-
if isinstance(base_node, MagicLine):
508-
return base_node.build_inherited_context(self._context, child_path)
509-
logger.warning(f"Can't build inherited context for node {NodeBase.__name__} of type {type(base_node).__name__}")
510-
return InheritedContext(None, False)
511-
512-
def _register_import(self, base_node: ImportSource) -> Iterable[DependencyProblem]:
513-
prefix = ""
514-
if isinstance(base_node.node, ImportFrom) and base_node.node.level is not None:
515-
prefix = "." * base_node.node.level
516-
name = base_node.name or ""
517-
problems = self._context.parent.register_import(prefix + name)
518-
for problem in problems:
519-
prob = self._filter_import_problem_in_try_except(problem, base_node)
520-
if prob is not None:
521-
yield prob
522-
523-
@classmethod
524-
def _filter_import_problem_in_try_except(
525-
cls, problem: DependencyProblem, base_node: ImportSource
526-
) -> DependencyProblem | None:
527-
if problem.code != 'import-not-found':
528-
return problem
529-
# is base_node in a try-except clause ?
530-
node = base_node.node.parent
531-
while node and not isinstance(node, Try):
532-
node = node.parent
533-
if cls._is_try_except_import_error(node):
534-
return None
535-
return problem
536-
537-
@classmethod
538-
def _is_try_except_import_error(cls, node: Try | None) -> bool:
539-
if not isinstance(node, Try):
540-
return False
541-
for handler in node.handlers:
542-
if isinstance(handler.type, Name):
543-
if handler.type.name == "ImportError":
544-
return True
545-
return False
546-
547-
def _register_notebook(self, base_node: NotebookRunCall) -> Iterable[DependencyProblem]:
548-
has_unresolved, paths = base_node.get_notebook_paths(self._context.session_state)
549-
if has_unresolved:
550-
yield DependencyProblem(
551-
'dependency-cannot-compute-value',
552-
f"Can't check dependency from {base_node.node.as_string()} because the expression cannot be computed",
553-
)
554-
for path in paths:
555-
# notebooks ran via dbutils.notebook.run do not inherit or propagate context
556-
yield from self._context.parent.register_notebook(Path(path), False)
557-
558-
def _mutate_path_lookup(self, change: SysPathChange) -> Iterable[DependencyProblem]:
559-
if isinstance(change, UnresolvedPath):
560-
yield DependencyProblem(
561-
'sys-path-cannot-compute-value',
562-
f"Can't update sys.path from {change.node.as_string()} because the expression cannot be computed",
563-
)
564-
return
565-
change.apply_to(self._context.path_lookup)
566-
567-
568-
T = TypeVar("T")
569-
570-
571-
class MagicLine(NodeBase):
572-
573-
@classmethod
574-
def extract_from_tree(
575-
cls, tree: Tree, problem_factory: Callable[[str, str, NodeNG], T]
576-
) -> tuple[list[MagicLine], list[T]]:
577-
problems: list[T] = []
578-
commands: list[MagicLine] = []
579-
try:
580-
nodes = tree.locate(Call, [("magic_command", Name)])
581-
for command in cls._make_commands_for_magic_command_call_nodes(nodes):
582-
commands.append(command)
583-
except Exception as e: # pylint: disable=broad-except
584-
logger.debug(f"Internal error while checking magic commands in tree: {tree.root}", exc_info=True)
585-
problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root)
586-
problems.append(problem)
587-
return commands, problems
397+
class RunCommand(MagicCommand):
588398

589-
@classmethod
590-
def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]):
591-
for node in nodes:
592-
arg = node.args[0]
593-
if isinstance(arg, Const):
594-
yield MagicLine(node, arg.value)
595-
596-
def __init__(self, node: NodeNG, command: bytes):
597-
super().__init__(node)
598-
self._command = command.decode()
599-
600-
def as_magic(self) -> MagicCommand | None:
601-
if self._command.startswith("%pip") or self._command.startswith("!pip"):
602-
return PipCommand(self.node, self._command)
603-
if self._command.startswith("%run"):
604-
return RunCommand(self.node, self._command)
399+
@staticmethod
400+
@magic_command_factory
401+
def factory(command: str, node: NodeNG) -> MagicCommand | None:
402+
if command.startswith("%run"):
403+
return RunCommand(node, command)
605404
return None
606405

607-
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
608-
magic = self.as_magic()
609-
if magic is not None:
610-
return magic.build_dependency_graph(parent)
611-
problem = DependencyProblem.from_node(
612-
code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node
613-
)
614-
return [problem]
615-
616-
def build_inherited_context(self, context: DependencyGraphContext, child_path: Path) -> InheritedContext:
617-
magic = self.as_magic()
618-
if magic is not None:
619-
return magic.build_inherited_context(context, child_path)
620-
return InheritedContext(None, False)
621-
622-
623-
class MagicNode(NodeNG):
624-
pass
625-
626-
627-
class MagicCommand(ABC):
628-
629-
def __init__(self, node: NodeNG, code: str):
630-
self._node = node
631-
self._code = code
632-
633-
@abstractmethod
634-
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: ...
635-
636-
def build_inherited_context(self, _context: DependencyGraphContext, _child_path: Path) -> InheritedContext:
637-
return InheritedContext(None, False)
638-
639-
640-
class RunCommand(MagicCommand):
641-
642406
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
643407
path = self.notebook_path
644408
if path is not None:
@@ -678,6 +442,13 @@ def build_inherited_context(self, context: DependencyGraphContext, child_path: P
678442

679443
class PipCommand(MagicCommand):
680444

445+
@staticmethod
446+
@magic_command_factory
447+
def factory(command: str, node: NodeNG) -> MagicCommand | None:
448+
if command.startswith("%pip") or command.startswith("!pip"):
449+
return PipCommand(node, command)
450+
return None
451+
681452
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
682453
argv = self._split(self._code)
683454
if len(argv) == 1:

0 commit comments

Comments
 (0)