Skip to content

Commit 3226727

Browse files
Implement dependency graph walker (#2502)
## Changes There is redundant code for walking a dependency graph 'properly' i.e. from roots to leaves, populating python globals etc... This PR makes it trivial to reuse the correct code, and reuses it for local code linting and jobs linting. This will also be used by #2371 ### Linked issues None ### Functionality None ### Tests - [x] ran existing unit tests --------- Co-authored-by: Eric Vergnaud <[email protected]> Co-authored-by: Cor <[email protected]>
1 parent 7b263ff commit 3226727

File tree

3 files changed

+99
-58
lines changed

3 files changed

+99
-58
lines changed

src/databricks/labs/ucx/source_code/graph.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import logging
55
from dataclasses import dataclass
66
from pathlib import Path
7-
from collections.abc import Callable
7+
from collections.abc import Callable, Iterable, Iterator
8+
from typing import TypeVar, Generic
89

910
from astroid import ( # type: ignore
1011
NodeNG,
@@ -581,3 +582,45 @@ def finalize(self) -> InheritedContext:
581582
return self
582583
tree = self._tree.renumber(-1)
583584
return InheritedContext(tree, self.found)
585+
586+
587+
T = TypeVar("T")
588+
589+
590+
class DependencyGraphWalker(abc.ABC, Generic[T]):
591+
592+
def __init__(self, graph: DependencyGraph, walked_paths: set[Path], path_lookup: PathLookup):
593+
self._graph = graph
594+
self._walked_paths = walked_paths
595+
self._path_lookup = path_lookup
596+
597+
def __iter__(self) -> Iterator[T]:
598+
for dependency in self._graph.root_dependencies:
599+
# the dependency is a root, so its path is the one to use
600+
# for computing lineage and building python global context
601+
root_path = dependency.path
602+
yield from self._iter_one(dependency, self._graph, root_path)
603+
604+
def _iter_one(self, dependency: Dependency, graph: DependencyGraph, root_path: Path) -> Iterable[T]:
605+
if dependency.path in self._walked_paths:
606+
return
607+
self._walked_paths.add(dependency.path)
608+
self._log_walk_one(dependency)
609+
if dependency.path.is_file() or is_a_notebook(dependency.path):
610+
inherited_tree = graph.root.build_inherited_tree(root_path, dependency.path)
611+
path_lookup = self._path_lookup.change_directory(dependency.path.parent)
612+
yield from self._process_dependency(dependency, path_lookup, inherited_tree)
613+
maybe_graph = graph.locate_dependency(dependency.path)
614+
# missing graph problems have already been reported while building the graph
615+
if maybe_graph.graph:
616+
child_graph = maybe_graph.graph
617+
for child_dependency in child_graph.local_dependencies:
618+
yield from self._iter_one(child_dependency, child_graph, root_path)
619+
620+
def _log_walk_one(self, dependency: Dependency):
621+
logger.debug(f'Analyzing dependency: {dependency}')
622+
623+
@abc.abstractmethod
624+
def _process_dependency(
625+
self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None
626+
) -> Iterable[T]: ...

src/databricks/labs/ucx/source_code/jobs.py

Lines changed: 34 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,18 @@
2020
from databricks.labs.ucx.assessment.crawlers import runtime_version_tuple
2121
from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
2222
from databricks.labs.ucx.mixins.cached_workspace_path import WorkspaceCache
23-
from databricks.labs.ucx.source_code.base import CurrentSessionState, is_a_notebook, LocatedAdvice
23+
from databricks.labs.ucx.source_code.base import CurrentSessionState, LocatedAdvice
2424
from databricks.labs.ucx.source_code.graph import (
2525
Dependency,
2626
DependencyGraph,
2727
DependencyProblem,
2828
DependencyResolver,
2929
SourceContainer,
3030
WrappingLoader,
31+
DependencyGraphWalker,
3132
)
3233
from databricks.labs.ucx.source_code.linters.context import LinterContext
34+
from databricks.labs.ucx.source_code.linters.python_ast import Tree
3335
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter
3436
from databricks.labs.ucx.source_code.path_lookup import PathLookup
3537

@@ -389,51 +391,53 @@ def _lint_job(self, job: jobs.Job) -> list[JobProblem]:
389391
return problems
390392

391393
def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> Iterable[LocatedAdvice]:
392-
dependency: Dependency = WorkflowTask(self._ws, task, job)
394+
root_dependency: Dependency = WorkflowTask(self._ws, task, job)
393395
# we can load it without further preparation since the WorkflowTask is merely a wrapper
394-
container = dependency.load(self._path_lookup)
396+
container = root_dependency.load(self._path_lookup)
395397
assert isinstance(container, WorkflowTaskContainer)
396398
session_state = CurrentSessionState(
397399
data_security_mode=container.data_security_mode,
398400
named_parameters=container.named_parameters,
399401
spark_conf=container.spark_conf,
400402
dbr_version=container.runtime_version,
401403
)
402-
graph = DependencyGraph(dependency, None, self._resolver, self._path_lookup, session_state)
404+
graph = DependencyGraph(root_dependency, None, self._resolver, self._path_lookup, session_state)
403405
problems = container.build_dependency_graph(graph)
404406
if problems:
405407
for problem in problems:
406408
source_path = self._UNKNOWN if problem.is_path_missing() else problem.source_path
407409
yield LocatedAdvice(problem.as_advisory(), source_path)
408410
return
409-
for dependency in graph.root_dependencies:
410-
root = dependency.path # since it's a root
411-
yield from self._lint_one(task, dependency, graph, root, session_state, linted_paths)
411+
walker = LintingWalker(
412+
graph, linted_paths, self._path_lookup, task.task_key, session_state, self._migration_index
413+
)
414+
yield from walker
415+
412416

413-
def _lint_one(
417+
class LintingWalker(DependencyGraphWalker[LocatedAdvice]):
418+
419+
def __init__(
414420
self,
415-
task: jobs.Task,
416-
dependency: Dependency,
417421
graph: DependencyGraph,
418-
root_path: Path,
419-
session_state: CurrentSessionState,
420422
linted_paths: set[Path],
423+
path_lookup: PathLookup,
424+
key: str,
425+
session_state: CurrentSessionState,
426+
migration_index: MigrationIndex,
427+
):
428+
super().__init__(graph, linted_paths, path_lookup)
429+
self._key = key
430+
self._session_state = session_state
431+
self._migration_index = migration_index
432+
433+
def _log_walk_one(self, dependency: Dependency):
434+
logger.info(f'Linting {self._key} dependency: {dependency}')
435+
436+
def _process_dependency(
437+
self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None
421438
) -> Iterable[LocatedAdvice]:
422-
if dependency.path in linted_paths:
423-
return
424-
linted_paths.add(dependency.path)
425-
logger.info(f'Linting {task.task_key} dependency: {dependency}')
426-
if dependency.path.is_file() or is_a_notebook(dependency.path):
427-
inherited_tree = graph.root.build_inherited_tree(root_path, dependency.path)
428-
ctx = LinterContext(self._migration_index, session_state)
429-
path_lookup = self._path_lookup.change_directory(dependency.path.parent)
430-
# FileLinter will determine which file/notebook linter to use
431-
linter = FileLinter(ctx, path_lookup, session_state, dependency.path, inherited_tree)
432-
for advice in linter.lint():
433-
yield LocatedAdvice(advice, dependency.path)
434-
maybe_graph = graph.locate_dependency(dependency.path)
435-
# problems have already been reported while building the graph
436-
if maybe_graph.graph:
437-
child_graph = maybe_graph.graph
438-
for child_dependency in child_graph.local_dependencies:
439-
yield from self._lint_one(task, child_dependency, child_graph, root_path, session_state, linted_paths)
439+
ctx = LinterContext(self._migration_index, self._session_state)
440+
# FileLinter determines which file/notebook linter to use
441+
linter = FileLinter(ctx, path_lookup, self._session_state, dependency.path, inherited_tree)
442+
for advice in linter.lint():
443+
yield LocatedAdvice(advice, dependency.path)

src/databricks/labs/ucx/source_code/linters/files.py

Lines changed: 21 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing import TextIO
88

99
from databricks.labs.ucx.source_code.base import LocatedAdvice, CurrentSessionState, file_language, is_a_notebook
10+
from databricks.labs.ucx.source_code.linters.python_ast import Tree
1011
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
1112
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter
1213
from databricks.labs.ucx.source_code.path_lookup import PathLookup
@@ -27,6 +28,7 @@
2728
SourceContainer,
2829
DependencyResolver,
2930
InheritedContext,
31+
DependencyGraphWalker,
3032
)
3133

3234
logger = logging.getLogger(__name__)
@@ -141,40 +143,32 @@ def lint_path(self, path: Path, linted_paths: set[Path] | None = None) -> Iterab
141143
is_dir = path.is_dir()
142144
loader = self._folder_loader if is_dir else self._file_loader
143145
path_lookup = self._path_lookup.change_directory(path if is_dir else path.parent)
144-
dependency = Dependency(loader, path, not is_dir) # don't inherit context when traversing folders
145-
graph = DependencyGraph(dependency, None, self._dependency_resolver, path_lookup, self._session_state)
146-
container = dependency.load(path_lookup)
146+
root_dependency = Dependency(loader, path, not is_dir) # don't inherit context when traversing folders
147+
graph = DependencyGraph(root_dependency, None, self._dependency_resolver, path_lookup, self._session_state)
148+
container = root_dependency.load(path_lookup)
147149
assert container is not None # because we just created it
148150
problems = container.build_dependency_graph(graph)
149151
for problem in problems:
150152
problem_path = Path('UNKNOWN') if problem.is_path_missing() else problem.source_path.absolute()
151153
yield problem.as_advisory().for_path(problem_path)
154+
context_factory = self._context_factory
155+
session_state = self._session_state
156+
157+
class LintingWalker(DependencyGraphWalker[LocatedAdvice]):
158+
159+
def _process_dependency(
160+
self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None
161+
) -> Iterable[LocatedAdvice]:
162+
ctx = context_factory()
163+
# FileLinter will determine which file/notebook linter to use
164+
linter = FileLinter(ctx, path_lookup, session_state, dependency.path, inherited_tree)
165+
for advice in linter.lint():
166+
yield LocatedAdvice(advice, dependency.path)
167+
152168
if linted_paths is None:
153169
linted_paths = set()
154-
for dependency in graph.root_dependencies:
155-
root = dependency.path # since it's a root
156-
yield from self._lint_one(dependency, graph, root, linted_paths)
157-
158-
def _lint_one(
159-
self, dependency: Dependency, graph: DependencyGraph, root_path: Path, linted_paths: set[Path]
160-
) -> Iterable[LocatedAdvice]:
161-
if dependency.path in linted_paths:
162-
return
163-
linted_paths.add(dependency.path)
164-
if dependency.path.is_file() or is_a_notebook(dependency.path):
165-
inherited_tree = graph.root.build_inherited_tree(root_path, dependency.path)
166-
ctx = self._context_factory()
167-
path_lookup = self._path_lookup.change_directory(dependency.path.parent)
168-
# FileLinter will determine which file/notebook linter to use
169-
linter = FileLinter(ctx, path_lookup, self._session_state, dependency.path, inherited_tree)
170-
for advice in linter.lint():
171-
yield advice.for_path(dependency.path)
172-
maybe_graph = graph.locate_dependency(dependency.path)
173-
# problems have already been reported while building the graph
174-
if maybe_graph.graph:
175-
child_graph = maybe_graph.graph
176-
for child_dependency in child_graph.local_dependencies:
177-
yield from self._lint_one(child_dependency, child_graph, root_path, linted_paths)
170+
walker = LintingWalker(graph, linted_paths, self._path_lookup)
171+
yield from walker
178172

179173

180174
class LocalFileMigrator:

0 commit comments

Comments
 (0)