Skip to content

Commit f595029

Browse files
authored
Compute dependency routes (#2244)
## Changes Have `DependencyGraph` compute route from root file/notebook to child file/notebook. That route will be later used to populate an `InheritedContext` Have dependencies convey a property for inheriting context, such that context will be computed when but only when required This is in preparation of #2236 ### Linked issues Progresses #2155 Progresses #2156 ### Functionality None ### Tests - [x] added unit tests --------- Co-authored-by: Eric Vergnaud <[email protected]>
1 parent b7cc950 commit f595029

17 files changed

+237
-47
lines changed

src/databricks/labs/ucx/source_code/graph.py

Lines changed: 61 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ def path(self):
4747
def register_library(self, *libraries: str) -> list[DependencyProblem]:
4848
return self._resolver.register_library(self.path_lookup, *libraries)
4949

50-
def register_notebook(self, path: Path) -> list[DependencyProblem]:
51-
maybe = self._resolver.resolve_notebook(self.path_lookup, path)
50+
def register_notebook(self, path: Path, inherit_context: bool) -> list[DependencyProblem]:
51+
maybe = self._resolver.resolve_notebook(self.path_lookup, path, inherit_context)
5252
if not maybe.dependency:
5353
return maybe.problems
5454
maybe_graph = self.register_dependency(maybe.dependency)
@@ -199,6 +199,52 @@ def visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: s
199199
def new_dependency_graph_context(self):
200200
return DependencyGraphContext(parent=self, path_lookup=self._path_lookup, session_state=self._session_state)
201201

202+
def _compute_route(self, root: Path, leaf: Path, visited: set[Path]) -> list[Dependency]:
203+
"""given 2 files or notebooks root and leaf, compute the list of dependencies that must be traversed
204+
in order to lint the leaf in the context of its parents"""
205+
try:
206+
route = self._do_compute_route(root, leaf, visited)
207+
return self._trim_route(route)
208+
except ValueError:
209+
# we only compute routes on graphs so _compute_route can't fail
210+
# but we return an empty route in case it does
211+
return []
212+
213+
def _do_compute_route(self, root: Path, leaf: Path, visited: set[Path]) -> list[Dependency]:
214+
maybe = self.locate_dependency(root)
215+
if not maybe.graph:
216+
logger.warning(f"Could not compute route because dependency {root} cannot be located")
217+
raise ValueError()
218+
route: list[Dependency] = []
219+
220+
def do_compute_route(graph: DependencyGraph) -> bool:
221+
route.append(graph.dependency)
222+
for dependency in graph.local_dependencies:
223+
if dependency.path == leaf:
224+
route.append(dependency)
225+
return True
226+
for dependency in graph.local_dependencies:
227+
sub_route = self._do_compute_route(dependency.path, leaf, visited)
228+
if len(sub_route) > 0:
229+
route.extend(sub_route)
230+
return True
231+
route.pop()
232+
return False
233+
234+
maybe.graph.visit(do_compute_route, visited)
235+
return route
236+
237+
def _trim_route(self, dependencies: list[Dependency]) -> list[Dependency]:
238+
"""don't inherit context if dependency is not a file/notebook, or it is loaded via dbutils.notebook.run or via import"""
239+
# filter out intermediate containers
240+
dependencies = list(dependency for dependency in dependencies if dependency.path.is_file())
241+
# restart when not inheriting context
242+
for i, dependency in enumerate(dependencies):
243+
if dependency.inherits_context:
244+
continue
245+
return [dependency] + self._trim_route(dependencies[i + 1 :])
246+
return dependencies
247+
202248
def __repr__(self):
203249
return f"<DependencyGraph {self.path}>"
204250

@@ -212,14 +258,19 @@ class DependencyGraphContext:
212258

213259
class Dependency(abc.ABC):
214260

215-
def __init__(self, loader: DependencyLoader, path: Path):
261+
def __init__(self, loader: DependencyLoader, path: Path, inherits_context=True):
216262
self._loader = loader
217263
self._path = path
264+
self._inherits_context = inherits_context
218265

219266
@property
220267
def path(self) -> Path:
221268
return self._path
222269

270+
@property
271+
def inherits_context(self):
272+
return self._inherits_context
273+
223274
def __hash__(self):
224275
return hash(self.path)
225276

@@ -236,14 +287,12 @@ def __repr__(self):
236287
class SourceContainer(abc.ABC):
237288

238289
@abc.abstractmethod
239-
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
240-
"""builds a dependency graph from the contents of this container"""
290+
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: ...
241291

242292

243293
class DependencyLoader(abc.ABC):
244294
@abc.abstractmethod
245-
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None:
246-
"""loads a dependency"""
295+
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None: ...
247296

248297

249298
class WrappingLoader(DependencyLoader):
@@ -267,8 +316,7 @@ def register_library(self, path_lookup: PathLookup, *libraries: str) -> list[Dep
267316
class BaseNotebookResolver(abc.ABC):
268317

269318
@abc.abstractmethod
270-
def resolve_notebook(self, path_lookup: PathLookup, path: Path) -> MaybeDependency:
271-
"""locates a notebook"""
319+
def resolve_notebook(self, path_lookup: PathLookup, path: Path, inherit_context: bool) -> MaybeDependency: ...
272320

273321
@staticmethod
274322
def _fail(code: str, message: str) -> MaybeDependency:
@@ -315,8 +363,8 @@ def __init__(
315363
self._import_resolver = import_resolver
316364
self._path_lookup = path_lookup
317365

318-
def resolve_notebook(self, path_lookup: PathLookup, path: Path) -> MaybeDependency:
319-
return self._notebook_resolver.resolve_notebook(path_lookup, path)
366+
def resolve_notebook(self, path_lookup: PathLookup, path: Path, inherit_context: bool) -> MaybeDependency:
367+
return self._notebook_resolver.resolve_notebook(path_lookup, path, inherit_context)
320368

321369
def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency:
322370
return self._import_resolver.resolve_import(path_lookup, name)
@@ -353,7 +401,8 @@ def _local_file_resolver(self) -> BaseFileResolver | None:
353401
def build_notebook_dependency_graph(self, path: Path, session_state: CurrentSessionState) -> MaybeGraph:
354402
"""Builds a dependency graph starting from a notebook. This method is mainly intended for testing purposes.
355403
In case of problems, the paths in the problems will be relative to the starting path lookup."""
356-
maybe = self._notebook_resolver.resolve_notebook(self._path_lookup, path)
404+
# the notebook is the root of the graph, so there's no context to inherit
405+
maybe = self._notebook_resolver.resolve_notebook(self._path_lookup, path, inherit_context=False)
357406
if not maybe.dependency:
358407
return MaybeGraph(None, self._make_relative_paths(maybe.problems, path))
359408
graph = DependencyGraph(maybe.dependency, None, self, self._path_lookup, session_state)

src/databricks/labs/ucx/source_code/jobs.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def as_message(self) -> str:
5757
class WorkflowTask(Dependency):
5858
def __init__(self, ws: WorkspaceClient, task: jobs.Task, job: jobs.Job):
5959
loader = WrappingLoader(WorkflowTaskContainer(ws, task, job))
60-
super().__init__(loader, Path(f'/jobs/{task.task_key}'))
60+
super().__init__(loader, Path(f'/jobs/{task.task_key}'), False)
6161
self._task = task
6262
self._job = job
6363

@@ -188,7 +188,7 @@ def _register_notebook(self, graph: DependencyGraph) -> Iterable[DependencyProbl
188188
logger.info(f'Discovering {self._task.task_key} entrypoint: {notebook_path}')
189189
# Notebooks can't be on DBFS.
190190
path = WorkspacePath(self._ws, notebook_path)
191-
return graph.register_notebook(path)
191+
return graph.register_notebook(path, False)
192192

193193
def _register_spark_python_task(self, graph: DependencyGraph):
194194
if not self._task.spark_python_task:
@@ -197,7 +197,7 @@ def _register_spark_python_task(self, graph: DependencyGraph):
197197
notebook_path = self._task.spark_python_task.python_file
198198
logger.info(f'Discovering {self._task.task_key} entrypoint: {notebook_path}')
199199
path = self._as_path(notebook_path)
200-
return graph.register_notebook(path)
200+
return graph.register_notebook(path, False)
201201

202202
@staticmethod
203203
def _find_first_matching_distribution(path_lookup: PathLookup, name: str) -> metadata.Distribution | None:
@@ -263,7 +263,8 @@ def _register_pipeline_task(self, graph: DependencyGraph):
263263
notebook_path = library.notebook.path
264264
# Notebooks can't be on DBFS.
265265
path = WorkspacePath(self._ws, notebook_path)
266-
yield from graph.register_notebook(path)
266+
# the notebook is the root of the graph, so there's no context to inherit
267+
yield from graph.register_notebook(path, inherit_context=False)
267268
if library.jar:
268269
yield from self._register_library(graph, compute.Library(jar=library.jar))
269270
if library.maven:

src/databricks/labs/ucx/source_code/linters/files.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,21 @@ def __init__(self, path: Path, source: str, language: Language):
3838
# using CellLanguage so we can reuse the facilities it provides
3939
self._language = CellLanguage.of_language(language)
4040

41+
@property
42+
def path(self) -> Path:
43+
return self._path
44+
4145
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
4246
if self._language is CellLanguage.PYTHON:
4347
context = parent.new_dependency_graph_context()
44-
analyser = PythonCodeAnalyzer(context, self._original_code)
45-
return analyser.build_graph()
48+
analyzer = PythonCodeAnalyzer(context, self._original_code)
49+
return analyzer.build_graph()
4650
# supported language that does not generate dependencies
4751
if self._language is CellLanguage.SQL:
4852
return []
4953
logger.warning(f"Unsupported language: {self._language.language}")
5054
return []
5155

52-
@property
53-
def path(self) -> Path:
54-
return self._path
55-
5656
def __repr__(self):
5757
return f"<LocalFile {self._path}>"
5858

@@ -76,7 +76,7 @@ def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProb
7676
def _build_dependency_graph(self, parent: DependencyGraph) -> Iterable[DependencyProblem]:
7777
for child_path in self._path.iterdir():
7878
loader = self._folder_loader if child_path.is_dir() else self._file_loader
79-
dependency = Dependency(loader, child_path)
79+
dependency = Dependency(loader, child_path, False)
8080
yield from parent.register_dependency(dependency).problems
8181

8282
def __repr__(self):
@@ -310,7 +310,7 @@ def _resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency
310310
absolute_path = path_lookup.resolve(relative_path)
311311
if not absolute_path:
312312
continue
313-
dependency = Dependency(self._file_loader, absolute_path)
313+
dependency = Dependency(self._file_loader, absolute_path, False)
314314
return MaybeDependency(dependency, [])
315315
return None
316316

src/databricks/labs/ucx/source_code/notebooks/cells.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,10 @@ def migrated_code(self, value: str):
7070

7171
@property
7272
@abstractmethod
73-
def language(self) -> CellLanguage:
74-
"""returns the language of this cell"""
73+
def language(self) -> CellLanguage: ...
7574

7675
@abstractmethod
77-
def is_runnable(self) -> bool:
78-
"""whether of not this cell can be run"""
76+
def is_runnable(self) -> bool: ...
7977

8078
def build_dependency_graph(self, _: DependencyGraph) -> list[DependencyProblem]:
8179
"""Check for any problems with dependencies of this cell.
@@ -173,7 +171,7 @@ def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProb
173171
path, idx, line = self._read_notebook_path()
174172
if path is not None:
175173
start_line = self._original_offset + idx
176-
problems = parent.register_notebook(path)
174+
problems = parent.register_notebook(path, True)
177175
return [
178176
problem.replace(start_line=start_line, start_col=0, end_line=start_line, end_col=len(line))
179177
for problem in problems
@@ -467,7 +465,8 @@ def _register_notebook(self, base_node: NotebookRunCall):
467465
f"Can't check dependency from {base_node.node.as_string()} because the expression cannot be computed",
468466
)
469467
for path in paths:
470-
yield from self._context.parent.register_notebook(Path(path))
468+
# notebooks ran via dbutils.notebook.run do not inherit or propagate context
469+
yield from self._context.parent.register_notebook(Path(path), False)
471470

472471
def _mutate_path_lookup(self, change: SysPathChange):
473472
if isinstance(change, UnresolvedPath):
@@ -547,7 +546,7 @@ class RunCommand(MagicCommand):
547546
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
548547
path = self.notebook_path
549548
if path is not None:
550-
problems = parent.register_notebook(path)
549+
problems = parent.register_notebook(path, True)
551550
return [problem.from_node(problem.code, problem.message, self._node) for problem in problems]
552551
problem = DependencyProblem.from_node('invalid-run-cell', "Missing notebook path in %run command", self._node)
553552
return [problem]

src/databricks/labs/ucx/source_code/notebooks/loaders.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ def __init__(self, notebook_loader: NotebookLoader):
2626
super().__init__()
2727
self._notebook_loader = notebook_loader
2828

29-
def resolve_notebook(self, path_lookup: PathLookup, path: Path) -> MaybeDependency:
29+
def resolve_notebook(self, path_lookup: PathLookup, path: Path, inherit_context: bool) -> MaybeDependency:
3030
absolute_path = self._notebook_loader.resolve(path_lookup, path)
3131
if not absolute_path:
3232
return self._fail('notebook-not-found', f"Notebook not found: {path.as_posix()}")
33-
dependency = Dependency(self._notebook_loader, absolute_path)
33+
dependency = Dependency(self._notebook_loader, absolute_path, inherit_context)
3434
return MaybeDependency(dependency, [])
3535

3636

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Databricks notebook source
2+
3+
# ucx[cannot-autofix-table-reference:+2:0:+2:33] Can't migrate 'spark.table(f'{some_table_name}')' because its table name argument cannot be computed
4+
# ucx[default-format-changed-in-dbr8:+1:0:+1:33] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
5+
spark.table(f"{some_table_name}")
6+
x = 2
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Databricks notebook source
2+
3+
# ucx[default-format-changed-in-dbr8:+1:0:+1:33] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
4+
spark.table(f"{some_table_name}")
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Databricks notebook source
2+
3+
dbutils.notebook.run("./parent_that_magic_runs_child_that_uses_value_from_parent.py")
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Databricks notebook source
2+
3+
from parent_that_magic_runs_child_that_uses_value_from_parent import other_table_name
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Databricks notebook source
2+
3+
%run ./parent_that_magic_runs_child_that_uses_value_from_parent.py

0 commit comments

Comments
 (0)