Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions src/fromager/dependency_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,60 @@ def construct_root_node(cls) -> DependencyNode:
Version("0"),
)

def iter_build_requirements(self) -> typing.Iterable[DependencyNode]:
    """Yield every package needed in this node's build environment, once.

    The result consists of the node's direct build requirements plus,
    for each of those, all of its direct and indirect **install**
    requirements.

    In packaging terms this matches ``[build-system].requires`` together
    with every ``Requires-Dist`` reachable from those entries.
    """
    seen: set[str] = set()
    for child_edge in self.children:
        # Skip edges whose target was already reported and edges that
        # are not build requirements. The membership test comes first so
        # already handled targets are not inspected any further.
        if child_edge.key in seen or not child_edge.req_type.is_build_requirement:
            continue
        seen.add(child_edge.key)
        build_dep = child_edge.destination_node
        # A new direct ``[build-system].requires`` entry ...
        yield build_dep
        # ... followed, depth first, by everything required to install
        # it. ``seen`` is shared with the traversal so that a package is
        # reported at most once for the whole iteration.
        for dep_edge in self._traverse_install_requirements(
            build_dep.children, seen
        ):
            yield dep_edge.destination_node

def iter_install_requirements(self) -> typing.Iterable[DependencyNode]:
    """Yield each direct or indirect install requirement exactly once.

    Nodes are produced in the depth-first order in which
    ``_traverse_install_requirements`` reports their edges.
    """
    seen: set[str] = set()
    install_edges = self._traverse_install_requirements(self.children, seen)
    yield from (edge.destination_node for edge in install_edges)

def _traverse_install_requirements(
    self,
    start_edges: list[DependencyEdge],
    visited: set[str],
) -> typing.Iterable[DependencyEdge]:
    """Yield install-requirement edges reachable from ``start_edges``.

    Edges are visited depth first in pre-order: an edge is yielded before
    the install requirements of its destination node. Edges that are not
    install requirements are skipped and not descended into.

    ``start_edges`` are the edges to start from. ``visited`` is a set of
    keys that have already been handled; it is updated in place, so
    callers can share it across several traversals to get each
    destination only once.

    The walk uses an explicit stack of iterators instead of recursion so
    that long install-requirement chains cannot exhaust the interpreter's
    recursion limit.
    """
    # Each stack entry is a partially consumed iterator over one node's
    # edges; the top of the stack is the level currently being explored.
    pending = [iter(start_edges)]
    while pending:
        for edge in pending[-1]:
            # NOTE(review): membership is tested with ``edge.key`` while
            # ``edge.destination_node.key`` is recorded below; presumably
            # both are the same value -- confirm against DependencyEdge.
            if edge.key in visited:
                continue
            if not edge.req_type.is_install_requirement:
                continue
            visited.add(edge.destination_node.key)
            yield edge
            # Descend into the destination's edges before continuing
            # with the remaining siblings (depth first).
            pending.append(iter(edge.destination_node.children))
            break
        else:
            # Current level exhausted, resume the parent level.
            pending.pop()


@dataclasses.dataclass(frozen=True, order=True, slots=True)
class DependencyEdge:
Expand Down
113 changes: 113 additions & 0 deletions tests/test_dependency_graph.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import dataclasses
import graphlib

import pytest
from packaging.requirements import Requirement
from packaging.utils import canonicalize_name
from packaging.version import Version

from fromager.dependency_graph import DependencyNode
from fromager.requirements_file import RequirementType


def mknode(name: str, version: str = "1.0", **kwargs) -> DependencyNode:
Expand Down Expand Up @@ -59,3 +62,113 @@ def test_dependencynode_dataclass():
assert root.canonicalized_name == ""
assert root.version == Version("0.0")
assert root.key == ""


def test_iter_requirements() -> None:
    """Check recursive build/install requirement iteration on a small graph."""
    a = mknode("a")
    # install requirement of a
    b = mknode("b")
    # build requirement of a
    c = mknode("c")
    # build requirement of c
    d = mknode("d")
    # install requirement of b and c
    e = mknode("e")
    # build requirement of a and c
    f = mknode("f")

    # Every edge carries a requirement that names its own child package.
    a.add_child(b, Requirement(b.canonicalized_name), RequirementType.INSTALL)
    a.add_child(c, Requirement(c.canonicalized_name), RequirementType.BUILD_BACKEND)
    a.add_child(c, Requirement(c.canonicalized_name), RequirementType.BUILD_SYSTEM)
    a.add_child(f, Requirement(f.canonicalized_name), RequirementType.BUILD_SYSTEM)
    b.add_child(e, Requirement(e.canonicalized_name), RequirementType.INSTALL)
    c.add_child(d, Requirement(d.canonicalized_name), RequirementType.BUILD_SYSTEM)
    c.add_child(e, Requirement(e.canonicalized_name), RequirementType.INSTALL)
    c.add_child(f, Requirement(f.canonicalized_name), RequirementType.BUILD_BACKEND)

    assert sorted(a.iter_install_requirements()) == [b, e]
    # D is only a build requirement of C, so it is not expected in A's
    # build requirements: A needs C, C's install requirement E, and F.
    assert sorted(a.iter_build_requirements()) == [c, e, f]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would expect to see D in this list because D is a build requirement of C. We can't build A until we can install C, and we can't install C until we can build it.

Like I think I said on another review, we have the same issue here that we had with determining whether something was a build dependency during bootstrap. There, the package "inherits" the "build dependency" type from the why chain. If anything earlier in the why chain is a build dependency, then so is the current package.

In the bootstrap process we build packages using in-order traversal. We build the build dependencies, then the package, then its install dependencies. That separates the dependency types and allows us to cope with potential circular dependencies in installation dependencies, since those are not forbidden.

I think we need to apply the same logic when building in parallel. That is going to potentially limit the amount we do in parallel, but I think that's required to achieve a correct build, without deadlocking on cyclic dependencies or trying to use something that isn't available.

I think that means keeping track of 2 things in build-parallel for each package: Is the package ready to be built, and is the package ready to be used to build other packages. That will complicate the logic for deciding what work to do, and while a literal topological traversal will give us the right order for sequential build I don't think it gives us parallel builds on its own.

The current implementation in the main branch doesn't attempt to traverse the graph explicitly. During each pass it iterates over the set of all unbuilt nodes and asks if the build dependencies for the node have been built. If so, it adds the current node to the set to be built. I think we need to extend that logic so that for each build dependency we check not only that it has been built, but that its installation dependencies are built. Only at that point can we say that the build dependencies are ready to be used to build the current node.

We could build that using the topological sorter by having the sorter tell us when a package is ready to be built, but then keeping those built nodes in a separate list and waiting to mark them as done in the graph when the installation dependencies are also built.

Alternatively, we could fix the loop we have already to keep track of the same information.

I'm not sure which will be easier to follow.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've done some refactoring of the existing logic and added install dependency checking in #794

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> I would expect to see D in this list because D is a build requirement of C. We can't build A until we can install C, and we can't install C until we can build it.

No, A does not need D to build. A only needs C, E, and F to build. The indirect dependency chain A -> C -> D is handled by the topological graph: C does not become available until D is built. Because C depends on D and A depends on C, the topological order converges on the solution D, then C, then A.

# Per-node expectations for B and C: B has a single install requirement
# and no build requirements; C installs with E and builds with D and F.
assert sorted(b.iter_install_requirements()) == [e]
assert sorted(b.iter_build_requirements()) == []
assert sorted(c.iter_install_requirements()) == [e]
assert sorted(c.iter_build_requirements()) == [d, f]

# Each inner list is one batch of packages that get_build_graph reports
# as ready at the same time.
build_graph = get_build_graph(a, b, c, d, e, f)
assert build_graph == [
# B, D, E and F have no build requirements, so they are all ready
# first; B only has an install requirement on E, which does not
# delay building B.
["b", "d", "e", "f"],
# C needs D and F to build.
["c"],
# A needs C, E and F to build.
["a"],
]


def get_build_graph(*nodes: DependencyNode) -> list[list[str]]:
    """Group the given nodes into batches that can be built together.

    Each node is registered with a topological sorter under its
    canonicalized name, with the names of its recursive build
    requirements as predecessors. The return value lists, in order, the
    alphabetically sorted names that become ready in each round.
    """
    sorter: graphlib.TopologicalSorter[str] = graphlib.TopologicalSorter()
    for node in nodes:
        predecessors = [
            dep.canonicalized_name for dep in node.iter_build_requirements()
        ]
        sorter.add(node.canonicalized_name, *predecessors)
    sorter.prepare()

    batches: list[list[str]] = []
    while sorter.is_active():
        batch = sorter.get_ready()
        batches.append(sorted(batch))
        # Mark the whole batch as finished so its dependents become ready.
        sorter.done(*batch)
    return batches


def test_pr759_discussion() -> None:
    """Reproduce the dependency graphs discussed in PR 759.

    Starts with a four-node graph, checks build requirements and the
    resulting build order, then extends the graph with two more nodes
    and checks build and install requirements again.
    """
    a = mknode("a")
    b = mknode("b")
    c = mknode("c")
    d = mknode("d")
    # A needs B to build.
    a.add_child(b, Requirement(b.canonicalized_name), RequirementType.BUILD_BACKEND)
    # B needs C to build.
    b.add_child(c, Requirement(c.canonicalized_name), RequirementType.BUILD_BACKEND)
    # B needs D to install.
    b.add_child(d, Requirement(d.canonicalized_name), RequirementType.INSTALL)

    # D is in A's build requirements because it is an install requirement
    # of A's build requirement B.
    assert sorted(a.iter_build_requirements()) == [b, d]
    assert sorted(b.iter_build_requirements()) == [c]
    assert sorted(c.iter_build_requirements()) == []
    assert sorted(d.iter_build_requirements()) == []

    build_graph = get_build_graph(a, b, c, d)
    assert build_graph == [["c", "d"], ["b"], ["a"]]

    # add more nodes
    e = mknode("e")
    f = mknode("f")
    # D needs E to install.
    d.add_child(e, Requirement(e.canonicalized_name), RequirementType.INSTALL)
    # E needs F to build.
    e.add_child(f, Requirement(f.canonicalized_name), RequirementType.BUILD_BACKEND)

    # build requirements
    assert sorted(a.iter_build_requirements()) == [b, d, e]
    assert sorted(b.iter_build_requirements()) == [c]
    assert sorted(c.iter_build_requirements()) == []
    assert sorted(d.iter_build_requirements()) == []
    assert sorted(e.iter_build_requirements()) == [f]

    build_graph = get_build_graph(a, b, c, d, e, f)
    assert build_graph == [
        # D, C, F don't have build requirements
        ["c", "d", "f"],
        # B needs C, E needs F
        ["b", "e"],
        # A needs B, D, E
        ["a"],
    ]

    # install requirements
    assert sorted(a.iter_install_requirements()) == []
    # E is an indirect install dependency
    assert sorted(b.iter_install_requirements()) == [d, e]
    assert sorted(c.iter_install_requirements()) == []
    assert sorted(d.iter_install_requirements()) == [e]
    assert sorted(e.iter_install_requirements()) == []
    assert sorted(f.iter_install_requirements()) == []
Loading