Skip to content

Commit 41f91e9

Browse files
authored
refactor: avoid conflict merging when shared libraries are present (#3448)
Today, when a shared library is present, an extra VenvSymlinkEntry is generated so that it is linked directly. Unfortunately, this will always have a path overlap conflict with the rest of the venv symlinks, which triggers the conflict merge logic later in py_executable. That logic is expensive, as it must flatten all the files and then link each file individually (essentially doubling the number of files materialized). For large packages like torch (10k+ files), this can dramatically increase overhead. To fix, generate VenvSymlinkEntries that don't overlap. The basic logic for how this works is to identify paths that *must* be directly linked, marking all their parent directories as not being able to be directly linked, and then grouping what remains into the highest directly-linkable path. Along the way, drop the logic that only considers code files and special cases `__init__.py` files and implicit packages. This is simplify the code and more correctly map the extracted wheel into the venv.
1 parent 89c8f20 commit 41f91e9

File tree

2 files changed

+289
-116
lines changed

2 files changed

+289
-116
lines changed

python/private/venv_runfiles.bzl

Lines changed: 106 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
load("@bazel_skylib//lib:paths.bzl", "paths")
44
load(
55
":common.bzl",
6-
"PYTHON_FILE_EXTENSIONS",
76
"is_file",
87
"relative_path",
98
"runfiles_root_path",
@@ -58,7 +57,6 @@ def create_venv_app_files(ctx, deps, venv_dir_map):
5857
if is_file(link_to):
5958
symlink_from = "{}/{}".format(ctx.label.package, bin_venv_path)
6059
runfiles_symlinks[symlink_from] = link_to
61-
6260
else:
6361
venv_link = ctx.actions.declare_symlink(bin_venv_path)
6462
venv_link_rf_path = runfiles_root_path(ctx, venv_link.short_path)
@@ -77,14 +75,16 @@ def create_venv_app_files(ctx, deps, venv_dir_map):
7775
)
7876

7977
# Visible for testing
80-
def build_link_map(ctx, entries):
78+
def build_link_map(ctx, entries, return_conflicts = False):
8179
"""Compute the mapping of venv paths to their backing objects.
8280
83-
8481
Args:
8582
ctx: {type}`ctx` current ctx.
8683
entries: {type}`list[VenvSymlinkEntry]` the entries that describe the
8784
venv-relative
85+
return_conflicts: {type}`bool`. Only present for testing. If True,
86+
also return a list of the groups that had overlapping paths and had
87+
to be resolved and merged.
8888
8989
Returns:
9090
{type}`dict[str, dict[str, str|File]]` Mappings of venv paths to their
@@ -114,6 +114,7 @@ def build_link_map(ctx, entries):
114114

115115
# final paths to keep, grouped by kind
116116
keep_link_map = {} # dict[str kind, dict[path, str|File]]
117+
conflicts = [] if return_conflicts else None
117118
for kind, entries in entries_by_kind.items():
118119
# dict[str kind-relative path, str|File link_to]
119120
keep_kind_link_map = {}
@@ -129,12 +130,17 @@ def build_link_map(ctx, entries):
129130
else:
130131
keep_kind_link_map[entry.venv_path] = entry.link_to_path
131132
else:
133+
if return_conflicts:
134+
conflicts.append(group)
135+
132136
# Merge a group of overlapping prefixes
133137
_merge_venv_path_group(ctx, group, keep_kind_link_map)
134138

135139
keep_link_map[kind] = keep_kind_link_map
136-
137-
return keep_link_map
140+
if return_conflicts:
141+
return keep_link_map, conflicts
142+
else:
143+
return keep_link_map
138144

139145
def _group_venv_path_entries(entries):
140146
"""Group entries by VenvSymlinkEntry.venv_path overlap.
@@ -235,109 +241,115 @@ def get_venv_symlinks(ctx, files, package, version_str, site_packages_root):
235241
# Append slash to prevent incorrect prefix-string matches
236242
site_packages_root += "/"
237243

238-
# We have to build a list of (runfiles path, site-packages path) pairs of the files to
239-
# create in the consuming binary's venv site-packages directory. To minimize the number of
240-
# files to create, we just return the paths to the directories containing the code of
241-
# interest.
242-
#
243-
# However, namespace packages complicate matters: multiple distributions install in the
244-
# same directory in site-packages. This works out because they don't overlap in their
245-
# files. Typically, they install to different directories within the namespace package
246-
# directory. We also need to ensure that we can handle a case where the main package (e.g.
247-
# airflow) has directories only containing data files and then namespace packages coming
248-
# along and being next to it.
249-
#
250-
# Lastly we have to assume python modules just being `.py` files (e.g. typing-extensions)
251-
# is just a single Python file.
244+
all_files = sorted(files, key = lambda f: f.short_path)
252245

253-
dir_symlinks = {} # dirname -> runfile path
254-
venv_symlinks = []
246+
# venv paths that cannot be directly linked. Dict acting as set.
247+
cannot_be_linked_directly = {}
255248

256-
# Sort so order is top-down
257-
all_files = sorted(files, key = lambda f: f.short_path)
249+
# dict[str path, VenvSymlinkEntry]
250+
# Where path is the venv path (i.e. relative to site_packages_prefix)
251+
venv_symlinks = {}
258252

253+
# List of (File, str venv_path) tuples
254+
files_left_to_link = []
255+
256+
# We want to minimize the number of files symlinked. Ideally, only the
257+
# top-level directories are symlinked. Unfortunately, shared libraries
258+
# complicate matters: if a shared library's directory is linked, then the
259+
# dynamic linker computes the wrong search path.
260+
#
261+
# To fix, we have to directly link shared libraries. This then means that
262+
# all the parent directories of the shared library can't be linked
263+
# directly.
259264
for src in all_files:
260-
path = _repo_relative_short_path(src.short_path)
261-
if not path.startswith(site_packages_root):
265+
rf_root_path = runfiles_root_path(ctx, src.short_path)
266+
_, _, repo_rel_path = rf_root_path.partition("/")
267+
head, found_sp_root, venv_path = repo_rel_path.partition(site_packages_root)
268+
if head or not found_sp_root:
269+
# If head is set, then the path didn't start with site_packages_root
270+
# if found_sp_root is empty, then it means it wasn't found at all.
262271
continue
263-
path = path.removeprefix(site_packages_root)
264-
dir_name, _, filename = path.rpartition("/")
265-
runfiles_dir_name, _, _ = runfiles_root_path(ctx, src.short_path).partition("/")
266272

273+
filename = paths.basename(venv_path)
267274
if _is_linker_loaded_library(filename):
268-
entry = VenvSymlinkEntry(
275+
venv_symlinks[venv_path] = VenvSymlinkEntry(
269276
kind = VenvSymlinkKind.LIB,
270-
link_to_path = paths.join(runfiles_dir_name, site_packages_root, filename),
277+
link_to_path = rf_root_path,
271278
link_to_file = src,
272279
package = package,
273280
version = version_str,
274-
venv_path = path,
275281
files = depset([src]),
282+
venv_path = venv_path,
276283
)
277-
venv_symlinks.append(entry)
278-
continue
279-
280-
if dir_name in dir_symlinks:
281-
# we already have this dir, this allows us to short-circuit since most of the
282-
# ctx.files.data might share the same directories as ctx.files.srcs
284+
parent = paths.dirname(venv_path)
285+
for _ in range(len(venv_path) + 1): # Iterate enough times to traverse up
286+
if not parent:
287+
break
288+
if cannot_be_linked_directly.get(parent, False):
289+
# Already seen
290+
break
291+
cannot_be_linked_directly[parent] = True
292+
parent = paths.dirname(parent)
293+
else:
294+
files_left_to_link.append((src, venv_path))
295+
296+
# At this point, venv_symlinks has entries for the shared libraries
297+
# and cannot_be_linked_directly has the directories that cannot be
298+
# directly linked. Next, we loop over the remaining files and group
299+
# them into the highest level directory that can be linked.
300+
301+
# dict[str venv_path, list[File]]
302+
optimized_groups = {}
303+
304+
for src, venv_path in files_left_to_link:
305+
parent = paths.dirname(venv_path)
306+
if not parent:
307+
# File in root, must be linked directly
308+
optimized_groups.setdefault(venv_path, [])
309+
optimized_groups[venv_path].append(src)
283310
continue
284311

285-
if dir_name:
286-
# This can be either:
287-
# * a directory with libs (e.g. numpy.libs, created by auditwheel)
288-
# * a directory with `__init__.py` file that potentially also needs to be
289-
# symlinked.
290-
# * `.dist-info` directory
291-
#
292-
# This could be also regular files, that just need to be symlinked, so we will
293-
# add the directory here.
294-
dir_symlinks[dir_name] = runfiles_dir_name
295-
elif src.extension in PYTHON_FILE_EXTENSIONS:
296-
# This would be files that do not have directories and we just need to add
297-
# direct symlinks to them as is, we only allow Python files in here
298-
entry = VenvSymlinkEntry(
299-
kind = VenvSymlinkKind.LIB,
300-
link_to_path = paths.join(runfiles_dir_name, site_packages_root, filename),
301-
link_to_file = src,
302-
package = package,
303-
version = version_str,
304-
venv_path = path,
305-
files = depset([src]),
306-
)
307-
venv_symlinks.append(entry)
308-
309-
# Sort so that we encounter `foo` before `foo/bar`. This ensures we
310-
# see the top-most explicit package first.
311-
dirnames = sorted(dir_symlinks.keys())
312-
first_level_explicit_packages = []
313-
for d in dirnames:
314-
is_sub_package = False
315-
for existing in first_level_explicit_packages:
316-
# Suffix with / to prevent foo matching foobar
317-
if d.startswith(existing + "/"):
318-
is_sub_package = True
319-
break
320-
if not is_sub_package:
321-
first_level_explicit_packages.append(d)
322-
323-
for dirname in first_level_explicit_packages:
324-
prefix = dir_symlinks[dirname]
325-
link_to_path = paths.join(prefix, site_packages_root, dirname)
326-
entry = VenvSymlinkEntry(
312+
if parent in cannot_be_linked_directly:
313+
# File in a directory that cannot be directly linked,
314+
# so link the file directly
315+
optimized_groups.setdefault(venv_path, [])
316+
optimized_groups[venv_path].append(src)
317+
else:
318+
# This path can be grouped. Find the highest-level directory to link.
319+
venv_path = parent
320+
next_parent = paths.dirname(parent)
321+
for _ in range(len(venv_path) + 1): # Iterate enough times
322+
if next_parent:
323+
if next_parent not in cannot_be_linked_directly:
324+
venv_path = next_parent
325+
next_parent = paths.dirname(next_parent)
326+
else:
327+
break
328+
else:
329+
break
330+
331+
optimized_groups.setdefault(venv_path, [])
332+
optimized_groups[venv_path].append(src)
333+
334+
# Finally, for each group, we create the VenvSymlinkEntry objects
335+
for venv_path, files in optimized_groups.items():
336+
link_to_path = (
337+
_get_label_runfiles_repo(ctx, files[0].owner) +
338+
"/" +
339+
site_packages_root +
340+
venv_path
341+
)
342+
venv_symlinks[venv_path] = VenvSymlinkEntry(
327343
kind = VenvSymlinkKind.LIB,
328344
link_to_path = link_to_path,
345+
link_to_file = None,
329346
package = package,
330347
version = version_str,
331-
venv_path = dirname,
332-
files = depset([
333-
f
334-
for f in all_files
335-
if runfiles_root_path(ctx, f.short_path).startswith(link_to_path + "/")
336-
]),
348+
venv_path = venv_path,
349+
files = depset(files),
337350
)
338-
venv_symlinks.append(entry)
339351

340-
return venv_symlinks
352+
return venv_symlinks.values()
341353

342354
def _is_linker_loaded_library(filename):
343355
"""Tells if a filename is one that `dlopen()` or the runtime linker handles.
@@ -357,9 +369,10 @@ def _is_linker_loaded_library(filename):
357369
return True
358370
return False
359371

360-
def _repo_relative_short_path(short_path):
361-
# Convert `../+pypi+foo/some/file.py` to `some/file.py`
362-
if short_path.startswith("../"):
363-
return short_path[3:].partition("/")[2]
372+
def _get_label_runfiles_repo(ctx, label):
373+
repo = label.repo_name
374+
if repo:
375+
return repo
364376
else:
365-
return short_path
377+
# For files, empty repo means the main repo
378+
return ctx.workspace_name

0 commit comments

Comments
 (0)