Skip to content

Commit 677520a

Browse files
committed
Bazel: improved lazy lfs files
This reintroduces lazy lfs file rules that were removed in #16117, now improved. The new rules will make the actual file download go through bazel's download manager, which includes: * caching into the repository cache * sane limiting of concurrent downloads * retries The bulk of the work is done by `git_lfs_probe.py`, which will use the LFS protocol (with authentication via SSH) to output short lived download URLs that can be consumed by `repository_ctx.download`.
1 parent a8f2cbc commit 677520a

File tree

4 files changed

+196
-0
lines changed

4 files changed

+196
-0
lines changed

.lfsconfig

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[lfs]
2+
# codeql is publicly forked by many users, and we don't want any LFS file polluting their working
3+
# copies. We therefore exclude everything by default.
4+
# For files required by bazel builds, use rules in `misc/bazel/lfs.bzl` to download them on demand.
5+
fetchinclude = /nothing

misc/bazel/internal/BUILD.bazel

Whitespace-only changes.

misc/bazel/internal/git_lfs_probe.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
#!/usr/bin/env python3

"""
Probe lfs files.
For each source file provided as input, this will print:
* "local", if the source file is not an LFS pointer
* the sha256 hash, a space character and a transient download link obtained via the LFS protocol otherwise
"""

import sys
import pathlib
import subprocess
import os
import shutil
import json
import urllib.request
from urllib.parse import urlparse
import re

# All files to probe, resolved to absolute paths.
sources = [pathlib.Path(arg).resolve() for arg in sys.argv[1:]]
# Deepest directory containing all sources, used only as a cwd to locate the repository.
source_dir = pathlib.Path(os.path.commonpath(src.parent for src in sources))
# Replace with the repository root so later `git`/`git lfs` invocations see the right repo.
source_dir = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], cwd=source_dir, text=True).strip()
def get_endpoint():
    """Determine the LFS endpoint URL and the HTTP headers needed to talk to it.

    Returns a tuple (endpoint, headers). Credentials are gathered, in order, from:
    the SSH `git-lfs-authenticate` handshake (if `git lfs env` reports an SSH
    remote), any `http.<url>.extraheader` git config (as persisted by
    actions/checkout), and finally a GITHUB_TOKEN environment variable.
    """
    lfs_env = subprocess.check_output(["git", "lfs", "env"], text=True, cwd=source_dir)
    endpoint = ssh_server = ssh_path = None
    endpoint_re = re.compile(r'Endpoint(?: \(\S+\))?=(\S+)')
    ssh_re = re.compile(r'\s*SSH=(\S*):(.*)')
    for line in lfs_env.splitlines():
        m = endpoint_re.match(line)
        if m:
            if endpoint is None:
                # Keep only the first Endpoint= line; a second one means we are
                # past the section we care about.
                endpoint = m[1]
            else:
                break
        m = ssh_re.match(line)
        if m:
            ssh_server, ssh_path = m.groups()
            break
    assert endpoint, f"no Endpoint= line found in git lfs env:\n{lfs_env}"
    headers = {
        "Content-Type": "application/vnd.git-lfs+json",
        "Accept": "application/vnd.git-lfs+json",
    }
    if ssh_server:
        # NOTE(review): GIT_SSH_COMMAND may contain arguments, in which case
        # shutil.which on the whole string would fail — assumes a bare command here.
        ssh_command = shutil.which(os.environ.get("GIT_SSH", os.environ.get("GIT_SSH_COMMAND", "ssh")))
        assert ssh_command, "no ssh command found"
        # `git-lfs-authenticate` prints a JSON document with a short-lived href
        # and headers to use against the LFS API.
        with subprocess.Popen([ssh_command, ssh_server, "git-lfs-authenticate", ssh_path, "download"],
                              stdout=subprocess.PIPE) as ssh:
            resp = json.load(ssh.stdout)
        assert ssh.wait() == 0, "ssh command failed"
        endpoint = resp.get("href", endpoint)
        for k, v in resp.get("header", {}).items():
            headers[k.capitalize()] = v
    url = urlparse(endpoint)
    # this is how actions/checkout persist credentials
    # see https://github.com/actions/checkout/blob/44c2b7a8a4ea60a981eaca3cf939b5f4305c123b/src/git-auth-helper.ts#L56-L63
    auth = subprocess.run(["git", "config", f"http.{url.scheme}://{url.netloc}/.extraheader"], text=True,
                          stdout=subprocess.PIPE, cwd=source_dir).stdout.strip()
    for l in auth.splitlines():
        k, _, v = l.partition(": ")
        headers[k.capitalize()] = v
    if "GITHUB_TOKEN" in os.environ:
        headers["Authorization"] = f"token {os.environ['GITHUB_TOKEN']}"
    return endpoint, headers
# LFS batch/basic-transfer protocol:
# https://github.com/git-lfs/git-lfs/blob/310d1b4a7d01e8d9d884447df4635c7a9c7642c2/docs/api/basic-transfers.md
def get_locations(objects):
    """Map each entry of `objects` to its output line.

    `None` entries (files that are not LFS pointers) become "local"; pointer
    entries become "<oid> <download url>" via one LFS batch API request.
    """
    endpoint_url, request_headers = get_endpoint()
    pointer_indexes = [idx for idx, obj in enumerate(objects) if obj]
    results = ["local"] * len(objects)
    payload = {
        "operation": "download",
        "transfers": ["basic"],
        "objects": [obj for obj in objects if obj],
        "hash_algo": "sha256",
    }
    batch_request = urllib.request.Request(
        f"{endpoint_url}/objects/batch",
        headers=request_headers,
        data=json.dumps(payload).encode("ascii"),
    )
    with urllib.request.urlopen(batch_request) as response:
        reply = json.load(response)
    # The server must answer one object per pointer we sent, in order.
    assert len(reply["objects"]) == len(pointer_indexes), reply
    for idx, obj_info in zip(pointer_indexes, reply["objects"]):
        results[idx] = f'{obj_info["oid"]} {obj_info["actions"]["download"]["href"]}'
    return results
def get_lfs_object(path):
    """Parse `path` as a git LFS pointer file.

    Returns {"oid": <sha256 hex digest>, "size": <int>} for a valid pointer,
    or None when the file does not start with the LFS pointer header (i.e. it
    is a regular, locally available file).

    Raises:
        Exception: if the file has the pointer header but lacks a valid
            `oid sha256:` or `size` field.
    """
    with open(path, 'rb') as fileobj:
        lfs_header = "version https://git-lfs.github.com/spec".encode()
        actual_header = fileobj.read(len(lfs_header))
        sha256 = size = None
        if lfs_header != actual_header:
            # Not a pointer: the real file contents are checked in directly.
            return None
        for line in fileobj:
            line = line.decode('ascii').strip()
            if line.startswith("oid sha256:"):
                sha256 = line[len("oid sha256:"):]
            elif line.startswith("size "):
                size = int(line[len("size "):])
        # Bug fix: this previously tested `line` (the last line read, almost
        # always truthy) instead of `size`, so a pointer missing its size
        # field was not rejected here.
        if not (sha256 and size):
            raise Exception("malformed pointer file")
        return {"oid": sha256, "size": size}
# Probe every source file and emit one output line per file, in input order.
objects = [get_lfs_object(src) for src in sources]
for resp in get_locations(objects):
    print(resp)

misc/bazel/lfs.bzl

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
def lfs_smudge(repository_ctx, srcs):
    """Materialize the given files into the repository.

    Files that are not LFS pointers are symlinked under their base name;
    pointer files are fetched through bazel's download manager using the
    short-lived URLs printed by git_lfs_probe.py.
    """
    for source in srcs:
        repository_ctx.watch(source)
    probe_script = Label("//misc/bazel/internal:git_lfs_probe.py")
    interpreter = repository_ctx.which("python3")
    if not interpreter:
        interpreter = repository_ctx.which("python")
    if not interpreter:
        fail("Neither python3 nor python executables found")
    probe = repository_ctx.execute([interpreter, probe_script] + srcs, quiet = True)
    if probe.return_code != 0:
        fail("git LFS probing failed while instantiating @%s:\n%s" % (repository_ctx.name, probe.stderr))
    for source, location in zip(srcs, probe.stdout.splitlines()):
        if location == "local":
            # Plain checked-in file: expose it directly.
            repository_ctx.symlink(source, source.basename)
        else:
            # A "<sha256> <url>" line from the probe script; downloading via
            # repository_ctx gets caching, concurrency limits and retries.
            digest, _, download_url = location.partition(" ")
            repository_ctx.download(download_url, source.basename, sha256 = digest)
def _download_and_extract_lfs(repository_ctx):
    """Implementation of `lfs_archive`: smudge one LFS archive and extract it."""
    attrs = repository_ctx.attr
    archive = repository_ctx.path(attrs.src)
    if attrs.build_file_content and attrs.build_file:
        fail("You should specify only one among build_file_content and build_file for rule @%s" % repository_ctx.name)
    lfs_smudge(repository_ctx, [archive])
    # Unpack in place, then drop the archive itself from the repository.
    repository_ctx.extract(archive.basename, stripPrefix = attrs.strip_prefix)
    repository_ctx.delete(archive.basename)
    if attrs.build_file_content:
        repository_ctx.file("BUILD.bazel", attrs.build_file_content)
    elif attrs.build_file:
        repository_ctx.symlink(attrs.build_file, "BUILD.bazel")
def _download_lfs(repository_ctx):
    """Implementation of `lfs_files`: smudge either `srcs` or the direct contents of `dir`."""
    attrs = repository_ctx.attr
    if int(bool(attrs.srcs)) + int(bool(attrs.dir)) != 1:
        fail("Exactly one between `srcs` and `dir` must be defined for @%s" % repository_ctx.name)
    if attrs.srcs:
        sources = [repository_ctx.path(label) for label in attrs.srcs]
    else:
        # `directory` avoids shadowing the Starlark builtin `dir`.
        directory = repository_ctx.path(attrs.dir)
        if not directory.is_dir:
            fail("`dir` not a directory in @%s" % repository_ctx.name)
        sources = [entry for entry in directory.readdir() if not entry.is_dir]
    lfs_smudge(repository_ctx, sources)

    # with bzlmod the name is qualified with `~` separators, and we want the base name here
    name = repository_ctx.name.split("~")[-1]
    repository_ctx.file("BUILD.bazel", """
exports_files({files})

filegroup(
    name = "{name}",
    srcs = {files},
    visibility = ["//visibility:public"],
)
""".format(name = name, files = repr([source.basename for source in sources])))
# Repository rule for a single LFS archive that is downloaded and extracted on
# demand; see _download_and_extract_lfs for the implementation.
lfs_archive = repository_rule(
    doc = "Export the contents from an on-demand LFS archive. The corresponding path should be added to be ignored " +
          "in `.lfsconfig`.",
    implementation = _download_and_extract_lfs,
    attrs = {
        "src": attr.label(mandatory = True, doc = "Local path to the LFS archive to extract."),
        "build_file_content": attr.string(doc = "The content for the BUILD file for this repository. " +
                                                "Either build_file or build_file_content can be specified, but not both."),
        "build_file": attr.label(doc = "The file to use as the BUILD file for this repository. " +
                                       "Either build_file or build_file_content can be specified, but not both."),
        "strip_prefix": attr.string(default = "", doc = "A directory prefix to strip from the extracted files. "),
    },
)
# Repository rule exporting individual LFS files (or a directory's direct
# contents) for on-demand download; see _download_lfs for the implementation.
lfs_files = repository_rule(
    doc = "Export LFS files for on-demand download. Exactly one between `srcs` and `dir` must be defined. The " +
          "corresponding paths should be added to be ignored in `.lfsconfig`.",
    implementation = _download_lfs,
    attrs = {
        "srcs": attr.label_list(doc = "Local paths to the LFS files to export."),
        "dir": attr.label(doc = "Local path to a directory containing LFS files to export. Only the direct contents " +
                                "of the directory are exported"),
    },
)

0 commit comments

Comments
 (0)