Skip to content

Commit d58fa3c

Browse files
committed
feat(run-task): implement efficient git clones
1 parent 310123d commit d58fa3c

File tree

3 files changed

+391
-28
lines changed

3 files changed

+391
-28
lines changed

src/taskgraph/run-task/run-task

Lines changed: 75 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,13 @@ import os
2828
import platform
2929
import re
3030
import shutil
31-
import signal
3231
import socket
3332
import stat
3433
import subprocess
3534
import time
3635
import urllib.error
3736
import urllib.request
3837
from pathlib import Path
39-
from threading import Thread
4038
from typing import Optional
4139

4240
SECRET_BASEURL_TPL = "{}/secrets/v1/secret/{{}}".format(os.environ.get("TASKCLUSTER_PROXY_URL", "http://taskcluster").rstrip('/'))
@@ -600,6 +598,8 @@ def git_checkout(
600598
commit: Optional[str],
601599
ssh_key_file: Optional[Path],
602600
ssh_known_hosts_file: Optional[Path],
601+
efficient_clone: bool = False,
602+
sparse_dirs: Optional[str] = None,
603603
):
604604
env = {
605605
# abort if transfer speed is lower than 1kB/s for 1 minute
@@ -636,22 +636,43 @@ def git_checkout(
636636
args = [
637637
"git",
638638
"clone",
639+
]
640+
641+
if efficient_clone:
642+
# Use blobless clone for faster initial clone
643+
# This fetches commit and tree objects but not file contents
644+
args.extend(["--filter=blob:none"])
645+
# Use shallow clone with depth 1 for minimal history
646+
args.extend(["--depth=1"])
647+
# Skip checkout initially, we'll do sparse checkout later
648+
args.extend(["--no-checkout"])
649+
elif sparse_dirs:
650+
# For sparse checkout without efficient clone, still skip initial checkout
651+
# so we can set up sparse checkout before checking out files
652+
args.extend(["--no-checkout"])
653+
654+
args.extend([
639655
base_repo if base_repo else head_repo,
640656
destination_path,
641-
]
657+
])
642658

643659
retry_required_command(b"vcs", args, extra_env=env)
644660

645661
if base_ref:
646-
args = ["git", "fetch", "origin", base_ref]
662+
args = ["git", "fetch"]
663+
if efficient_clone:
664+
# For shallow clones, we need to deepen to fetch more history
665+
args.extend(["--depth=100"])
666+
args.extend(["origin", base_ref])
647667

648668
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
649669

650670
# Create local branch so that taskgraph is able to compute differences
651671
# between the head branch and the base one, if needed
652-
args = ["git", "checkout", base_ref]
653-
654-
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
672+
if not efficient_clone and not sparse_dirs:
673+
# Only checkout if we didn't use --no-checkout initially
674+
args = ["git", "checkout", base_ref]
675+
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
655676

656677
# When commits are force-pushed (like on a testing branch), base_rev doesn't
657678
# exist on base_ref. Fetching it allows taskgraph to compute differences
@@ -660,7 +681,11 @@ def git_checkout(
660681
# Unlike base_ref just above, there is no need to checkout the revision:
661682
# it's immediately available after the fetch.
662683
if base_rev and base_rev != NULL_REVISION:
663-
args = ["git", "fetch", "origin", base_rev]
684+
args = ["git", "fetch"]
685+
if efficient_clone:
686+
# For shallow clones, we need to deepen to fetch more history
687+
args.extend(["--depth=100"])
688+
args.extend(["origin", base_rev])
664689

665690
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
666691

@@ -671,28 +696,44 @@ def git_checkout(
671696
# in not having a tag, or worse: having an outdated version of one.
672697
# `--force` is needed to be able to update an existing tag.
673698
if ref and base_repo == head_repo:
674-
args = [
675-
"git",
676-
"fetch",
677-
"--tags",
678-
"--force",
679-
base_repo,
680-
ref,
681-
]
699+
args = ["git", "fetch"]
700+
if efficient_clone:
701+
# For shallow clones, we need to deepen to fetch more history
702+
args.extend(["--depth=100"])
703+
args.extend(["--tags", "--force", base_repo, ref])
682704

683705
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
684706

685707
# If a ref isn't provided, we fetch all refs from head_repo, which may be slow
686-
args = [
687-
"git",
688-
"fetch",
689-
"--no-tags",
690-
head_repo,
691-
ref if ref else "+refs/heads/*:refs/remotes/work/*",
692-
]
708+
args = ["git", "fetch"]
709+
if efficient_clone:
710+
# For shallow clones, we need to deepen to fetch more history
711+
args.extend(["--depth=100"])
712+
# With blobless clones, we only fetch the blobs we need
713+
args.extend(["--filter=blob:none"])
714+
args.extend(["--no-tags", head_repo, ref if ref else "+refs/heads/*:refs/remotes/work/*"])
693715

694716
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
695717

718+
if sparse_dirs:
719+
# When sparse directories/files are specified, set up sparse checkout
720+
# The sparse_dirs should be a colon-separated list of directories or files
721+
#
722+
# Note: Git's sparse-checkout behavior in cone mode (default since Git 2.37):
723+
# - Root-level files: All files at the repository root are always checked out
724+
# - Files in subdirectories: Entire parent directory is included
725+
# - Directories: All contents included
726+
727+
# Enable sparse checkout (cone mode is default since Git 2.37)
728+
args = ["git", "sparse-checkout", "init"]
729+
run_required_command(b"vcs", args, cwd=destination_path)
730+
731+
# Set the sparse entries
732+
entries = sparse_dirs.split(":")
733+
args = ["git", "sparse-checkout", "set"] + entries
734+
run_required_command(b"vcs", args, cwd=destination_path)
735+
736+
# Now do the actual checkout
696737
args = [
697738
"git",
698739
"checkout",
@@ -879,11 +920,17 @@ def add_vcs_arguments(parser, project, name):
879920
"--%s-sparse-profile" % project,
880921
help="Path to sparse profile for %s checkout" % name,
881922
)
923+
parser.add_argument(
924+
"--%s-efficient-clone" % project,
925+
action="store_true",
926+
help="Use efficient cloning strategies (blobless, shallow, no-checkout) for %s" % name,
927+
)
882928

883929

884930
def collect_vcs_options(args, project, name):
885931
checkout = getattr(args, "%s_checkout" % project)
886932
sparse_profile = getattr(args, "%s_sparse_profile" % project)
933+
efficient_clone = getattr(args, "%s_efficient_clone" % project)
887934

888935
env_prefix = project.upper()
889936

@@ -896,6 +943,7 @@ def collect_vcs_options(args, project, name):
896943
ref = os.environ.get("%s_HEAD_REF" % env_prefix)
897944
pip_requirements = os.environ.get("%s_PIP_REQUIREMENTS" % env_prefix)
898945
private_key_secret = os.environ.get("%s_SSH_SECRET_NAME" % env_prefix)
946+
sparse_dirs = os.environ.get("%s_SPARSE_DIRS" % env_prefix)
899947

900948
store_path = os.environ.get("HG_STORE_PATH")
901949

@@ -930,6 +978,8 @@ def collect_vcs_options(args, project, name):
930978
"repo-type": repo_type,
931979
"ssh-secret-name": private_key_secret,
932980
"pip-requirements": pip_requirements,
981+
"efficient-clone": efficient_clone,
982+
"sparse-dirs": sparse_dirs,
933983
}
934984

935985

@@ -978,6 +1028,8 @@ def vcs_checkout_from_args(options):
9781028
revision,
9791029
ssh_key_file,
9801030
ssh_known_hosts_file,
1031+
options.get("efficient-clone", False),
1032+
options.get("sparse-dirs"),
9811033
)
9821034
elif options["repo-type"] == "hg":
9831035
if not revision and not ref:

src/taskgraph/util/vcs.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,10 @@ def base_rev(self):
388388
def branch(self):
389389
return self.run("branch", "--show-current").strip() or None
390390

391+
@property
def is_shallow(self):
    """Whether git reports this repository as shallow.

    True only when ``git rev-parse --is-shallow-repository`` prints
    ``true`` (surrounding whitespace ignored); any other output is False.
    """
    answer = self.run("rev-parse", "--is-shallow-repository")
    return answer.strip() == "true"
394+
391395
@property
392396
def all_remote_names(self):
393397
remotes = self.run("remote").splitlines()
@@ -546,10 +550,25 @@ def update(self, ref):
546550
self.run("checkout", ref)
547551

548552
def find_latest_common_revision(self, base_ref_or_rev, head_rev):
    """Return the merge base of ``base_ref_or_rev`` and ``head_rev``.

    In a shallow repository the common ancestor may lie outside the
    history fetched so far. In that case history is deepened from
    ``self.remote_name`` in growing steps (10, 100, 1000, ... commits)
    until ``merge-base`` succeeds or the repository is no longer shallow.

    Args:
        base_ref_or_rev: Base ref or revision handed to ``merge-base``.
        head_rev: Head revision handed to ``merge-base``.

    Returns:
        The stripped ``merge-base`` output, or ``self.NULL_REVISION`` when
        no common revision can be found.
    """

    def run_merge_base():
        # A failing `merge-base` surfaces as CalledProcessError; report it
        # as "no result" so the caller decides whether to retry.
        try:
            return self.run("merge-base", base_ref_or_rev, head_rev).strip()
        except subprocess.CalledProcessError:
            return None

    if not self.is_shallow:
        return run_merge_base() or self.NULL_REVISION

    deepen = 10
    rev = run_merge_base()
    # Stop once the repository is no longer shallow: there is no further
    # history to fetch, so revisions without a common ancestor would
    # otherwise keep this loop running forever.
    while not rev and self.is_shallow:
        self.run("fetch", "--deepen", str(deepen), self.remote_name)
        rev = run_merge_base()
        deepen *= 10

    # Same fallback as the non-shallow path when nothing was found.
    return rev or self.NULL_REVISION
571+
553572

554573
def does_revision_exist_locally(self, revision):
555574
try:

0 commit comments

Comments
 (0)