Skip to content

Commit 3a80bed

Browse files
committed
fix(vcs): better support for shallow clones in repo.get_changed_files()
`git log BASE..HEAD` says, show me commits reachable from HEAD, but not reachable from BASE. In a shallow clone where we only fetch BASE and HEAD (which is what run-task does), this means the command will only return `HEAD`. In otherwords, we're only returning files changed by the tip commit of the push and ignoring everything else. By switching to `git diff BASE HEAD`, we're instead comparing the snapshots of both revisions. Sometimes this is what we want, e.g for force pushes, it'll be the interdiff of files modified between the two pushes (though some developers might expect it to contain the files modified since the merge base). Sometimes it's not what we want, e.g for PRs, it'll be the files changed between the PR and the latest commit on `main`. Either way, this behaviour is at least somewhat more accurate than git log when we don't have full history. Likely we'll need to fetch the proper changed files using the Github API in the future, but for now this is better than nothing.
1 parent f4db457 commit 3a80bed

File tree

2 files changed

+104
-0
lines changed

2 files changed

+104
-0
lines changed

src/taskgraph/util/vcs.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,11 @@ def run(self, *args: str, **kwargs) -> str:
5757
def tool(self) -> str:
5858
"""Version control system being used, either 'hg' or 'git'."""
5959

60+
@property
61+
@abstractmethod
62+
def is_shallow(self) -> str:
63+
"""Whether this repo is a shallow clone."""
64+
6065
@property
6166
@abstractmethod
6267
def head_rev(self) -> str:
@@ -224,6 +229,10 @@ def __init__(self, *args, **kwargs):
224229
super().__init__(*args, **kwargs)
225230
self._env["HGPLAIN"] = "1"
226231

232+
@property
233+
def is_shallow(self):
234+
return False
235+
227236
@property
228237
def head_rev(self):
229238
return self.run("log", "-r", ".", "-T", "{node}").strip()
@@ -371,6 +380,10 @@ def default_remote_name(self) -> str:
371380

372381
_LS_REMOTE_PATTERN = re.compile(r"ref:\s+refs/heads/(?P<branch_name>\S+)\s+HEAD")
373382

383+
@property
384+
def is_shallow(self):
385+
return self.run("rev-parse", "--is-shallow-repository").strip() == "true"
386+
374387
@property
375388
def head_rev(self):
376389
return self.run("rev-parse", "--verify", "HEAD").strip()
@@ -492,6 +505,15 @@ def get_changed_files(self, diff_filter=None, mode=None, rev=None, base=None):
492505
cmd.append("--cached")
493506
elif mode == "all":
494507
cmd.append("HEAD")
508+
elif self.is_shallow:
509+
# In shallow clones, `git log` won't have the history necessary to
510+
# determine the files changed. Using `git diff` finds the
511+
# differences between the two trees which is slightly more
512+
# accurate. However, Github events often don't provide the true
513+
# base revision so shallow Github clones will still return
514+
# incorrect files changed in many cases, most notably pull
515+
# requests that need rebasing.
516+
cmd = ["diff", base, rev]
495517
else:
496518
revision_argument = f"{rev}~1..{rev}" if base is None else f"{base}..{rev}"
497519
cmd = ["log", "--format=format:", revision_argument]

test/test_util_vcs.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,3 +508,85 @@ def test_does_revision_exist_locally(repo):
508508
assert repo.does_revision_exist_locally(first_revision)
509509
assert repo.does_revision_exist_locally(last_revision)
510510
assert not repo.does_revision_exist_locally("deadbeef")
511+
512+
513+
def test_get_changed_files_shallow_clone(git_repo, tmp_path, default_git_branch):
514+
tmp_repo = Path(git_repo)
515+
516+
# Add initial files to the existing repo (which already has first_file from fixture)
517+
(tmp_repo / "common.txt").write_text("common content")
518+
(tmp_repo / "file_to_modify.txt").write_text("original content")
519+
(tmp_repo / "file_to_delete.txt").write_text("will be deleted")
520+
subprocess.check_call(["git", "add", "."], cwd=tmp_repo)
521+
subprocess.check_call(["git", "commit", "-m", "Add test files"], cwd=tmp_repo)
522+
523+
# Create feature branch and make changes
524+
subprocess.check_call(["git", "checkout", "-b", "feature"], cwd=tmp_repo)
525+
526+
# On feature branch: modify file, add new file, delete file
527+
(tmp_repo / "file_to_modify.txt").write_text("modified in feature")
528+
(tmp_repo / "feature_only.txt").write_text("feature specific file")
529+
(tmp_repo / "file_to_delete.txt").unlink()
530+
subprocess.check_call(["git", "rm", "file_to_delete.txt"], cwd=tmp_repo)
531+
subprocess.check_call(["git", "add", "."], cwd=tmp_repo)
532+
subprocess.check_call(["git", "commit", "-m", "Feature changes"], cwd=tmp_repo)
533+
534+
feature_commit = subprocess.check_output(
535+
["git", "rev-parse", "HEAD"], cwd=tmp_repo, text=True
536+
).strip()
537+
538+
# Switch back to main and make different changes
539+
subprocess.check_call(["git", "checkout", default_git_branch], cwd=tmp_repo)
540+
541+
# On main branch: different modifications
542+
(tmp_repo / "file_to_modify.txt").write_text("modified in main")
543+
(tmp_repo / "main_only.txt").write_text("main specific file")
544+
subprocess.check_call(["git", "add", "."], cwd=tmp_repo)
545+
subprocess.check_call(["git", "commit", "-m", "Main changes"], cwd=tmp_repo)
546+
547+
main_commit = subprocess.check_output(
548+
["git", "rev-parse", "HEAD"], cwd=tmp_repo, text=True
549+
).strip()
550+
551+
# Create a shallow clone with only the latest commit from each branch
552+
shallow_path = tmp_path / "shallow"
553+
subprocess.check_call(
554+
[
555+
"git",
556+
"clone",
557+
"--depth=1",
558+
"--no-single-branch",
559+
f"file://{tmp_repo}",
560+
str(shallow_path),
561+
]
562+
)
563+
shallow_repo = get_repository(str(shallow_path))
564+
assert shallow_repo.is_shallow
565+
566+
# Between main and feature:
567+
# - file_to_modify.txt was modified differently (M)
568+
# - file_to_delete.txt exists in main but not in feature (D)
569+
# - feature_only.txt exists in feature but not in main (A)
570+
# - main_only.txt exists in main but not in feature (D when comparing feature to main)
571+
changed_files = shallow_repo.get_changed_files(
572+
"AMD", "all", feature_commit, main_commit
573+
)
574+
assert "file_to_modify.txt" in changed_files # Modified
575+
assert "file_to_delete.txt" in changed_files # Deleted in feature
576+
assert "feature_only.txt" in changed_files # Added in feature
577+
assert (
578+
"main_only.txt" in changed_files
579+
) # Not in feature (shows as deleted from main's perspective)
580+
581+
added = shallow_repo.get_changed_files("A", "all", feature_commit, main_commit)
582+
assert "feature_only.txt" in added
583+
assert "file_to_delete.txt" not in added
584+
585+
deleted = shallow_repo.get_changed_files("D", "all", feature_commit, main_commit)
586+
assert "file_to_delete.txt" in deleted
587+
assert (
588+
"main_only.txt" in deleted
589+
) # From feature's perspective, main_only.txt doesn't exist
590+
591+
modified = shallow_repo.get_changed_files("M", "all", feature_commit, main_commit)
592+
assert "file_to_modify.txt" in modified

0 commit comments

Comments
 (0)