Skip to content

Commit e6dd665

Browse files
Fix slow tag checking and tree lookups for non-PR registries (#457)
Co-authored-by: Claude <noreply@anthropic.com>
1 parent 7fe8dc2 commit e6dd665

File tree

2 files changed

+104
-144
lines changed

2 files changed

+104
-144
lines changed

tagbot/action/repo.py

Lines changed: 64 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,8 @@ def __init__(
209209
self.__commit_datetimes: Dict[str, datetime] = {}
210210
# Cache for existing tags to avoid per-version API calls
211211
self.__existing_tags_cache: Optional[Dict[str, str]] = None
212+
# Cache for tree SHA → commit SHA mapping (for non-PR registries)
213+
self.__tree_to_commit_cache: Optional[Dict[str, str]] = None
212214
# Track manual intervention issue URL for error reporting
213215
self._manual_intervention_issue_url: Optional[str] = None
214216

@@ -515,51 +517,60 @@ def _commit_sha_from_registry_pr(self, version: str, tree: str) -> Optional[str]
515517
logger.warning("Tree SHA of commit from registry PR does not match")
516518
return None
517519

518-
def _commit_sha_of_tree_from_branch(
519-
self, branch: str, tree: str, since: Optional[datetime] = None
520-
) -> Optional[str]:
521-
"""Look up the commit SHA of a tree with the given SHA on one branch."""
522-
kwargs: Dict[str, object] = {"sha": branch}
523-
if since is not None:
524-
kwargs["since"] = since
525-
for commit in self._repo.get_commits(**kwargs):
526-
if self.__subdir:
527-
subdir_tree_hash = self._subdir_tree_hash(
528-
commit.sha, suppress_abort=True
529-
)
530-
if subdir_tree_hash == tree:
531-
return cast(str, commit.sha)
532-
else:
533-
if commit.commit.tree.sha == tree:
534-
return cast(str, commit.sha)
535-
return None
520+
def _build_tree_to_commit_cache(self) -> Dict[str, str]:
521+
"""Build a cache mapping tree SHAs to commit SHAs.
522+
523+
Uses git log to get all commit:tree pairs in one command,
524+
enabling O(1) lookups instead of iterating through commits.
525+
"""
526+
if self.__tree_to_commit_cache is not None:
527+
return self.__tree_to_commit_cache
528+
529+
logger.debug("Building tree→commit cache")
530+
cache: Dict[str, str] = {}
531+
try:
532+
# Get all commit:tree pairs in one git command
533+
output = self._git.command("log", "--all", "--format=%H %T")
534+
for line in output.splitlines():
535+
parts = line.split()
536+
if len(parts) == 2:
537+
commit_sha, tree_sha = parts
538+
# Only keep first occurrence (most recent commit for that tree)
539+
if tree_sha not in cache:
540+
cache[tree_sha] = commit_sha
541+
logger.debug(f"Tree→commit cache built with {len(cache)} entries")
542+
except Exception as e:
543+
logger.warning(f"Failed to build tree→commit cache: {e}")
544+
545+
self.__tree_to_commit_cache = cache
546+
return cache
536547

537548
def _commit_sha_of_tree(self, tree: str) -> Optional[str]:
538549
"""Look up the commit SHA of a tree with the given SHA."""
539-
sha = self._commit_sha_of_tree_from_branch(self._release_branch, tree)
540-
if sha:
541-
return sha
542-
for branch in self._repo.get_branches():
543-
if branch.name == self._release_branch:
544-
continue
545-
sha = self._commit_sha_of_tree_from_branch(branch.name, tree)
546-
if sha:
547-
return sha
548-
# For a valid tree SHA, the only time that we reach here is when a release
549-
# has been made long after the commit was made, which is reasonably rare.
550-
# Fall back to cloning the repo in that case.
551-
if self.__subdir:
552-
# For subdirectories, we need to check the subdirectory tree hash.
553-
# Limit to 10000 commits to avoid performance issues on large repos.
554-
for line in self._git.command(
555-
"log", "--all", "--format=%H", "-n", "10000"
556-
).splitlines():
557-
subdir_tree_hash = self._subdir_tree_hash(line, suppress_abort=True)
558-
if subdir_tree_hash == tree:
559-
return line
550+
# Fast path: use pre-built tree→commit cache (built from git log)
551+
# This is O(1) vs O(branches * commits) for the API-based approach
552+
if not self.__subdir:
553+
tree_cache = self._build_tree_to_commit_cache()
554+
if tree in tree_cache:
555+
return tree_cache[tree]
556+
# Tree not found in any commit
560557
return None
561-
else:
562-
return self._git.commit_sha_of_tree(tree)
558+
559+
# For subdirectories, we need to check the subdirectory tree hash.
560+
# Build a cache of subdir tree hashes from commits.
561+
if self.__tree_to_commit_cache is None:
562+
logger.debug("Building subdir tree→commit cache")
563+
subdir_cache: Dict[str, str] = {}
564+
for line in self._git.command("log", "--all", "--format=%H").splitlines():
565+
subdir_tree_hash = self._subdir_tree_hash(line, suppress_abort=True)
566+
if subdir_tree_hash and subdir_tree_hash not in subdir_cache:
567+
subdir_cache[subdir_tree_hash] = line
568+
logger.debug(
569+
f"Subdir tree→commit cache built with {len(subdir_cache)} entries"
570+
)
571+
self.__tree_to_commit_cache = subdir_cache
572+
573+
return self.__tree_to_commit_cache.get(tree)
563574

564575
def _subdir_tree_hash(
565576
self, commit_sha: str, *, suppress_abort: bool
@@ -748,11 +759,9 @@ def version_with_latest_commit(self, versions: Dict[str, str]) -> Optional[str]:
748759

749760
def _filter_map_versions(self, versions: Dict[str, str]) -> Dict[str, str]:
750761
"""Filter out versions and convert tree SHA to commit SHA."""
751-
# Pre-build caches to avoid repeated API calls
762+
# Pre-build tags cache to check existing tags quickly
752763
self._build_tags_cache()
753-
# Pre-build PR cache for versions that need commit SHA lookup
754-
if not self._clone_registry:
755-
self._build_registry_prs_cache()
764+
# Note: PR cache is built lazily only when needed (first registry PR lookup)
756765

757766
valid = {}
758767
skipped_existing = 0
@@ -761,22 +770,25 @@ def _filter_map_versions(self, versions: Dict[str, str]) -> Dict[str, str]:
761770
version_tag = self._get_version_tag(version)
762771

763772
# Fast path: check if tag already exists using cached tags
764-
existing_sha = self._commit_sha_of_tag(version_tag)
765-
if existing_sha:
773+
# Just check existence, don't resolve annotated tags (saves API calls)
774+
tags_cache = self._build_tags_cache()
775+
if version_tag in tags_cache:
766776
# Tag exists - we skip without full validation for performance.
767-
# Previously we'd verify the tag points to the expected commit,
768-
# but that requires expensive PR/tree lookups for each version.
769-
logger.info(f"Tag {version_tag} already exists")
770777
skipped_existing += 1
771778
continue
772779

773780
# Tag doesn't exist - need to find expected commit SHA
781+
# Try registry PR first (fast - uses cache)
774782
expected = self._commit_sha_from_registry_pr(version, tree)
775783
if not expected:
784+
# Fall back to tree lookup (slower but more thorough)
785+
logger.debug(
786+
f"No registry PR for {version}, falling back to tree lookup"
787+
)
776788
expected = self._commit_sha_of_tree(tree)
777789
if not expected:
778-
logger.warning(
779-
f"No matching commit was found for version {version} ({tree})"
790+
logger.debug(
791+
f"Skipping {version}: no registry PR or matching tree found"
780792
)
781793
continue
782794
valid[version] = expected

test/action/test_repo.py

Lines changed: 40 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -475,102 +475,43 @@ def test_commit_sha_from_registry_pr(logger):
475475
assert r._commit_sha_from_registry_pr("v4.5.6", "def") == "sha"
476476

477477

478-
def test_commit_sha_of_tree_from_branch():
479-
r = _repo()
480-
r._repo.get_commits = Mock(return_value=[Mock(sha="abc"), Mock(sha="sha")])
481-
r._repo.get_commits.return_value[1].commit.tree.sha = "tree"
482-
assert r._commit_sha_of_tree_from_branch("master", "tree") == "sha"
483-
r._repo.get_commits.assert_called_with(sha="master")
484-
r._repo.get_commits.return_value.pop()
485-
assert r._commit_sha_of_tree_from_branch("master", "tree") is None
486-
487-
488-
@patch("tagbot.action.repo.logger")
489-
def test_commit_sha_of_tree_from_branch_subdir(logger):
490-
r = _repo(subdir="path/to/package")
491-
commits = [Mock(sha="abc"), Mock(sha="sha")]
492-
r._repo.get_commits = Mock(return_value=commits)
493-
r._git.command = Mock(side_effect=["other", "tree_hash"])
494-
495-
assert r._commit_sha_of_tree_from_branch("master", "tree_hash") == "sha"
496-
497-
r._repo.get_commits.assert_called_with(sha="master")
498-
r._git.command.assert_has_calls(
499-
[
500-
call("rev-parse", "abc:path/to/package"),
501-
call("rev-parse", "sha:path/to/package"),
502-
]
503-
)
504-
logger.debug.assert_not_called()
505-
506-
507-
@patch("tagbot.action.repo.logger")
508-
def test_commit_sha_of_tree_from_branch_subdir_rev_parse_failure(logger):
509-
r = _repo(subdir="path/to/package")
510-
commits = [Mock(sha="abc"), Mock(sha="sha")]
511-
r._repo.get_commits = Mock(return_value=commits)
512-
r._git.command = Mock(side_effect=[Abort("missing"), "tree_hash"])
513-
514-
assert r._commit_sha_of_tree_from_branch("master", "tree_hash") == "sha"
515-
516-
r._repo.get_commits.assert_called_with(sha="master")
517-
logger.debug.assert_called_with(
518-
"rev-parse failed while inspecting %s", "abc:path/to/package"
519-
)
520-
r._git.command.assert_has_calls(
521-
[
522-
call("rev-parse", "abc:path/to/package"),
523-
call("rev-parse", "sha:path/to/package"),
524-
]
525-
)
526-
527-
528478
def test_commit_sha_of_tree():
479+
"""Test tree→commit lookup using git log cache."""
529480
r = _repo()
530-
r._repo = Mock(default_branch="master")
531-
branches = r._repo.get_branches.return_value = [Mock(), Mock()]
532-
branches[0].name = "foo"
533-
branches[1].name = "master"
534-
r._commit_sha_of_tree_from_branch = Mock(side_effect=["sha1", None, "sha2"])
535-
assert r._commit_sha_of_tree("tree") == "sha1"
536-
r._repo.get_branches.assert_not_called()
537-
r._commit_sha_of_tree_from_branch.assert_called_once_with("master", "tree")
538-
assert r._commit_sha_of_tree("tree") == "sha2"
539-
r._commit_sha_of_tree_from_branch.assert_called_with("foo", "tree")
540-
r._commit_sha_of_tree_from_branch.side_effect = None
541-
r._commit_sha_of_tree_from_branch.return_value = None
542-
r._git.commit_sha_of_tree = Mock(side_effect=["sha", None])
543-
assert r._commit_sha_of_tree("tree") == "sha"
544-
assert r._commit_sha_of_tree("tree") is None
481+
# Mock git command to return commit:tree pairs
482+
r._git.command = Mock(return_value="sha1 tree1\nsha2 tree2\nsha3 tree3")
483+
# First lookup builds cache and finds match
484+
assert r._commit_sha_of_tree("tree1") == "sha1"
485+
r._git.command.assert_called_once_with("log", "--all", "--format=%H %T")
486+
# Second lookup uses cache (no additional git command)
487+
assert r._commit_sha_of_tree("tree2") == "sha2"
488+
assert r._git.command.call_count == 1 # Still just one call
489+
# Non-existent tree returns None
490+
assert r._commit_sha_of_tree("nonexistent") is None
545491

546492

547493
def test_commit_sha_of_tree_subdir_fallback():
548-
"""Test subdirectory fallback when branch lookups fail."""
494+
"""Test subdirectory tree→commit cache."""
549495
r = _repo(subdir="path/to/package")
550-
r._repo = Mock(default_branch="master")
551-
branches = r._repo.get_branches.return_value = [Mock()]
552-
branches[0].name = "master"
553-
# Branch lookups return None (fail)
554-
r._commit_sha_of_tree_from_branch = Mock(return_value=None)
555496
# git log returns commit SHAs
556497
r._git.command = Mock(return_value="abc123\ndef456\nghi789")
557-
# _subdir_tree_hash called via helper, simulate finding match on second commit
558-
with patch.object(r, "_subdir_tree_hash", side_effect=[None, "tree_hash", "other"]):
498+
# _subdir_tree_hash called for each commit, match on second
499+
with patch.object(
500+
r, "_subdir_tree_hash", side_effect=["other", "tree_hash", "another"]
501+
):
559502
assert r._commit_sha_of_tree("tree_hash") == "def456"
560-
# Verify it iterated through commits
561-
assert r._subdir_tree_hash.call_count == 2
503+
r._git.command.assert_called_once_with("log", "--all", "--format=%H")
504+
# Cache is built, so subsequent lookups don't call git again
505+
assert r._commit_sha_of_tree("other") == "abc123"
506+
assert r._git.command.call_count == 1
562507

563508

564509
def test_commit_sha_of_tree_subdir_fallback_no_match():
565-
"""Test subdirectory fallback returns None when no match found."""
510+
"""Test subdirectory cache returns None when no match found."""
566511
r = _repo(subdir="path/to/package")
567-
r._repo = Mock(default_branch="master")
568-
branches = r._repo.get_branches.return_value = [Mock()]
569-
branches[0].name = "master"
570-
r._commit_sha_of_tree_from_branch = Mock(return_value=None)
571512
r._git.command = Mock(return_value="abc123\ndef456")
572-
# No matches found
573-
with patch.object(r, "_subdir_tree_hash", return_value=None):
513+
# No matching subdir tree hash
514+
with patch.object(r, "_subdir_tree_hash", return_value="other_tree"):
574515
assert r._commit_sha_of_tree("tree_hash") is None
575516
assert r._subdir_tree_hash.call_count == 2
576517

@@ -740,21 +681,28 @@ def test_filter_map_versions(logger):
740681
r = _repo()
741682
# Mock the caches to avoid real API calls
742683
r._build_tags_cache = Mock(return_value={})
743-
r._build_registry_prs_cache = Mock(return_value={})
744684
r._commit_sha_from_registry_pr = Mock(return_value=None)
745685
r._commit_sha_of_tree = Mock(return_value=None)
686+
# No registry PR or tree found - should skip
746687
assert not r._filter_map_versions({"1.2.3": "tree1"})
747-
logger.warning.assert_called_with(
748-
"No matching commit was found for version v1.2.3 (tree1)"
688+
logger.debug.assert_called_with(
689+
"Skipping v1.2.3: no registry PR or matching tree found"
749690
)
750-
r._commit_sha_of_tree.return_value = "sha"
751-
r._commit_sha_of_tag = Mock(return_value="sha")
752-
# Tag exists - skip it (no validation of commit SHA for performance)
753-
assert not r._filter_map_versions({"2.3.4": "tree2"})
754-
logger.info.assert_called_with("Tag v2.3.4 already exists")
755-
# Tag doesn't exist - should be included
756-
r._commit_sha_of_tag.return_value = None
691+
# Tree lookup fallback should be called when PR not found
692+
r._commit_sha_of_tree.assert_called_with("tree1")
693+
# Registry PR found - should include (no tree lookup needed)
694+
r._commit_sha_from_registry_pr.return_value = "sha"
695+
r._commit_sha_of_tree.reset_mock()
757696
assert r._filter_map_versions({"4.5.6": "tree4"}) == {"v4.5.6": "sha"}
697+
r._commit_sha_of_tree.assert_not_called()
698+
# Tag exists - skip it silently (no per-version logging for performance)
699+
r._build_tags_cache.return_value = {"v2.3.4": "existing_sha"}
700+
assert not r._filter_map_versions({"2.3.4": "tree2"})
701+
# Tree fallback works when PR not found
702+
r._build_tags_cache.return_value = {}
703+
r._commit_sha_from_registry_pr.return_value = None
704+
r._commit_sha_of_tree.return_value = "tree_sha"
705+
assert r._filter_map_versions({"5.6.7": "tree5"}) == {"v5.6.7": "tree_sha"}
758706

759707

760708
@patch("tagbot.action.repo.logger")

0 commit comments

Comments
 (0)