diff --git a/src/mcp_codebase_index/git_tracker.py b/src/mcp_codebase_index/git_tracker.py index 1a846c2..f8da9f2 100644 --- a/src/mcp_codebase_index/git_tracker.py +++ b/src/mcp_codebase_index/git_tracker.py @@ -16,14 +16,30 @@ # # Commercial licensing available. See COMMERCIAL-LICENSE.md for details. -"""Git change detection for incremental re-indexing.""" +"""Git change detection for incremental re-indexing. + +On Windows, Python .exe console_scripts (installed via pip) often inherit +a reduced PATH that does not include ``git``. To avoid hangs caused by +``subprocess.run(["git", ...])`` waiting for a missing binary, the +hot-path helpers :func:`is_git_repo` and :func:`get_head_commit` use +direct filesystem reads of ``.git/`` instead of shelling out. + +A resolved path to ``git`` is still needed for diff-based incremental +updates; :func:`_find_git` locates the binary once at import time. +""" from __future__ import annotations +import os +import re +import shutil import subprocess from dataclasses import dataclass, field +_COMMIT_HASH_RE = re.compile(r"^[0-9a-f]{40}(?:[0-9a-f]{24})?$") + + @dataclass class GitChangeSet: """Set of files changed since a given git ref.""" @@ -37,43 +53,140 @@ def is_empty(self) -> bool: return not self.modified and not self.added and not self.deleted +# --------------------------------------------------------------------------- +# Git binary resolution (needed for diff/ls-files calls) +# --------------------------------------------------------------------------- + +def _find_git() -> str: + """Return the absolute path to a ``git`` binary. + + ``shutil.which`` may fail inside pip-installed ``.exe`` wrappers on + Windows because they inherit a minimal PATH. Fall back to well-known + install locations before giving up. + """ + found = shutil.which("git") + if found: + return found + for candidate in [ + os.path.expandvars(r"%ProgramFiles%\Git\cmd\git.exe"), + os.path.expandvars(r"%ProgramFiles(x86)%\Git\cmd\git.exe"), + r"C:\Program Files\Git\cmd\git.exe", + ]: + if os.path.isfile(candidate): + return candidate + return "git" # last resort – let subprocess raise FileNotFoundError + + +_GIT_CMD: str = _find_git() + + +# --------------------------------------------------------------------------- +# Filesystem-based helpers (no subprocess, no PATH dependency) +# --------------------------------------------------------------------------- + +def _resolve_git_dir(path: str) -> str | None: + """Find the ``.git`` directory for a working tree path. + + Walks up from *path* looking for a ``.git`` entry. Supports both + regular repositories (``.git/`` directory) and worktrees / submodules + (``.git`` file containing ``gitdir: ``). + + Returns the resolved git directory path, or ``None``. + """ + path = os.path.abspath(path) + while True: + dot_git = os.path.join(path, ".git") + if os.path.isdir(dot_git): + return dot_git + if os.path.isfile(dot_git): + try: + with open(dot_git, "r") as f: + content = f.read().strip() + if content.startswith("gitdir: "): + git_dir = content[8:] + if not os.path.isabs(git_dir): + git_dir = os.path.normpath(os.path.join(path, git_dir)) + if os.path.isdir(git_dir): + return git_dir + except (OSError, IOError): + pass + parent = os.path.dirname(path) + if parent == path: + break + path = parent + return None + + def is_git_repo(root_path: str) -> bool: - """Check if the given path is inside a git work tree.""" - try: - result = subprocess.run( - ["git", "rev-parse", "--is-inside-work-tree"], - cwd=root_path, - capture_output=True, - text=True, - timeout=10, - ) - return result.returncode == 0 and result.stdout.strip() == "true" - except (FileNotFoundError, subprocess.TimeoutExpired): - return False + """Check if *root_path* is inside a git work tree. + + Uses a filesystem walk (looking for a ``.git`` entry) instead of + ``git rev-parse`` so that it works reliably inside pip-installed + ``.exe`` wrappers on Windows where ``git`` may not be on PATH. + + Supports regular repos, worktrees, and submodules (where ``.git`` + is a file containing ``gitdir: ``). + """ + return _resolve_git_dir(root_path) is not None def get_head_commit(root_path: str) -> str | None: - """Get the current HEAD commit hash.""" - try: - result = subprocess.run( - ["git", "rev-parse", "HEAD"], - cwd=root_path, - capture_output=True, - text=True, - timeout=10, - ) - if result.returncode == 0: - return result.stdout.strip() + """Return the current HEAD commit hash by reading ``.git/`` directly. + + Avoids shelling out to ``git rev-parse HEAD`` which can hang on + Windows when ``git`` is not on PATH. Supports both SHA-1 (40 hex) + and SHA-256 (64 hex) object IDs. + """ + git_dir = _resolve_git_dir(root_path) + if git_dir is None: return None - except (FileNotFoundError, subprocess.TimeoutExpired): + head_file = os.path.join(git_dir, "HEAD") + try: + with open(head_file, "r") as f: + content = f.read().strip() + if content.startswith("ref: "): + # Symbolic ref → resolve to a commit hash + ref_path = os.path.join(git_dir, content[5:]) + if os.path.isfile(ref_path): + with open(ref_path, "r") as f: + return f.read().strip() + # Ref may be packed + packed = os.path.join(git_dir, "packed-refs") + if os.path.isfile(packed): + ref_name = content[5:] + with open(packed, "r") as f: + for line in f: + line = line.strip() + if line.startswith("#"): + continue + parts = line.split(" ", 1) + if len(parts) == 2 and parts[1] == ref_name: + return parts[0] + return None + # Detached HEAD – content is the hash itself (SHA-1 or SHA-256) + return content if _COMMIT_HASH_RE.match(content) else None + except (OSError, IOError): return None -def get_changed_files(root_path: str, since_ref: str | None) -> GitChangeSet: +# --------------------------------------------------------------------------- +# Subprocess-based helpers (use resolved _GIT_CMD) +# --------------------------------------------------------------------------- + +def get_changed_files( + root_path: str, + since_ref: str | None, + *, + skip_committed: bool = False, +) -> GitChangeSet: """Get files changed since a given git ref. Combines committed changes (since_ref..HEAD), staged changes, unstaged changes, and untracked files into a single GitChangeSet. + + When *skip_committed* is ``True``, the ``since_ref..HEAD`` diff is + skipped (useful when HEAD hasn't moved), but the working tree is + still checked for unstaged, staged, and untracked changes. """ if since_ref is None: return GitChangeSet() @@ -83,21 +196,22 @@ def get_changed_files(root_path: str, since_ref: str | None) -> GitChangeSet: deleted: set[str] = set() # 1. Committed changes since the ref - _parse_diff_output(root_path, ["git", "diff", "--name-status", since_ref, "HEAD"], - modified, added, deleted) + if not skip_committed: + _parse_diff_output(root_path, [_GIT_CMD, "diff", "--name-status", since_ref, "HEAD"], + modified, added, deleted) # 2. Unstaged changes - _parse_diff_output(root_path, ["git", "diff", "--name-status"], + _parse_diff_output(root_path, [_GIT_CMD, "diff", "--name-status"], modified, added, deleted) # 3. Staged changes - _parse_diff_output(root_path, ["git", "diff", "--name-status", "--cached"], + _parse_diff_output(root_path, [_GIT_CMD, "diff", "--name-status", "--cached"], modified, added, deleted) # 4. Untracked files try: result = subprocess.run( - ["git", "ls-files", "--others", "--exclude-standard"], + [_GIT_CMD, "ls-files", "--others", "--exclude-standard"], cwd=root_path, capture_output=True, text=True, diff --git a/src/mcp_codebase_index/server.py b/src/mcp_codebase_index/server.py index 72126da..5735c68 100644 --- a/src/mcp_codebase_index/server.py +++ b/src/mcp_codebase_index/server.py @@ -293,12 +293,25 @@ def _matches_include_patterns(rel_path: str, patterns: list[str]) -> bool: def _maybe_incremental_update() -> None: - """Check git for changes and incrementally update the index if needed.""" + """Check git for changes and incrementally update the index if needed. + + Uses a fast filesystem-based HEAD check first to avoid expensive + ``git diff`` subprocess calls when the commit hasn't changed. + """ if not _is_git or _indexer is None or _indexer._project_index is None: return idx = _indexer._project_index - changeset = get_changed_files(_project_root, idx.last_indexed_git_ref) + + # Optimisation: if HEAD hasn't moved since last index, skip the + # expensive committed-changes diff (since_ref..HEAD) but still check + # the working tree for unstaged, staged, and untracked changes. + current_head = get_head_commit(_project_root) + head_unchanged = current_head and current_head == idx.last_indexed_git_ref + + changeset = get_changed_files( + _project_root, idx.last_indexed_git_ref, skip_committed=head_unchanged, + ) if changeset.is_empty: return diff --git a/tests/test_git_tracker.py b/tests/test_git_tracker.py index e8fb28c..2b4af36 100644 --- a/tests/test_git_tracker.py +++ b/tests/test_git_tracker.py @@ -1,53 +1,236 @@ """Unit tests for git_tracker module.""" +import os +import tempfile + from unittest.mock import patch, MagicMock -import subprocess from mcp_codebase_index.git_tracker import ( is_git_repo, get_head_commit, get_changed_files, GitChangeSet, + _find_git, + _resolve_git_dir, + _COMMIT_HASH_RE, ) +_SHA1_HASH = "a" * 40 +_SHA256_HASH = "b" * 64 + + +# --------------------------------------------------------------------------- +# _resolve_git_dir +# --------------------------------------------------------------------------- + + +class TestResolveGitDir: + def test_returns_git_directory(self, tmp_path): + (tmp_path / ".git").mkdir() + assert _resolve_git_dir(str(tmp_path)) == str(tmp_path / ".git") + + def test_walks_up_from_subdirectory(self, tmp_path): + (tmp_path / ".git").mkdir() + child = tmp_path / "src" / "deep" + child.mkdir(parents=True) + assert _resolve_git_dir(str(child)) == str(tmp_path / ".git") + + def test_follows_git_file_absolute_path(self, tmp_path): + """Worktree/submodule: .git is a file with gitdir: .""" + real_git_dir = tmp_path / "real_git_dir" + real_git_dir.mkdir() + (tmp_path / "worktree" / ".git").parent.mkdir(parents=True) + (tmp_path / "worktree" / ".git").write_text( + f"gitdir: {real_git_dir}\n" + ) + assert _resolve_git_dir(str(tmp_path / "worktree")) == str(real_git_dir) + + def test_follows_git_file_relative_path(self, tmp_path): + """Worktree/submodule: .git file with a relative gitdir.""" + real_git_dir = tmp_path / ".git" / "worktrees" / "feature" + real_git_dir.mkdir(parents=True) + wt = tmp_path / "worktrees" / "feature" + wt.mkdir(parents=True) + (wt / ".git").write_text( + "gitdir: ../../.git/worktrees/feature\n" + ) + assert os.path.normpath( + _resolve_git_dir(str(wt)) + ) == os.path.normpath(str(real_git_dir)) + + def test_returns_none_for_non_git(self, tmp_path): + assert _resolve_git_dir(str(tmp_path)) is None + + def test_ignores_invalid_git_file(self, tmp_path): + """A .git file that doesn't start with 'gitdir: ' is ignored.""" + (tmp_path / ".git").write_text("garbage\n") + assert _resolve_git_dir(str(tmp_path)) is None + + def test_ignores_git_file_pointing_to_missing_dir(self, tmp_path): + (tmp_path / ".git").write_text("gitdir: /nonexistent/path\n") + assert _resolve_git_dir(str(tmp_path)) is None + + +# --------------------------------------------------------------------------- +# is_git_repo (filesystem-based, no subprocess) +# --------------------------------------------------------------------------- + + class TestIsGitRepo: - def test_returns_true_for_git_repo(self): - with patch("mcp_codebase_index.git_tracker.subprocess.run") as mock_run: - mock_run.return_value = MagicMock(returncode=0, stdout="true\n") - assert is_git_repo("/some/path") is True + def test_returns_true_for_git_repo(self, tmp_path): + (tmp_path / ".git").mkdir() + assert is_git_repo(str(tmp_path)) is True - def test_returns_false_for_non_git(self): - with patch("mcp_codebase_index.git_tracker.subprocess.run") as mock_run: - mock_run.return_value = MagicMock(returncode=128, stdout="") - assert is_git_repo("/some/path") is False + def test_returns_true_for_nested_path(self, tmp_path): + (tmp_path / ".git").mkdir() + child = tmp_path / "src" / "deep" + child.mkdir(parents=True) + assert is_git_repo(str(child)) is True - def test_returns_false_when_git_not_installed(self): - with patch("mcp_codebase_index.git_tracker.subprocess.run") as mock_run: - mock_run.side_effect = FileNotFoundError - assert is_git_repo("/some/path") is False + def test_returns_true_for_worktree_git_file(self, tmp_path): + real_git_dir = tmp_path / "real_git_dir" + real_git_dir.mkdir() + (tmp_path / "worktree").mkdir() + (tmp_path / "worktree" / ".git").write_text( + f"gitdir: {real_git_dir}\n" + ) + assert is_git_repo(str(tmp_path / "worktree")) is True + + def test_returns_false_for_non_git(self, tmp_path): + assert is_git_repo(str(tmp_path)) is False + + def test_returns_false_for_empty_path(self): + with tempfile.TemporaryDirectory() as d: + assert is_git_repo(d) is False - def test_returns_false_on_timeout(self): - with patch("mcp_codebase_index.git_tracker.subprocess.run") as mock_run: - mock_run.side_effect = subprocess.TimeoutExpired(cmd="git", timeout=10) - assert is_git_repo("/some/path") is False + +# --------------------------------------------------------------------------- +# get_head_commit (filesystem-based, no subprocess) +# --------------------------------------------------------------------------- class TestGetHeadCommit: - def test_returns_commit_hash(self): - with patch("mcp_codebase_index.git_tracker.subprocess.run") as mock_run: - mock_run.return_value = MagicMock(returncode=0, stdout="abc123\n") - assert get_head_commit("/some/path") == "abc123" + def test_returns_commit_from_symbolic_ref(self, tmp_path): + git_dir = tmp_path / ".git" + git_dir.mkdir() + (git_dir / "HEAD").write_text("ref: refs/heads/main\n") + refs = git_dir / "refs" / "heads" + refs.mkdir(parents=True) + (refs / "main").write_text(_SHA1_HASH + "\n") - def test_returns_none_on_failure(self): - with patch("mcp_codebase_index.git_tracker.subprocess.run") as mock_run: - mock_run.return_value = MagicMock(returncode=128, stdout="") - assert get_head_commit("/some/path") is None + assert get_head_commit(str(tmp_path)) == _SHA1_HASH - def test_returns_none_when_git_not_installed(self): - with patch("mcp_codebase_index.git_tracker.subprocess.run") as mock_run: - mock_run.side_effect = FileNotFoundError - assert get_head_commit("/some/path") is None + def test_returns_commit_from_detached_head_sha1(self, tmp_path): + git_dir = tmp_path / ".git" + git_dir.mkdir() + (git_dir / "HEAD").write_text(_SHA1_HASH + "\n") + + assert get_head_commit(str(tmp_path)) == _SHA1_HASH + + def test_returns_commit_from_detached_head_sha256(self, tmp_path): + git_dir = tmp_path / ".git" + git_dir.mkdir() + (git_dir / "HEAD").write_text(_SHA256_HASH + "\n") + + assert get_head_commit(str(tmp_path)) == _SHA256_HASH + + def test_returns_commit_from_packed_refs(self, tmp_path): + git_dir = tmp_path / ".git" + git_dir.mkdir() + (git_dir / "HEAD").write_text("ref: refs/heads/main\n") + # No loose ref file — only packed-refs + (git_dir / "packed-refs").write_text( + "# pack-refs with: peeled fully-peeled sorted\n" + f"{_SHA1_HASH} refs/heads/main\n" + ) + + assert get_head_commit(str(tmp_path)) == _SHA1_HASH + + def test_returns_commit_from_worktree_git_file(self, tmp_path): + """get_head_commit works when .git is a file (worktree/submodule).""" + real_git_dir = tmp_path / "real_git_dir" + real_git_dir.mkdir() + (real_git_dir / "HEAD").write_text(_SHA1_HASH + "\n") + + wt = tmp_path / "worktree" + wt.mkdir() + (wt / ".git").write_text(f"gitdir: {real_git_dir}\n") + + assert get_head_commit(str(wt)) == _SHA1_HASH + + def test_returns_commit_from_subdirectory(self, tmp_path): + """PROJECT_ROOT is a subdirectory of the repo.""" + git_dir = tmp_path / ".git" + git_dir.mkdir() + (git_dir / "HEAD").write_text(_SHA1_HASH + "\n") + sub = tmp_path / "packages" / "core" + sub.mkdir(parents=True) + + assert get_head_commit(str(sub)) == _SHA1_HASH + + def test_returns_none_when_no_git_dir(self, tmp_path): + assert get_head_commit(str(tmp_path)) is None + + def test_returns_none_when_ref_missing(self, tmp_path): + git_dir = tmp_path / ".git" + git_dir.mkdir() + (git_dir / "HEAD").write_text("ref: refs/heads/nonexistent\n") + + assert get_head_commit(str(tmp_path)) is None + + def test_returns_none_for_invalid_detached_head(self, tmp_path): + git_dir = tmp_path / ".git" + git_dir.mkdir() + (git_dir / "HEAD").write_text("not-a-hash\n") + + assert get_head_commit(str(tmp_path)) is None + + +# --------------------------------------------------------------------------- +# _COMMIT_HASH_RE +# --------------------------------------------------------------------------- + + +class TestCommitHashRe: + def test_matches_sha1(self): + assert _COMMIT_HASH_RE.match(_SHA1_HASH) + + def test_matches_sha256(self): + assert _COMMIT_HASH_RE.match(_SHA256_HASH) + + def test_rejects_short_hash(self): + assert _COMMIT_HASH_RE.match("abc123") is None + + def test_rejects_non_hex(self): + assert _COMMIT_HASH_RE.match("g" * 40) is None + + def test_rejects_50_chars(self): + assert _COMMIT_HASH_RE.match("a" * 50) is None + + +# --------------------------------------------------------------------------- +# _find_git +# --------------------------------------------------------------------------- + + +class TestFindGit: + def test_returns_string(self): + assert isinstance(_find_git(), str) + + def test_finds_git_via_shutil_which(self): + with patch("mcp_codebase_index.git_tracker.shutil.which", return_value="/usr/bin/git"): + assert _find_git() == "/usr/bin/git" + + def test_falls_back_when_which_returns_none(self): + with patch("mcp_codebase_index.git_tracker.shutil.which", return_value=None), \ + patch("mcp_codebase_index.git_tracker.os.path.isfile", return_value=False): + assert _find_git() == "git" + + +# --------------------------------------------------------------------------- +# get_changed_files (subprocess-based, uses _GIT_CMD) +# --------------------------------------------------------------------------- class TestGetChangedFiles: @@ -57,8 +240,7 @@ def test_returns_empty_when_since_ref_is_none(self): def test_parses_modified_files(self): def mock_run(cmd, **kwargs): - if cmd[:3] == ["git", "diff", "--name-status"] and len(cmd) == 5: - # committed changes: git diff --name-status HEAD + if cmd[1:3] == ["diff", "--name-status"] and len(cmd) == 5: return MagicMock(returncode=0, stdout="M\tfoo.py\n") return MagicMock(returncode=0, stdout="") @@ -68,7 +250,7 @@ def mock_run(cmd, **kwargs): def test_parses_added_files(self): def mock_run(cmd, **kwargs): - if cmd[:3] == ["git", "diff", "--name-status"] and len(cmd) == 5: + if cmd[1:3] == ["diff", "--name-status"] and len(cmd) == 5: return MagicMock(returncode=0, stdout="A\tnew_file.py\n") return MagicMock(returncode=0, stdout="") @@ -78,7 +260,7 @@ def mock_run(cmd, **kwargs): def test_parses_deleted_files(self): def mock_run(cmd, **kwargs): - if cmd[:3] == ["git", "diff", "--name-status"] and len(cmd) == 5: + if cmd[1:3] == ["diff", "--name-status"] and len(cmd) == 5: return MagicMock(returncode=0, stdout="D\told_file.py\n") return MagicMock(returncode=0, stdout="") @@ -88,7 +270,7 @@ def mock_run(cmd, **kwargs): def test_rename_handling(self): def mock_run(cmd, **kwargs): - if cmd[:3] == ["git", "diff", "--name-status"] and len(cmd) == 5: + if cmd[1:3] == ["diff", "--name-status"] and len(cmd) == 5: return MagicMock(returncode=0, stdout="R100\told.py\tnew.py\n") return MagicMock(returncode=0, stdout="") @@ -99,13 +281,10 @@ def mock_run(cmd, **kwargs): def test_overlap_resolution_added_and_deleted_becomes_modified(self): """If a file appears in both added and deleted, treat as modified.""" - def mock_run(cmd, **kwargs): - if cmd[:3] == ["git", "diff", "--name-status"] and len(cmd) == 5: - # Committed: file was deleted + if cmd[1:3] == ["diff", "--name-status"] and len(cmd) == 5: return MagicMock(returncode=0, stdout="D\toverlap.py\n") - if cmd == ["git", "diff", "--name-status"]: - # Unstaged: file was added + if cmd[1:] == ["diff", "--name-status"]: return MagicMock(returncode=0, stdout="A\toverlap.py\n") return MagicMock(returncode=0, stdout="") @@ -117,7 +296,7 @@ def mock_run(cmd, **kwargs): def test_untracked_files_added(self): def mock_run(cmd, **kwargs): - if cmd[:2] == ["git", "ls-files"]: + if "ls-files" in cmd: return MagicMock(returncode=0, stdout="untracked.py\n") return MagicMock(returncode=0, stdout="") @@ -125,6 +304,42 @@ def mock_run(cmd, **kwargs): changeset = get_changed_files("/some/path", "abc123") assert "untracked.py" in changeset.added + def test_skip_committed_skips_ref_head_diff(self): + """skip_committed=True should not call diff with since_ref..HEAD.""" + calls = [] + + def mock_run(cmd, **kwargs): + calls.append(cmd) + return MagicMock(returncode=0, stdout="") + + with patch("mcp_codebase_index.git_tracker.subprocess.run", side_effect=mock_run): + get_changed_files("/some/path", "abc123", skip_committed=True) + + # No 5-arg diff command (the committed diff) + for cmd in calls: + assert not (cmd[1:3] == ["diff", "--name-status"] and len(cmd) == 5), \ + "skip_committed=True should not call since_ref..HEAD diff" + + def test_skip_committed_still_checks_working_tree(self): + """skip_committed=True should still detect unstaged/staged/untracked.""" + def mock_run(cmd, **kwargs): + # Unstaged: modified file + if cmd[1:] == ["diff", "--name-status"]: + return MagicMock(returncode=0, stdout="M\tunstaged.py\n") + # Staged: added file + if cmd[1:] == ["diff", "--name-status", "--cached"]: + return MagicMock(returncode=0, stdout="A\tstaged.py\n") + # Untracked + if "ls-files" in cmd: + return MagicMock(returncode=0, stdout="untracked.py\n") + return MagicMock(returncode=0, stdout="") + + with patch("mcp_codebase_index.git_tracker.subprocess.run", side_effect=mock_run): + changeset = get_changed_files("/some/path", "abc123", skip_committed=True) + assert "unstaged.py" in changeset.modified + assert "staged.py" in changeset.added + assert "untracked.py" in changeset.added + def test_graceful_failure_git_not_found(self): with patch("mcp_codebase_index.git_tracker.subprocess.run") as mock_run: mock_run.side_effect = FileNotFoundError @@ -132,6 +347,11 @@ def test_graceful_failure_git_not_found(self): assert changeset.is_empty +# --------------------------------------------------------------------------- +# GitChangeSet +# --------------------------------------------------------------------------- + + class TestGitChangeSet: def test_is_empty_true(self): cs = GitChangeSet()