Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 145 additions & 31 deletions src/mcp_codebase_index/git_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,30 @@
#
# Commercial licensing available. See COMMERCIAL-LICENSE.md for details.

"""Git change detection for incremental re-indexing."""
"""Git change detection for incremental re-indexing.

On Windows, Python .exe console_scripts (installed via pip) often inherit
a reduced PATH that does not include ``git``. To avoid hangs caused by
``subprocess.run(["git", ...])`` waiting for a missing binary, the
hot-path helpers :func:`is_git_repo` and :func:`get_head_commit` use
direct filesystem reads of ``.git/`` instead of shelling out.

A resolved path to ``git`` is still needed for diff-based incremental
updates; :func:`_find_git` locates the binary once at import time.
"""

from __future__ import annotations

import os
import re
import shutil
import subprocess
from dataclasses import dataclass, field


# Full lowercase-hex object ID: 40 characters (SHA-1), optionally followed by
# 24 more for a 64-character SHA-256 ID. Abbreviated hashes do not match.
_COMMIT_HASH_RE = re.compile(r"^[0-9a-f]{40}(?:[0-9a-f]{24})?$")


@dataclass
class GitChangeSet:
"""Set of files changed since a given git ref."""
Expand All @@ -37,43 +53,140 @@ def is_empty(self) -> bool:
return not self.modified and not self.added and not self.deleted


# ---------------------------------------------------------------------------
# Git binary resolution (needed for diff/ls-files calls)
# ---------------------------------------------------------------------------

def _find_git() -> str:
    """Return the path to a ``git`` executable.

    ``shutil.which`` may fail inside pip-installed ``.exe`` wrappers on
    Windows because they inherit a minimal PATH, so a PATH miss falls back
    to well-known Git for Windows install locations (machine-wide and
    per-user).

    Returns:
        The path reported by ``shutil.which``, else the first existing
        fallback candidate, else the bare string ``"git"`` so that a later
        ``subprocess`` call raises ``FileNotFoundError`` rather than this
        function failing at import time.
    """
    on_path = shutil.which("git")
    if on_path:
        return on_path

    # Order matters: earlier entries win. New locations are appended so that
    # any environment which resolved before still resolves to the same path.
    # On non-Windows platforms ``%VAR%`` is not expanded and none of these
    # exist, so the loop is a cheap no-op.
    fallback_locations = (
        os.path.expandvars(r"%ProgramFiles%\Git\cmd\git.exe"),
        os.path.expandvars(r"%ProgramFiles(x86)%\Git\cmd\git.exe"),
        r"C:\Program Files\Git\cmd\git.exe",
        # 64-bit Program Files as seen from a 32-bit process.
        os.path.expandvars(r"%ProgramW6432%\Git\cmd\git.exe"),
        # Per-user (non-admin) Git for Windows install.
        os.path.expandvars(r"%LOCALAPPDATA%\Programs\Git\cmd\git.exe"),
    )
    for location in fallback_locations:
        if os.path.isfile(location):
            return location

    return "git"  # last resort – let subprocess raise FileNotFoundError


# Resolved once, when the module is imported; the subprocess-based helpers
# pass this as the command instead of a bare "git".
_GIT_CMD: str = _find_git()


# ---------------------------------------------------------------------------
# Filesystem-based helpers (no subprocess, no PATH dependency)
# ---------------------------------------------------------------------------

def _resolve_git_dir(path: str) -> str | None:
    """Find the git directory for a working tree path.

    Walks up from *path* towards the filesystem root looking for a ``.git``
    entry. Supports both regular repositories (``.git/`` directory) and
    worktrees / submodules (``.git`` file containing ``gitdir: <path>``).

    A ``.git`` file that cannot be read, cannot be decoded, lacks the
    ``gitdir: `` prefix, or points at a missing directory is ignored and the
    walk continues with the parent directory.

    Returns:
        The git directory path, or ``None`` if no ancestor has a usable
        ``.git`` entry.
    """
    pointer_prefix = "gitdir: "
    current = os.path.abspath(path)

    while True:
        dot_git = os.path.join(current, ".git")

        if os.path.isdir(dot_git):
            return dot_git

        if os.path.isfile(dot_git):
            try:
                # Read as UTF-8 explicitly rather than the locale encoding,
                # which would mis-decode non-ASCII paths on Windows.
                with open(dot_git, "r", encoding="utf-8") as f:
                    content = f.read().strip()
            except (OSError, UnicodeDecodeError):
                # Unreadable or binary garbage: treat as "not a pointer".
                content = ""

            if content.startswith(pointer_prefix):
                git_dir = content[len(pointer_prefix):]
                if not os.path.isabs(git_dir):
                    # Relative pointers are relative to the directory that
                    # holds the ``.git`` file.
                    git_dir = os.path.normpath(os.path.join(current, git_dir))
                if os.path.isdir(git_dir):
                    return git_dir

        parent = os.path.dirname(current)
        if parent == current:  # reached the filesystem root
            return None
        current = parent


def is_git_repo(root_path: str) -> bool:
    """Check if *root_path* is inside a git work tree.

    Uses a filesystem walk (looking for a ``.git`` entry in *root_path* or
    any ancestor) instead of ``git rev-parse`` so that it works inside
    pip-installed ``.exe`` wrappers on Windows where ``git`` may not be on
    PATH. No subprocess is spawned.

    Supports regular repos, worktrees, and submodules (where ``.git``
    is a file containing ``gitdir: <path>``).

    Returns:
        ``True`` if a git directory could be resolved, ``False`` otherwise.
    """
    return _resolve_git_dir(root_path) is not None


def get_head_commit(root_path: str) -> str | None:
    """Return the current HEAD commit hash by reading the git directory.

    Avoids shelling out to ``git rev-parse HEAD``, which can hang on Windows
    when ``git`` is not on PATH. Supports both SHA-1 (40 hex) and SHA-256
    (64 hex) object IDs.

    Resolution order for a symbolic HEAD (``ref: refs/heads/...``):

    1. a loose ref file in the git directory,
    2. a loose ref file in the shared directory named by ``commondir``
       (linked worktrees keep ``HEAD`` per worktree but branches there),
    3. a matching ``packed-refs`` entry, searched in the same directories.

    Returns:
        The full hex commit hash, or ``None`` if *root_path* is not in a
        repository, the ref does not exist yet (no commits), the stored
        value is not a full hex object ID, or a file cannot be read.
    """
    git_dir = _resolve_git_dir(root_path)
    if git_dir is None:
        return None

    ref_prefix = "ref: "
    try:
        with open(os.path.join(git_dir, "HEAD"), "r", encoding="utf-8") as f:
            head = f.read().strip()

        if not head.startswith(ref_prefix):
            # Detached HEAD – content is the hash itself (SHA-1 or SHA-256)
            return head if _COMMIT_HASH_RE.match(head) else None

        ref_name = head[len(ref_prefix):].strip()

        # In a linked worktree the git dir holds HEAD, but refs and
        # packed-refs live in the main repository's directory, which the
        # ``commondir`` file points at (possibly relative to git_dir).
        search_dirs = [git_dir]
        common_file = os.path.join(git_dir, "commondir")
        if os.path.isfile(common_file):
            with open(common_file, "r", encoding="utf-8") as f:
                common_dir = f.read().strip()
            if common_dir:
                if not os.path.isabs(common_dir):
                    common_dir = os.path.normpath(
                        os.path.join(git_dir, common_dir)
                    )
                if common_dir not in search_dirs:
                    search_dirs.append(common_dir)

        # Loose refs take precedence over packed ones.
        for base in search_dirs:
            ref_path = os.path.join(base, ref_name)
            if os.path.isfile(ref_path):
                with open(ref_path, "r", encoding="utf-8") as f:
                    value = f.read().strip()
                # Reject anything that is not a full object ID (e.g. a
                # nested symbolic ref) rather than returning it as a hash.
                return value if _COMMIT_HASH_RE.match(value) else None

        # Ref may be packed
        for base in search_dirs:
            packed = os.path.join(base, "packed-refs")
            if not os.path.isfile(packed):
                continue
            with open(packed, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    # "#" lines are headers; "^" lines are peeled tag
                    # targets belonging to the preceding entry.
                    if not line or line[0] in "#^":
                        continue
                    parts = line.split(" ", 1)
                    if len(parts) == 2 and parts[1] == ref_name:
                        return parts[0]

        return None
    except (OSError, UnicodeDecodeError):
        return None


def get_changed_files(root_path: str, since_ref: str | None) -> GitChangeSet:
# ---------------------------------------------------------------------------
# Subprocess-based helpers (use resolved _GIT_CMD)
# ---------------------------------------------------------------------------

def get_changed_files(
root_path: str,
since_ref: str | None,
*,
skip_committed: bool = False,
) -> GitChangeSet:
"""Get files changed since a given git ref.

Combines committed changes (since_ref..HEAD), staged changes,
unstaged changes, and untracked files into a single GitChangeSet.

When *skip_committed* is ``True``, the ``since_ref..HEAD`` diff is
skipped (useful when HEAD hasn't moved), but the working tree is
still checked for unstaged, staged, and untracked changes.
"""
if since_ref is None:
return GitChangeSet()
Expand All @@ -83,21 +196,22 @@ def get_changed_files(root_path: str, since_ref: str | None) -> GitChangeSet:
deleted: set[str] = set()

# 1. Committed changes since the ref
_parse_diff_output(root_path, ["git", "diff", "--name-status", since_ref, "HEAD"],
modified, added, deleted)
if not skip_committed:
_parse_diff_output(root_path, [_GIT_CMD, "diff", "--name-status", since_ref, "HEAD"],
modified, added, deleted)

# 2. Unstaged changes
_parse_diff_output(root_path, ["git", "diff", "--name-status"],
_parse_diff_output(root_path, [_GIT_CMD, "diff", "--name-status"],
modified, added, deleted)

# 3. Staged changes
_parse_diff_output(root_path, ["git", "diff", "--name-status", "--cached"],
_parse_diff_output(root_path, [_GIT_CMD, "diff", "--name-status", "--cached"],
modified, added, deleted)

# 4. Untracked files
try:
result = subprocess.run(
["git", "ls-files", "--others", "--exclude-standard"],
[_GIT_CMD, "ls-files", "--others", "--exclude-standard"],
cwd=root_path,
capture_output=True,
text=True,
Expand Down
17 changes: 15 additions & 2 deletions src/mcp_codebase_index/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,12 +293,25 @@ def _matches_include_patterns(rel_path: str, patterns: list[str]) -> bool:


def _maybe_incremental_update() -> None:
"""Check git for changes and incrementally update the index if needed."""
"""Check git for changes and incrementally update the index if needed.

Uses a fast filesystem-based HEAD check first to avoid expensive
``git diff`` subprocess calls when the commit hasn't changed.
"""
if not _is_git or _indexer is None or _indexer._project_index is None:
return

idx = _indexer._project_index
changeset = get_changed_files(_project_root, idx.last_indexed_git_ref)

# Optimisation: if HEAD hasn't moved since last index, skip the
# expensive committed-changes diff (since_ref..HEAD) but still check
# the working tree for unstaged, staged, and untracked changes.
current_head = get_head_commit(_project_root)
head_unchanged = current_head and current_head == idx.last_indexed_git_ref

changeset = get_changed_files(
_project_root, idx.last_indexed_git_ref, skip_committed=head_unchanged,
)
if changeset.is_empty:
return

Expand Down
Loading