Skip to content

Commit e1adb06

Browse files
committed
perf: batch git cat-file reads, add pipeline timing
1 parent bad1731 commit e1adb06

File tree

2 files changed

+136
-33
lines changed

2 files changed

+136
-33
lines changed

src/treemapper/diffctx/__init__.py

Lines changed: 69 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import re
55
import subprocess
6+
import time
67
from collections import defaultdict
78
from pathlib import Path
89
from typing import Any
@@ -18,6 +19,7 @@
1819
from .file_importance import compute_file_importance
1920
from .fragments import fragment_file # type: ignore[attr-defined]
2021
from .git import (
22+
CatFileBatch,
2123
GitError,
2224
get_changed_files,
2325
get_deleted_files,
@@ -82,6 +84,7 @@ def _read_file_content(
8284
file_path: Path,
8385
root_dir: Path,
8486
preferred_revs: list[str],
87+
batch_reader: CatFileBatch | None = None,
8588
) -> str | None:
8689
if file_path.suffix.lower() in KNOWN_BINARY_EXTENSIONS:
8790
return None
@@ -95,7 +98,10 @@ def _read_file_content(
9598

9699
for rev in preferred_revs:
97100
try:
98-
content = show_file_at_revision(root_dir, rev, rel)
101+
if batch_reader is not None:
102+
content = batch_reader.get(rev, rel)
103+
else:
104+
content = show_file_at_revision(root_dir, rev, rel)
99105
if _looks_binary(content):
100106
return None
101107
return content
@@ -248,11 +254,12 @@ def _process_files_for_fragments(
248254
root_dir: Path,
249255
preferred_revs: list[str],
250256
seen_frag_ids: set[FragmentId],
257+
batch_reader: CatFileBatch | None = None,
251258
) -> list[Fragment]:
252259
max_frags = LIMITS.max_fragments
253260
fragments: list[Fragment] = []
254261
for file_path in files:
255-
content = _read_file_content(file_path, root_dir, preferred_revs)
262+
content = _read_file_content(file_path, root_dir, preferred_revs, batch_reader)
256263
if content is None:
257264
continue
258265
raw_frags = [f for f in fragment_file(file_path, content) if f.id not in seen_frag_ids]
@@ -477,8 +484,9 @@ def _create_whole_file_fragment(
477484
path: Path,
478485
root_dir: Path,
479486
preferred_revs: list[str],
487+
batch_reader: CatFileBatch | None = None,
480488
) -> Fragment | None:
481-
content = _read_file_content(path, root_dir, preferred_revs)
489+
content = _read_file_content(path, root_dir, preferred_revs, batch_reader)
482490
if not content or not content.strip():
483491
return None
484492
if _is_generated_file(path, content):
@@ -518,6 +526,7 @@ def _ensure_changed_files_represented(
518526
remaining_budget: int,
519527
root_dir: Path,
520528
preferred_revs: list[str],
529+
batch_reader: CatFileBatch | None = None,
521530
) -> list[Fragment]:
522531
selected_paths = {f.path for f in selected}
523532
missing_paths = set(changed_files) - selected_paths
@@ -537,7 +546,7 @@ def _ensure_changed_files_represented(
537546
for path in sorted(missing_paths):
538547
candidates = frags_by_path.get(path, [])
539548
if not candidates:
540-
fallback = _create_whole_file_fragment(path, root_dir, preferred_revs)
549+
fallback = _create_whole_file_fragment(path, root_dir, preferred_revs, batch_reader)
541550
candidates = [fallback] if fallback else []
542551

543552
picked = _pick_smallest_fitting(candidates, selected_ids, budget_left)
@@ -652,36 +661,55 @@ def build_diff_context(
652661

653662
preferred_revs = _build_preferred_revs(base_rev, head_rev)
654663

655-
seen_frag_ids: set[FragmentId] = set()
656-
all_fragments = _process_files_for_fragments(changed_files, root_dir, preferred_revs, seen_frag_ids)
664+
t0 = time.perf_counter()
657665

658-
all_candidate_files, is_large_repo = _collect_candidate_files(root_dir, set(changed_files), combined_spec)
659-
all_candidate_files = _filter_whitelist(all_candidate_files, root_dir, wl_spec)
666+
with CatFileBatch(root_dir) as batch_reader:
667+
seen_frag_ids: set[FragmentId] = set()
668+
all_fragments = _process_files_for_fragments(changed_files, root_dir, preferred_revs, seen_frag_ids, batch_reader)
660669

661-
edge_discovered = discover_all_related_files(changed_files, all_candidate_files, root_dir)
662-
if len(edge_discovered) > _MAX_DISCOVERED_FILES:
663-
logger.debug(
664-
"diffctx: capping edge-discovered files from %d to %d",
665-
len(edge_discovered),
666-
_MAX_DISCOVERED_FILES,
667-
)
668-
edge_discovered = edge_discovered[:_MAX_DISCOVERED_FILES]
669-
edge_discovered = [_normalize_path(p, root_dir) for p in edge_discovered]
670-
all_fragments.extend(_process_files_for_fragments(edge_discovered, root_dir, preferred_revs, seen_frag_ids))
671-
672-
if not is_large_repo:
673-
expanded_files = _expand_universe_by_rare_identifiers(
674-
root_dir,
675-
expansion_concepts,
676-
changed_files + edge_discovered,
677-
combined_spec,
678-
candidate_files=all_candidate_files,
679-
changed_files=changed_files,
680-
)
681-
expanded_files = [_normalize_path(p, root_dir) for p in expanded_files]
682-
all_fragments.extend(_process_files_for_fragments(expanded_files, root_dir, preferred_revs, seen_frag_ids))
683-
else:
684-
logger.debug("diffctx: skipping rare-identifier expansion for large repo")
670+
all_candidate_files, is_large_repo = _collect_candidate_files(root_dir, set(changed_files), combined_spec)
671+
all_candidate_files = _filter_whitelist(all_candidate_files, root_dir, wl_spec)
672+
673+
t1 = time.perf_counter()
674+
675+
edge_discovered = discover_all_related_files(changed_files, all_candidate_files, root_dir)
676+
if len(edge_discovered) > _MAX_DISCOVERED_FILES:
677+
logger.debug(
678+
"diffctx: capping edge-discovered files from %d to %d",
679+
len(edge_discovered),
680+
_MAX_DISCOVERED_FILES,
681+
)
682+
edge_discovered = edge_discovered[:_MAX_DISCOVERED_FILES]
683+
edge_discovered = [_normalize_path(p, root_dir) for p in edge_discovered]
684+
all_fragments.extend(_process_files_for_fragments(edge_discovered, root_dir, preferred_revs, seen_frag_ids, batch_reader))
685+
686+
t2 = time.perf_counter()
687+
688+
if not is_large_repo:
689+
expanded_files = _expand_universe_by_rare_identifiers(
690+
root_dir,
691+
expansion_concepts,
692+
changed_files + edge_discovered,
693+
combined_spec,
694+
candidate_files=all_candidate_files,
695+
changed_files=changed_files,
696+
)
697+
expanded_files = [_normalize_path(p, root_dir) for p in expanded_files]
698+
all_fragments.extend(
699+
_process_files_for_fragments(expanded_files, root_dir, preferred_revs, seen_frag_ids, batch_reader)
700+
)
701+
else:
702+
logger.debug("diffctx: skipping rare-identifier expansion for large repo")
703+
704+
t3 = time.perf_counter()
705+
706+
logger.debug(
707+
"diffctx: timing — changed_files %.3fs, edge_discovery %.3fs, expansion %.3fs, total_io %.3fs",
708+
t1 - t0,
709+
t2 - t1,
710+
t3 - t2,
711+
t3 - t0,
712+
)
685713

686714
_assign_token_counts(all_fragments)
687715

@@ -691,6 +719,8 @@ def build_diff_context(
691719
_assign_token_counts(signature_frags)
692720
all_fragments.extend(signature_frags)
693721

722+
t4 = time.perf_counter()
723+
694724
if full:
695725
selected = _select_full_mode(all_fragments, changed_files)
696726
_log_full_mode(selected)
@@ -708,9 +738,15 @@ def build_diff_context(
708738
)
709739
effective_budget = budget_tokens if budget_tokens is not None else _UNLIMITED_BUDGET
710740
remaining = effective_budget - result.used_tokens
711-
selected = _ensure_changed_files_represented(selected, all_fragments, changed_files, remaining, root_dir, preferred_revs)
741+
with CatFileBatch(root_dir) as batch_reader:
742+
selected = _ensure_changed_files_represented(
743+
selected, all_fragments, changed_files, remaining, root_dir, preferred_revs, batch_reader
744+
)
712745
_log_ppr_mode(selected, core_ids, budget_tokens, result, alpha, tau)
713746

747+
t5 = time.perf_counter()
748+
logger.debug("diffctx: timing — graph+select %.3fs", t5 - t4)
749+
714750
if no_content:
715751
for frag in selected:
716752
frag.content = ""

src/treemapper/diffctx/git.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
from __future__ import annotations
22

3+
import logging
34
import re
45
import subprocess
56
from pathlib import Path
7+
from types import TracebackType
68

79
from .types import DiffHunk
810

11+
logger = logging.getLogger(__name__)
12+
913
_HUNK_RE = re.compile(r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@")
1014
_RANGE_RE = re.compile(r"^\s*(\S+?)(\.\.\.?)(\S*?)\s*$") # NOSONAR(S5852)
1115

@@ -143,3 +147,66 @@ def get_renamed_old_paths(repo_root: Path, diff_range: str) -> set[Path]:
143147
def show_file_at_revision(repo_root: Path, rev: str, rel_path: Path) -> str:
    """Return the contents of *rel_path* as it exists at revision *rev*.

    Runs ``git show <rev>:<path>`` in *repo_root* and returns its stdout.
    """
    object_spec = f"{rev}:{rel_path.as_posix()}"
    return run_git(repo_root, ["show", object_spec])
150+
151+
152+
class CatFileBatch:
    """Long-lived ``git cat-file --batch`` process for cheap repeated object reads.

    The git subprocess is spawned lazily on first :meth:`get` and reused for
    every subsequent call, avoiding one fork/exec per file. Use as a context
    manager so the process is reliably shut down.
    """

    def __init__(self, repo_root: Path) -> None:
        # Process handle; created lazily by _ensure_started().
        self._proc: subprocess.Popen[bytes] | None = None
        self._repo_root = repo_root

    def _ensure_started(self) -> subprocess.Popen[bytes]:
        """Start (or restart, if it exited) the batch process and return it."""
        if self._proc is None or self._proc.poll() is not None:
            self._proc = subprocess.Popen(
                ["git", "-C", str(self._repo_root), "cat-file", "--batch"],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                # stderr is never read; piping it could deadlock the batch
                # process once git fills the pipe buffer, so discard it.
                stderr=subprocess.DEVNULL,
            )
        return self._proc

    def get(self, rev: str, rel_path: Path) -> str:
        """Return the text of *rel_path* at revision *rev*.

        Raises:
            GitError: if the object is missing, the git process is
                unavailable, or the batch protocol yields a malformed or
                truncated response.
        """
        spec = f"{rev}:{rel_path.as_posix()}\n"
        proc = self._ensure_started()
        if proc.stdin is None or proc.stdout is None:
            # Defensive: Popen was created with PIPE for both streams.
            raise GitError("cat-file: batch process has no pipes")

        try:
            proc.stdin.write(spec.encode())
            proc.stdin.flush()
        except OSError as exc:  # BrokenPipeError etc. if git died underneath us
            raise GitError(f"cat-file: process unavailable for {spec.strip()}") from exc

        header = proc.stdout.readline()
        if not header:
            raise GitError(f"cat-file: unexpected EOF for {spec.strip()}")

        header_str = header.decode("utf-8", errors="replace").strip()
        if header_str.endswith("missing"):
            raise GitError(f"Path not found: {spec.strip()}")

        # Expected header on success: "<sha> <type> <size>".
        parts = header_str.split()
        if len(parts) < 3:
            raise GitError(f"cat-file: malformed header: {header_str}")
        try:
            size = int(parts[2])
        except ValueError as exc:
            raise GitError(f"cat-file: malformed header: {header_str}") from exc

        content = proc.stdout.read(size)
        if content is None or len(content) != size:
            # Short read means the process died mid-payload.
            raise GitError(f"cat-file: truncated read for {spec.strip()}")
        proc.stdout.read(1)  # consume the trailing LF after the payload

        return content.decode("utf-8", errors="replace")

    def close(self) -> None:
        """Shut down the batch process, force-killing after a 5s grace period."""
        if self._proc is not None and self._proc.poll() is None:
            if self._proc.stdin is not None:
                # Closing stdin signals git to exit cleanly.
                self._proc.stdin.close()
            try:
                self._proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self._proc.kill()
                self._proc.wait()
        self._proc = None

    def __enter__(self) -> CatFileBatch:
        return self

    def __exit__(
        self,
        _exc_type: type[BaseException] | None,
        _exc_val: BaseException | None,
        _exc_tb: TracebackType | None,
    ) -> None:
        self.close()

0 commit comments

Comments (0)