33import logging
44import re
55import subprocess
6+ import time
67from collections import defaultdict
78from pathlib import Path
89from typing import Any
1819from .file_importance import compute_file_importance
1920from .fragments import fragment_file # type: ignore[attr-defined]
2021from .git import (
22+ CatFileBatch ,
2123 GitError ,
2224 get_changed_files ,
2325 get_deleted_files ,
@@ -82,6 +84,7 @@ def _read_file_content(
8284 file_path : Path ,
8385 root_dir : Path ,
8486 preferred_revs : list [str ],
87+ batch_reader : CatFileBatch | None = None ,
8588) -> str | None :
8689 if file_path .suffix .lower () in KNOWN_BINARY_EXTENSIONS :
8790 return None
@@ -95,7 +98,10 @@ def _read_file_content(
9598
9699 for rev in preferred_revs :
97100 try :
98- content = show_file_at_revision (root_dir , rev , rel )
101+ if batch_reader is not None :
102+ content = batch_reader .get (rev , rel )
103+ else :
104+ content = show_file_at_revision (root_dir , rev , rel )
99105 if _looks_binary (content ):
100106 return None
101107 return content
@@ -248,11 +254,12 @@ def _process_files_for_fragments(
248254 root_dir : Path ,
249255 preferred_revs : list [str ],
250256 seen_frag_ids : set [FragmentId ],
257+ batch_reader : CatFileBatch | None = None ,
251258) -> list [Fragment ]:
252259 max_frags = LIMITS .max_fragments
253260 fragments : list [Fragment ] = []
254261 for file_path in files :
255- content = _read_file_content (file_path , root_dir , preferred_revs )
262+ content = _read_file_content (file_path , root_dir , preferred_revs , batch_reader )
256263 if content is None :
257264 continue
258265 raw_frags = [f for f in fragment_file (file_path , content ) if f .id not in seen_frag_ids ]
@@ -477,8 +484,9 @@ def _create_whole_file_fragment(
477484 path : Path ,
478485 root_dir : Path ,
479486 preferred_revs : list [str ],
487+ batch_reader : CatFileBatch | None = None ,
480488) -> Fragment | None :
481- content = _read_file_content (path , root_dir , preferred_revs )
489+ content = _read_file_content (path , root_dir , preferred_revs , batch_reader )
482490 if not content or not content .strip ():
483491 return None
484492 if _is_generated_file (path , content ):
@@ -518,6 +526,7 @@ def _ensure_changed_files_represented(
518526 remaining_budget : int ,
519527 root_dir : Path ,
520528 preferred_revs : list [str ],
529+ batch_reader : CatFileBatch | None = None ,
521530) -> list [Fragment ]:
522531 selected_paths = {f .path for f in selected }
523532 missing_paths = set (changed_files ) - selected_paths
@@ -537,7 +546,7 @@ def _ensure_changed_files_represented(
537546 for path in sorted (missing_paths ):
538547 candidates = frags_by_path .get (path , [])
539548 if not candidates :
540- fallback = _create_whole_file_fragment (path , root_dir , preferred_revs )
549+ fallback = _create_whole_file_fragment (path , root_dir , preferred_revs , batch_reader )
541550 candidates = [fallback ] if fallback else []
542551
543552 picked = _pick_smallest_fitting (candidates , selected_ids , budget_left )
@@ -652,36 +661,55 @@ def build_diff_context(
652661
653662 preferred_revs = _build_preferred_revs (base_rev , head_rev )
654663
655- seen_frag_ids : set [FragmentId ] = set ()
656- all_fragments = _process_files_for_fragments (changed_files , root_dir , preferred_revs , seen_frag_ids )
664+ t0 = time .perf_counter ()
657665
658- all_candidate_files , is_large_repo = _collect_candidate_files (root_dir , set (changed_files ), combined_spec )
659- all_candidate_files = _filter_whitelist (all_candidate_files , root_dir , wl_spec )
666+ with CatFileBatch (root_dir ) as batch_reader :
667+ seen_frag_ids : set [FragmentId ] = set ()
668+ all_fragments = _process_files_for_fragments (changed_files , root_dir , preferred_revs , seen_frag_ids , batch_reader )
660669
661- edge_discovered = discover_all_related_files (changed_files , all_candidate_files , root_dir )
662- if len (edge_discovered ) > _MAX_DISCOVERED_FILES :
663- logger .debug (
664- "diffctx: capping edge-discovered files from %d to %d" ,
665- len (edge_discovered ),
666- _MAX_DISCOVERED_FILES ,
667- )
668- edge_discovered = edge_discovered [:_MAX_DISCOVERED_FILES ]
669- edge_discovered = [_normalize_path (p , root_dir ) for p in edge_discovered ]
670- all_fragments .extend (_process_files_for_fragments (edge_discovered , root_dir , preferred_revs , seen_frag_ids ))
671-
672- if not is_large_repo :
673- expanded_files = _expand_universe_by_rare_identifiers (
674- root_dir ,
675- expansion_concepts ,
676- changed_files + edge_discovered ,
677- combined_spec ,
678- candidate_files = all_candidate_files ,
679- changed_files = changed_files ,
680- )
681- expanded_files = [_normalize_path (p , root_dir ) for p in expanded_files ]
682- all_fragments .extend (_process_files_for_fragments (expanded_files , root_dir , preferred_revs , seen_frag_ids ))
683- else :
684- logger .debug ("diffctx: skipping rare-identifier expansion for large repo" )
670+ all_candidate_files , is_large_repo = _collect_candidate_files (root_dir , set (changed_files ), combined_spec )
671+ all_candidate_files = _filter_whitelist (all_candidate_files , root_dir , wl_spec )
672+
673+ t1 = time .perf_counter ()
674+
675+ edge_discovered = discover_all_related_files (changed_files , all_candidate_files , root_dir )
676+ if len (edge_discovered ) > _MAX_DISCOVERED_FILES :
677+ logger .debug (
678+ "diffctx: capping edge-discovered files from %d to %d" ,
679+ len (edge_discovered ),
680+ _MAX_DISCOVERED_FILES ,
681+ )
682+ edge_discovered = edge_discovered [:_MAX_DISCOVERED_FILES ]
683+ edge_discovered = [_normalize_path (p , root_dir ) for p in edge_discovered ]
684+ all_fragments .extend (_process_files_for_fragments (edge_discovered , root_dir , preferred_revs , seen_frag_ids , batch_reader ))
685+
686+ t2 = time .perf_counter ()
687+
688+ if not is_large_repo :
689+ expanded_files = _expand_universe_by_rare_identifiers (
690+ root_dir ,
691+ expansion_concepts ,
692+ changed_files + edge_discovered ,
693+ combined_spec ,
694+ candidate_files = all_candidate_files ,
695+ changed_files = changed_files ,
696+ )
697+ expanded_files = [_normalize_path (p , root_dir ) for p in expanded_files ]
698+ all_fragments .extend (
699+ _process_files_for_fragments (expanded_files , root_dir , preferred_revs , seen_frag_ids , batch_reader )
700+ )
701+ else :
702+ logger .debug ("diffctx: skipping rare-identifier expansion for large repo" )
703+
704+ t3 = time .perf_counter ()
705+
706+ logger .debug (
707+ "diffctx: timing — changed_files %.3fs, edge_discovery %.3fs, expansion %.3fs, total_io %.3fs" ,
708+ t1 - t0 ,
709+ t2 - t1 ,
710+ t3 - t2 ,
711+ t3 - t0 ,
712+ )
685713
686714 _assign_token_counts (all_fragments )
687715
@@ -691,6 +719,8 @@ def build_diff_context(
691719 _assign_token_counts (signature_frags )
692720 all_fragments .extend (signature_frags )
693721
722+ t4 = time .perf_counter ()
723+
694724 if full :
695725 selected = _select_full_mode (all_fragments , changed_files )
696726 _log_full_mode (selected )
@@ -708,9 +738,15 @@ def build_diff_context(
708738 )
709739 effective_budget = budget_tokens if budget_tokens is not None else _UNLIMITED_BUDGET
710740 remaining = effective_budget - result .used_tokens
711- selected = _ensure_changed_files_represented (selected , all_fragments , changed_files , remaining , root_dir , preferred_revs )
741+ with CatFileBatch (root_dir ) as batch_reader :
742+ selected = _ensure_changed_files_represented (
743+ selected , all_fragments , changed_files , remaining , root_dir , preferred_revs , batch_reader
744+ )
712745 _log_ppr_mode (selected , core_ids , budget_tokens , result , alpha , tau )
713746
747+ t5 = time .perf_counter ()
748+ logger .debug ("diffctx: timing — graph+select %.3fs" , t5 - t4 )
749+
714750 if no_content :
715751 for frag in selected :
716752 frag .content = ""
0 commit comments