3333from .languages import get_language_for_file
3434from .ppr import personalized_pagerank
3535from .render import build_diff_context_output
36- from .select import SelectionResult , lazy_greedy_select
36+ from .select import SelectionResult , _IntervalIndex , lazy_greedy_select
3737from .types import DiffHunk , Fragment , FragmentId , extract_identifiers
3838from .utility import concepts_from_diff_text , needs_from_diff
3939
@@ -205,7 +205,7 @@ def _truncate_generated_fragments(file_frags: list[Fragment]) -> list[Fragment]:
205205 truncated_content = "\n " .join (lines ) + f"\n # ... [{ remaining } more lines]"
206206 truncated .append (
207207 Fragment (
208- id = FragmentId (frag .path , frag .start_line , frag .start_line + len (lines ) - 1 ),
208+ id = FragmentId (frag .path , frag .start_line , frag .start_line + len (lines )),
209209 kind = frag .kind ,
210210 content = truncated_content ,
211211 identifiers = extract_identifiers (truncated_content ),
@@ -656,7 +656,12 @@ def build_diff_context(
656656
657657 if not is_large_repo :
658658 expanded_files = _expand_universe_by_rare_identifiers (
659- root_dir , expansion_concepts , changed_files + edge_discovered , combined_spec
659+ root_dir ,
660+ expansion_concepts ,
661+ changed_files + edge_discovered ,
662+ combined_spec ,
663+ candidate_files = all_candidate_files ,
664+ changed_files = changed_files ,
660665 )
661666 expanded_files = [_normalize_path (p , root_dir ) for p in expanded_files ]
662667 all_fragments .extend (_process_files_for_fragments (expanded_files , root_dir , preferred_revs , seen_frag_ids ))
@@ -745,6 +750,9 @@ def _coherence_post_pass(
745750 budget : int ,
746751) -> SelectionResult :
747752 selected_ids = {f .id for f in result .selected }
753+ interval_idx = _IntervalIndex ()
754+ for f in result .selected :
755+ interval_idx .add (f .id )
748756 remaining = budget - result .used_tokens
749757
750758 name_to_frags : dict [str , list [Fragment ]] = {}
@@ -758,9 +766,10 @@ def _coherence_post_pass(
758766 added : list [Fragment ] = []
759767 for name in dangling_names :
760768 pick = _pick_best_fragment (name_to_frags .get (name , []), selected_ids )
761- if pick and pick .token_count <= remaining and pick .id not in selected_ids :
769+ if pick and pick .token_count <= remaining and pick .id not in selected_ids and not interval_idx . overlaps ( pick ) :
762770 added .append (pick )
763771 selected_ids .add (pick .id )
772+ interval_idx .add (pick .id )
764773 remaining -= pick .token_count
765774
766775 if not added :
@@ -856,13 +865,43 @@ def _compute_seed_weights(
856865_MIN_LINES_FOR_SIGNATURE = 5
857866
858867
868+ def _count_brackets_outside_strings (line : str ) -> tuple [int , int , int , int ]:
869+ open_parens = 0
870+ close_parens = 0
871+ open_braces = 0
872+ close_braces = 0
873+ in_string : str | None = None
874+ prev = ""
875+ for ch in line :
876+ if in_string is not None :
877+ if ch == in_string and prev != "\\ " :
878+ in_string = None
879+ prev = ch
880+ continue
881+ if ch in ("'" , '"' , "`" ):
882+ in_string = ch
883+ prev = ch
884+ continue
885+ if ch == "(" :
886+ open_parens += 1
887+ elif ch == ")" :
888+ close_parens += 1
889+ elif ch == "{" :
890+ open_braces += 1
891+ elif ch == "}" :
892+ close_braces += 1
893+ prev = ch
894+ return open_parens , close_parens , open_braces , close_braces
895+
896+
def _find_signature_end(lines: list[str]) -> int:
    """Return the exclusive end index of the signature at the start of *lines*.

    Brackets are counted per line with ``_count_brackets_outside_strings`` so
    that parentheses and braces inside string literals are ignored.

    Args:
        lines: The lines of a fragment, starting at its first line.

    Returns:
        ``i + 1`` for the first line ``i`` that satisfies one of the stop
        conditions below, otherwise ``min(2, len(lines))``.
    """
    depth = 0
    for i, line in enumerate(lines):
        open_parens, close_parens, open_braces, close_braces = _count_brackets_outside_strings(line)

        # Stop on any line after the first once parentheses are balanced.
        depth += open_parens - close_parens
        if depth <= 0 and i > 0:
            return i + 1

        # Stop as soon as the running depth is positive after adding braces.
        # NOTE(review): parentheses and braces share one counter, so a "("
        # left open on the first line also makes this check true and 1 is
        # returned. Confirm whether multi-line signatures are meant to be
        # followed to their closing ")".
        depth += open_braces - close_braces
        if depth > 0:
            return i + 1

    return min(2, len(lines))
@@ -1063,9 +1102,36 @@ def _collect_candidate_files(
10631102 return fallback , False
10641103
10651104
1066- def _build_ident_index (files : list [Path ], concepts : frozenset [str ]) -> dict [str , list [Path ]]:
1105+ def _path_distance (a : Path , b : Path ) -> int :
1106+ a_parts = a .parent .parts
1107+ b_parts = b .parent .parts
1108+ common = 0
1109+ for x , y in zip (a_parts , b_parts ):
1110+ if x != y :
1111+ break
1112+ common += 1
1113+ return (len (a_parts ) - common ) + (len (b_parts ) - common )
1114+
1115+
1116+ def _build_ident_index (
1117+ files : list [Path ],
1118+ concepts : frozenset [str ],
1119+ changed_files : list [Path ] | None = None ,
1120+ ) -> dict [str , list [Path ]]:
1121+ if changed_files :
1122+ changed_dirs = {f .parent for f in changed_files }
1123+
1124+ def sort_key (p : Path ) -> tuple [int , int , str ]:
1125+ in_same_dir = 0 if p .parent in changed_dirs else 1
1126+ min_dist = min ((_path_distance (p , cf ) for cf in changed_files ), default = 0 )
1127+ return (in_same_dir , min_dist , str (p ))
1128+
1129+ prioritized = sorted (files , key = sort_key )[:2000 ]
1130+ else :
1131+ prioritized = sorted (files )[:2000 ]
1132+
10671133 inverted_index : dict [str , list [Path ]] = defaultdict (list )
1068- for file_path in sorted ( files )[: 2000 ] :
1134+ for file_path in prioritized :
10691135 try :
10701136 content = file_path .read_text (encoding = "utf-8" )
10711137 file_idents = extract_identifiers (content , skip_stopwords = False )
@@ -1197,13 +1263,18 @@ def _expand_universe_by_rare_identifiers(
11971263 concepts : frozenset [str ],
11981264 already_included : list [Path ],
11991265 combined_spec : pathspec .PathSpec ,
1266+ candidate_files : list [Path ] | None = None ,
1267+ changed_files : list [Path ] | None = None ,
12001268) -> list [Path ]:
12011269 if not concepts :
12021270 return []
12031271
12041272 included_set = set (already_included )
1205- files , _ = _collect_candidate_files (root_dir , included_set , combined_spec )
1206- inverted_index = _build_ident_index (files , concepts )
1273+ if candidate_files is not None :
1274+ files = [f for f in candidate_files if f not in included_set ]
1275+ else :
1276+ files , _ = _collect_candidate_files (root_dir , included_set , combined_spec )
1277+ inverted_index = _build_ident_index (files , concepts , changed_files = changed_files )
12071278 return _collect_expansion_files (inverted_index , concepts , included_set )
12081279
12091280
0 commit comments