nikolay-e
diff --git a/‎src/treemapper/diffctx/__init__.py‎
Lines changed: 81 additions & 10 deletions b/‎src/treemapper/diffctx/__init__.py‎
Lines changed: 81 additions & 10 deletions
diff --git a/‎src/treemapper/diffctx/config/extensions.py‎
Lines changed: 0 additions & 1 deletion b/‎src/treemapper/diffctx/config/extensions.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/treemapper/diffctx/config/patterns.py‎
Lines changed: 1 addition & 1 deletion b/‎src/treemapper/diffctx/config/patterns.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/treemapper/diffctx/config/weights.py‎
Lines changed: 8 additions & 0 deletions b/‎src/treemapper/diffctx/config/weights.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/treemapper/diffctx/edges/base.py‎
Lines changed: 4 additions & 1 deletion b/‎src/treemapper/diffctx/edges/base.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎src/treemapper/diffctx/edges/config/cicd.py‎
Lines changed: 6 additions & 1 deletion b/‎src/treemapper/diffctx/edges/config/cicd.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎src/treemapper/diffctx/edges/config/docker.py‎
Lines changed: 0 additions & 2 deletions b/‎src/treemapper/diffctx/edges/config/docker.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎src/treemapper/diffctx/edges/config/generic.py‎
Lines changed: 1 addition & 1 deletion b/‎src/treemapper/diffctx/edges/config/generic.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/treemapper/diffctx/edges/document/citation.py‎
Lines changed: 2 additions & 2 deletions b/‎src/treemapper/diffctx/edges/document/citation.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/treemapper/diffctx/edges/semantic/c_family.py‎
Lines changed: 76 additions & 1 deletion b/‎src/treemapper/diffctx/edges/semantic/c_family.py‎
Lines changed: 76 additions & 1 deletion
@@ -33,7 +33,7 @@
 from .languages import get_language_for_file
 from .ppr import personalized_pagerank
 from .render import build_diff_context_output
-from .select import SelectionResult, lazy_greedy_select
+from .select import SelectionResult, _IntervalIndex, lazy_greedy_select
 from .types import DiffHunk, Fragment, FragmentId, extract_identifiers
 from .utility import concepts_from_diff_text, needs_from_diff
 
@@ -205,7 +205,7 @@ def _truncate_generated_fragments(file_frags: list[Fragment]) -> list[Fragment]:
         truncated_content = "\n".join(lines) + f"\n# ... [{remaining} more lines]"
         truncated.append(
             Fragment(
-                id=FragmentId(frag.path, frag.start_line, frag.start_line + len(lines) - 1),
+                id=FragmentId(frag.path, frag.start_line, frag.start_line + len(lines)),
                 kind=frag.kind,
                 content=truncated_content,
                 identifiers=extract_identifiers(truncated_content),
@@ -656,7 +656,12 @@ def build_diff_context(
 
     if not is_large_repo:
         expanded_files = _expand_universe_by_rare_identifiers(
-            root_dir, expansion_concepts, changed_files + edge_discovered, combined_spec
+            root_dir,
+            expansion_concepts,
+            changed_files + edge_discovered,
+            combined_spec,
+            candidate_files=all_candidate_files,
+            changed_files=changed_files,
         )
         expanded_files = [_normalize_path(p, root_dir) for p in expanded_files]
         all_fragments.extend(_process_files_for_fragments(expanded_files, root_dir, preferred_revs, seen_frag_ids))
@@ -745,6 +750,9 @@ def _coherence_post_pass(
     budget: int,
 ) -> SelectionResult:
     selected_ids = {f.id for f in result.selected}
+    interval_idx = _IntervalIndex()
+    for f in result.selected:
+        interval_idx.add(f.id)
     remaining = budget - result.used_tokens
 
     name_to_frags: dict[str, list[Fragment]] = {}
@@ -758,9 +766,10 @@ def _coherence_post_pass(
     added: list[Fragment] = []
     for name in dangling_names:
         pick = _pick_best_fragment(name_to_frags.get(name, []), selected_ids)
-        if pick and pick.token_count <= remaining and pick.id not in selected_ids:
+        if pick and pick.token_count <= remaining and pick.id not in selected_ids and not interval_idx.overlaps(pick):
             added.append(pick)
             selected_ids.add(pick.id)
+            interval_idx.add(pick.id)
             remaining -= pick.token_count
 
     if not added:
@@ -856,13 +865,43 @@ def _compute_seed_weights(
 _MIN_LINES_FOR_SIGNATURE = 5
 
 
+def _count_brackets_outside_strings(line: str) -> tuple[int, int, int, int]:
+    """Count parentheses and braces in *line* that are not inside a string literal.
+
+    Text enclosed in single quotes, double quotes or backticks is skipped.
+    String state is not carried across calls: every line starts outside a
+    string, and an unterminated quote hides the rest of that line.
+
+    Returns:
+        ``(open_parens, close_parens, open_braces, close_braces)``.
+    """
+    counts = {"(": 0, ")": 0, "{": 0, "}": 0}
+    in_string: str | None = None
+    escaped = False
+    for ch in line:
+        if in_string is None:
+            if ch in ("'", '"', "`"):
+                in_string = ch
+            elif ch in counts:
+                counts[ch] += 1
+        elif escaped:
+            escaped = False
+        elif ch == "\\":
+            # A toggle (not a look-behind) lets "\\" + quote close the string.
+            escaped = True
+        elif ch == in_string:
+            in_string = None
+    return counts["("], counts[")"], counts["{"], counts["}"]
+
+
 def _find_signature_end(lines: list[str]) -> int:
     depth = 0
     for i, line in enumerate(lines):
-        depth += line.count("(") - line.count(")")
+        op, cp, ob, cb = _count_brackets_outside_strings(line)
+        depth += op - cp
         if depth <= 0 and i > 0:
             return i + 1
-        depth += line.count("{") - line.count("}")
+        depth += ob - cb
         if depth > 0:
             return i + 1
     return min(2, len(lines))
@@ -1063,9 +1102,36 @@ def _collect_candidate_files(
     return fallback, False
 
 
-def _build_ident_index(files: list[Path], concepts: frozenset[str]) -> dict[str, list[Path]]:
+def _path_distance(a: Path, b: Path) -> int:
+    """Return how many directory steps separate the parent folders of *a* and *b*."""
+    a_dirs, b_dirs = a.parent.parts, b.parent.parts
+    shared = 0
+    limit = min(len(a_dirs), len(b_dirs))
+    while shared < limit and a_dirs[shared] == b_dirs[shared]:
+        shared += 1
+    # Climb from a's folder to the shared prefix, then descend to b's folder.
+    return len(a_dirs) + len(b_dirs) - 2 * shared
+
+
+def _build_ident_index(
+    files: list[Path],
+    concepts: frozenset[str],
+    changed_files: list[Path] | None = None,
+) -> dict[str, list[Path]]:
+    if changed_files:
+        changed_dirs = {f.parent for f in changed_files}
+
+        def sort_key(p: Path) -> tuple[int, int, str]:
+            in_same_dir = 0 if p.parent in changed_dirs else 1
+            min_dist = min((_path_distance(p, cf) for cf in changed_files), default=0)
+            return (in_same_dir, min_dist, str(p))
+
+        prioritized = sorted(files, key=sort_key)[:2000]
+    else:
+        prioritized = sorted(files)[:2000]
+
     inverted_index: dict[str, list[Path]] = defaultdict(list)
-    for file_path in sorted(files)[:2000]:
+    for file_path in prioritized:
         try:
             content = file_path.read_text(encoding="utf-8")
             file_idents = extract_identifiers(content, skip_stopwords=False)
@@ -1197,13 +1263,18 @@ def _expand_universe_by_rare_identifiers(
     concepts: frozenset[str],
     already_included: list[Path],
     combined_spec: pathspec.PathSpec,
+    candidate_files: list[Path] | None = None,
+    changed_files: list[Path] | None = None,
 ) -> list[Path]:
     if not concepts:
         return []
 
     included_set = set(already_included)
-    files, _ = _collect_candidate_files(root_dir, included_set, combined_spec)
-    inverted_index = _build_ident_index(files, concepts)
+    if candidate_files is not None:
+        files = [f for f in candidate_files if f not in included_set]
+    else:
+        files, _ = _collect_candidate_files(root_dir, included_set, combined_spec)
+    inverted_index = _build_ident_index(files, concepts, changed_files=changed_files)
     return _collect_expansion_files(inverted_index, concepts, included_set)
 
 
 
@@ -44,7 +44,6 @@
     {
         ".lua",
         ".r",
-        ".R",
         ".jl",
         ".pl",
         ".pm",
 
@@ -19,7 +19,7 @@
     "c_include": re.compile(r'^\s*#include\s*[<"]([^>"]+)[>"]', re.MULTILINE),
     "ruby_require": re.compile(r"""^\s*require(?:_relative)?\s+['"]([^'"]+)['"]""", re.MULTILINE),
     "php_use": re.compile(r"^\s*use\s+([A-Z][\w\\]*)", re.MULTILINE),
-    "shell_source": re.compile(r"""^\s*(?:source|\\.)\s+['"]?([^'"\s]+)['"]?""", re.MULTILINE),
+    "shell_source": re.compile(r"""^\s*(?:source|\.)\s+['"]?([^'"\s]+)['"]?""", re.MULTILINE),
     "swift_import": re.compile(r"^\s*import\s+([A-Za-z_]\w*)", re.MULTILINE),
 }
 
 
@@ -90,6 +90,14 @@ class LangWeights:
     "kotlin": LangWeights(0.80, 0.85, 0.75, 0.12, 0.18),
     "scala": LangWeights(0.80, 0.85, 0.75, 0.12, 0.18),
     "go": LangWeights(0.80, 0.85, 0.75, 0.12, 0.20),
+    "c": LangWeights(0.60, 0.65, 0.55, 0.15, 0.25),
+    "cpp": LangWeights(0.65, 0.70, 0.60, 0.15, 0.25),
+    "csharp": LangWeights(0.75, 0.80, 0.70, 0.12, 0.20),
+    "fsharp": LangWeights(0.70, 0.75, 0.65, 0.12, 0.20),
+    "ruby": LangWeights(0.60, 0.65, 0.55, 0.15, 0.25),
+    "php": LangWeights(0.60, 0.65, 0.55, 0.15, 0.25),
+    "shell": LangWeights(0.40, 0.45, 0.35, 0.20, 0.30),
+    "swift": LangWeights(0.75, 0.80, 0.70, 0.12, 0.20),
 }
 
 DEFAULT_LANG_WEIGHTS = LangWeights(0.55, 0.60, 0.50, 0.15, 0.25)
@@ -104,7 +104,10 @@ def _matches_any_ref(candidate_name: str, candidate_rel: str, refs: set[str]) ->
         ref_lower = ref.lower()
         if len(ref_lower) >= _MIN_REF_LENGTH_FOR_PATH_MATCH and ref_lower in candidate_rel:
             idx = candidate_rel.index(ref_lower)
-            if idx == 0 or candidate_rel[idx - 1] in "/\\":
+            end_idx = idx + len(ref_lower)
+            if (idx == 0 or candidate_rel[idx - 1] in "/\\") and (
+                end_idx == len(candidate_rel) or candidate_rel[end_idx] in "/\\."
+            ):
                 return True
     return False
 
 
@@ -11,6 +11,7 @@
 logger = logging.getLogger(__name__)
 
 _GHA_RUN_RE = re.compile(r"^\s{0,20}-?\s{0,5}run:\s{0,5}[|>]?\s{0,5}([^\n]{1,500})", re.MULTILINE)
+_GHA_RUN_BLOCK_RE = re.compile(r"run:\s*[|>]-?\s*\n((?:\s{2,}[^\n]*\n?)+)", re.MULTILINE)
 
 _GITLAB_SCRIPT_RE = re.compile(
     r"^\s{0,20}(?:script|before_script|after_script):\s?\n((?:\s{1,20}-\s{0,5}[^\n]{1,500}\n){1,100})", re.MULTILINE
@@ -47,7 +48,7 @@
         "flake8",
     }
 )
-_SCRIPT_CALL_RE = re.compile(r"(?:" + "|".join(sorted(_SCRIPT_CALL_TOOLS, key=len, reverse=True)) + r")\s+([^\s;&|]+)")
+_SCRIPT_CALL_RE = re.compile(r"\b(?:" + "|".join(sorted(_SCRIPT_CALL_TOOLS, key=len, reverse=True)) + r")\s+([^\s;&|]+)")
 _PKG_MANAGER_SUBCOMMANDS = frozenset(
     {
         "run",
@@ -158,6 +159,10 @@ def _extract_gha_refs(content: str) -> set[str]:
         run_content = match.group(1)
         refs.update(_extract_script_refs(run_content))
 
+    for match in _GHA_RUN_BLOCK_RE.finditer(content):
+        block_content = match.group(1)
+        refs.update(_extract_script_refs(block_content))
+
     return refs
 
 
 
@@ -17,8 +17,6 @@
 _COMPOSE_BUILD_RE = re.compile(r"^\s+build:\s*['\"]?([^'\"#\n]+)", re.MULTILINE)
 _COMPOSE_CONTEXT_RE = re.compile(r"^\s+context:\s*['\"]?([^'\"#\n]+)", re.MULTILINE)
 _COMPOSE_DOCKERFILE_RE = re.compile(r"^\s+dockerfile:\s*['\"]?([^'\"#\n]+)", re.MULTILINE)
-_COMPOSE_DEPENDS_RE = re.compile(r"depends_on:\s*\n((?:\s+-\s*\w+\s*\n)+)", re.MULTILINE)
-_COMPOSE_SERVICE_RE = re.compile(r"^(\w+):\s*$", re.MULTILINE)
 _COMPOSE_VOLUME_RE = re.compile(r"^\s+-\s*['\"]?([./][^:'\"\n]+):", re.MULTILINE)
 
 
 
@@ -7,7 +7,7 @@
 from ...types import Fragment
 from ..base import EdgeBuilder, EdgeDict
 
-_CONFIG_EXTENSIONS = {".yaml", ".yml", ".json", ".toml", ".ini", ".env"}
+_CONFIG_EXTENSIONS = {".yaml", ".yml", ".json", ".toml", ".ini", ".cfg", ".conf", ".xml", ".properties", ".env"}
 
 _CONFIG_KEY_STOPWORDS = frozenset(
     {
 
@@ -26,7 +26,7 @@ def build(self, fragments: list[Fragment], repo_root: Path | None = None) -> Edg
                 continue
             hub = frag_ids[0]
             for other in frag_ids[1:]:
-                edges[(hub, other)] = self.weight
-                edges[(other, hub)] = self.weight
+                edges[(hub, other)] = max(edges.get((hub, other), 0.0), self.weight)
+                edges[(other, hub)] = max(edges.get((other, hub), 0.0), self.weight)
 
         return edges
@@ -35,6 +35,74 @@
 
 _METHOD_IMPL_RE = re.compile(r"^\s*(?:[\w:]+\s+)?(\w+)::(\w+)\s*\(", re.MULTILINE)
 
+_C_KEYWORDS = frozenset(  # names skipped when _FUNC_DEF_RE / _FUNC_CALL_RE match them
+    {
+        "if",
+        "for",
+        "while",
+        "switch",
+        "case",
+        "return",
+        "sizeof",
+        "typeof",
+        "alignof",
+        "static_assert",
+        "do",
+        "else",
+        "goto",
+        "break",
+        "continue",
+        "default",
+        "register",
+        "volatile",
+        "extern",
+        "typedef",
+        "auto",
+        "inline",
+        "restrict",
+        "noexcept",
+        "decltype",
+        "nullptr",
+        "throw",
+        "try",
+        "catch",
+        "delete",
+        "new",
+        "template",
+        "namespace",
+        "using",
+        "operator",
+    }
+)
+
+_C_COMMON_MACROS = frozenset(  # names skipped when _TYPE_REF_RE matches them
+    {
+        "NULL",
+        "TRUE",
+        "FALSE",
+        "BOOL",
+        "DWORD",
+        "HANDLE",
+        "VOID",
+        "HRESULT",
+        "LPCTSTR",
+        "LPCSTR",
+        "LPWSTR",
+        "INT",
+        "UINT",
+        "LONG",
+        "ULONG",
+        "WORD",
+        "BYTE",
+        "CHAR",
+        "SHORT",
+        "EOF",
+        "SIZE_MAX",
+        "INT_MAX",
+        "INT_MIN",
+    }
+)
+
 
 def _is_c_family(path: Path) -> bool:
     return path.suffix.lower() in _ALL_C_FAMILY
@@ -56,7 +124,10 @@ def _extract_definitions(content: str) -> tuple[set[str], set[str], set[str]]:
     namespaces: set[str] = set()
 
     for match in _FUNC_DEF_RE.finditer(content):
-        functions.add(match.group(1))
+        name = match.group(1)
+        if name in _C_KEYWORDS:
+            continue
+        functions.add(name)
 
     for pattern in [_CLASS_RE, _TYPEDEF_RE, _USING_TYPE_RE, _ENUM_RE]:
         for match in pattern.finditer(content):
@@ -79,11 +150,15 @@ def _extract_references(content: str, own_defs: set[str]) -> tuple[set[str], set
 
     for match in _FUNC_CALL_RE.finditer(content):
         name = match.group(1)
+        if name in _C_KEYWORDS:
+            continue
         if name not in own_defs and not name.startswith("_") and len(name) > 2:
             calls.add(name)
 
     for match in _TYPE_REF_RE.finditer(content):
         name = match.group(1)
+        if name in _C_COMMON_MACROS:
+            continue
         if name not in own_defs and len(name) > 2:
             type_refs.add(name)
Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,6 @@`
`44`	`44`	`{`
`45`	`45`	`".lua",`
`46`	`46`	`".r",`
`47`		`- ".R",`
`48`	`47`	`".jl",`
`49`	`48`	`".pl",`
`50`	`49`	`".pm",`
Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@`
`19`	`19`	`"c_include": re.compile(r'^\s*#include\s*[<"]([^>"]+)[>"]', re.MULTILINE),`
`20`	`20`	`"ruby_require": re.compile(r"""^\s*require(?:_relative)?\s+['"]([^'"]+)['"]""", re.MULTILINE),`
`21`	`21`	`"php_use": re.compile(r"^\s*use\s+([A-Z][\w\\]*)", re.MULTILINE),`
`22`		`- "shell_source": re.compile(r"""^\s*(?:source\|\\.)\s+['"]?([^'"\s]+)['"]?""", re.MULTILINE),`
	`22`	`+ "shell_source": re.compile(r"""^\s*(?:source\|\.)\s+['"]?([^'"\s]+)['"]?""", re.MULTILINE),`
`23`	`23`	`"swift_import": re.compile(r"^\s*import\s+([A-Za-z_]\w*)", re.MULTILINE),`
`24`	`24`	`}`
`25`	`25`
Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@`
`7`	`7`	`from ...types import Fragment`
`8`	`8`	`from ..base import EdgeBuilder, EdgeDict`
`9`	`9`
`10`		`-_CONFIG_EXTENSIONS = {".yaml", ".yml", ".json", ".toml", ".ini", ".env"}`
	`10`	`+_CONFIG_EXTENSIONS = {".yaml", ".yml", ".json", ".toml", ".ini", ".cfg", ".conf", ".xml", ".properties", ".env"}`
`11`	`11`
`12`	`12`	`_CONFIG_KEY_STOPWORDS = frozenset(`
`13`	`13`	`{`