Skip to content

Commit 5970a1f

Browse files
committed
fix: broad correctness sweep across parsers, edges, pipeline and utilities
1 parent 906c253 commit 5970a1f

File tree

31 files changed

+389
-67
lines changed

31 files changed

+389
-67
lines changed

src/treemapper/diffctx/__init__.py

Lines changed: 81 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
from .languages import get_language_for_file
3434
from .ppr import personalized_pagerank
3535
from .render import build_diff_context_output
36-
from .select import SelectionResult, lazy_greedy_select
36+
from .select import SelectionResult, _IntervalIndex, lazy_greedy_select
3737
from .types import DiffHunk, Fragment, FragmentId, extract_identifiers
3838
from .utility import concepts_from_diff_text, needs_from_diff
3939

@@ -205,7 +205,7 @@ def _truncate_generated_fragments(file_frags: list[Fragment]) -> list[Fragment]:
205205
truncated_content = "\n".join(lines) + f"\n# ... [{remaining} more lines]"
206206
truncated.append(
207207
Fragment(
208-
id=FragmentId(frag.path, frag.start_line, frag.start_line + len(lines) - 1),
208+
id=FragmentId(frag.path, frag.start_line, frag.start_line + len(lines)),
209209
kind=frag.kind,
210210
content=truncated_content,
211211
identifiers=extract_identifiers(truncated_content),
@@ -656,7 +656,12 @@ def build_diff_context(
656656

657657
if not is_large_repo:
658658
expanded_files = _expand_universe_by_rare_identifiers(
659-
root_dir, expansion_concepts, changed_files + edge_discovered, combined_spec
659+
root_dir,
660+
expansion_concepts,
661+
changed_files + edge_discovered,
662+
combined_spec,
663+
candidate_files=all_candidate_files,
664+
changed_files=changed_files,
660665
)
661666
expanded_files = [_normalize_path(p, root_dir) for p in expanded_files]
662667
all_fragments.extend(_process_files_for_fragments(expanded_files, root_dir, preferred_revs, seen_frag_ids))
@@ -745,6 +750,9 @@ def _coherence_post_pass(
745750
budget: int,
746751
) -> SelectionResult:
747752
selected_ids = {f.id for f in result.selected}
753+
interval_idx = _IntervalIndex()
754+
for f in result.selected:
755+
interval_idx.add(f.id)
748756
remaining = budget - result.used_tokens
749757

750758
name_to_frags: dict[str, list[Fragment]] = {}
@@ -758,9 +766,10 @@ def _coherence_post_pass(
758766
added: list[Fragment] = []
759767
for name in dangling_names:
760768
pick = _pick_best_fragment(name_to_frags.get(name, []), selected_ids)
761-
if pick and pick.token_count <= remaining and pick.id not in selected_ids:
769+
if pick and pick.token_count <= remaining and pick.id not in selected_ids and not interval_idx.overlaps(pick):
762770
added.append(pick)
763771
selected_ids.add(pick.id)
772+
interval_idx.add(pick.id)
764773
remaining -= pick.token_count
765774

766775
if not added:
@@ -856,13 +865,43 @@ def _compute_seed_weights(
856865
_MIN_LINES_FOR_SIGNATURE = 5
857866

858867

868+
def _count_brackets_outside_strings(line: str) -> tuple[int, int, int, int]:
    """Count round and curly brackets in *line*, ignoring quoted text.

    A quoted region starts at ``'``, ``"`` or a backtick and ends at the next
    unescaped occurrence of the same quote character. Quote state is not
    carried across lines: every call starts outside a string.

    Args:
        line: A single line of source text.

    Returns:
        ``(open_parens, close_parens, open_braces, close_braces)`` counted
        only for characters that are outside quoted regions.
    """
    counts = {"(": 0, ")": 0, "{": 0, "}": 0}
    in_string: str | None = None
    # True when the previous character inside a string was an unconsumed
    # backslash. Tracking this as state (rather than peeking at the previous
    # character) keeps `"\\"` from being misread: the second backslash is
    # itself escaped, so the quote that follows really does end the string.
    escaped = False
    for ch in line:
        if in_string is not None:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == in_string:
                in_string = None
            continue
        if ch in ("'", '"', "`"):
            in_string = ch
        elif ch in counts:
            counts[ch] += 1
    return counts["("], counts[")"], counts["{"], counts["}"]
895+
896+
859897
def _find_signature_end(lines: list[str]) -> int:
860898
depth = 0
861899
for i, line in enumerate(lines):
862-
depth += line.count("(") - line.count(")")
900+
op, cp, ob, cb = _count_brackets_outside_strings(line)
901+
depth += op - cp
863902
if depth <= 0 and i > 0:
864903
return i + 1
865-
depth += line.count("{") - line.count("}")
904+
depth += ob - cb
866905
if depth > 0:
867906
return i + 1
868907
return min(2, len(lines))
@@ -1063,9 +1102,36 @@ def _collect_candidate_files(
10631102
return fallback, False
10641103

10651104

1066-
def _build_ident_index(files: list[Path], concepts: frozenset[str]) -> dict[str, list[Path]]:
1105+
def _path_distance(a: Path, b: Path) -> int:
    """Return how many directory components separate the parents of *a* and *b*.

    The distance is the number of components left in each parent directory
    after removing their shared leading prefix; files in the same directory
    have distance 0.
    """
    dirs_a = a.parent.parts
    dirs_b = b.parent.parts
    limit = min(len(dirs_a), len(dirs_b))
    shared = 0
    # Walk the common leading prefix of both directory chains.
    while shared < limit and dirs_a[shared] == dirs_b[shared]:
        shared += 1
    return len(dirs_a) + len(dirs_b) - 2 * shared
1114+
1115+
1116+
def _build_ident_index(
1117+
files: list[Path],
1118+
concepts: frozenset[str],
1119+
changed_files: list[Path] | None = None,
1120+
) -> dict[str, list[Path]]:
1121+
if changed_files:
1122+
changed_dirs = {f.parent for f in changed_files}
1123+
1124+
def sort_key(p: Path) -> tuple[int, int, str]:
1125+
in_same_dir = 0 if p.parent in changed_dirs else 1
1126+
min_dist = min((_path_distance(p, cf) for cf in changed_files), default=0)
1127+
return (in_same_dir, min_dist, str(p))
1128+
1129+
prioritized = sorted(files, key=sort_key)[:2000]
1130+
else:
1131+
prioritized = sorted(files)[:2000]
1132+
10671133
inverted_index: dict[str, list[Path]] = defaultdict(list)
1068-
for file_path in sorted(files)[:2000]:
1134+
for file_path in prioritized:
10691135
try:
10701136
content = file_path.read_text(encoding="utf-8")
10711137
file_idents = extract_identifiers(content, skip_stopwords=False)
@@ -1197,13 +1263,18 @@ def _expand_universe_by_rare_identifiers(
11971263
concepts: frozenset[str],
11981264
already_included: list[Path],
11991265
combined_spec: pathspec.PathSpec,
1266+
candidate_files: list[Path] | None = None,
1267+
changed_files: list[Path] | None = None,
12001268
) -> list[Path]:
12011269
if not concepts:
12021270
return []
12031271

12041272
included_set = set(already_included)
1205-
files, _ = _collect_candidate_files(root_dir, included_set, combined_spec)
1206-
inverted_index = _build_ident_index(files, concepts)
1273+
if candidate_files is not None:
1274+
files = [f for f in candidate_files if f not in included_set]
1275+
else:
1276+
files, _ = _collect_candidate_files(root_dir, included_set, combined_spec)
1277+
inverted_index = _build_ident_index(files, concepts, changed_files=changed_files)
12071278
return _collect_expansion_files(inverted_index, concepts, included_set)
12081279

12091280

src/treemapper/diffctx/config/extensions.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@
4444
{
4545
".lua",
4646
".r",
47-
".R",
4847
".jl",
4948
".pl",
5049
".pm",

src/treemapper/diffctx/config/patterns.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"c_include": re.compile(r'^\s*#include\s*[<"]([^>"]+)[>"]', re.MULTILINE),
2020
"ruby_require": re.compile(r"""^\s*require(?:_relative)?\s+['"]([^'"]+)['"]""", re.MULTILINE),
2121
"php_use": re.compile(r"^\s*use\s+([A-Z][\w\\]*)", re.MULTILINE),
22-
"shell_source": re.compile(r"""^\s*(?:source|\\.)\s+['"]?([^'"\s]+)['"]?""", re.MULTILINE),
22+
"shell_source": re.compile(r"""^\s*(?:source|\.)\s+['"]?([^'"\s]+)['"]?""", re.MULTILINE),
2323
"swift_import": re.compile(r"^\s*import\s+([A-Za-z_]\w*)", re.MULTILINE),
2424
}
2525

src/treemapper/diffctx/config/weights.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,14 @@ class LangWeights:
9090
"kotlin": LangWeights(0.80, 0.85, 0.75, 0.12, 0.18),
9191
"scala": LangWeights(0.80, 0.85, 0.75, 0.12, 0.18),
9292
"go": LangWeights(0.80, 0.85, 0.75, 0.12, 0.20),
93+
"c": LangWeights(0.60, 0.65, 0.55, 0.15, 0.25),
94+
"cpp": LangWeights(0.65, 0.70, 0.60, 0.15, 0.25),
95+
"csharp": LangWeights(0.75, 0.80, 0.70, 0.12, 0.20),
96+
"fsharp": LangWeights(0.70, 0.75, 0.65, 0.12, 0.20),
97+
"ruby": LangWeights(0.60, 0.65, 0.55, 0.15, 0.25),
98+
"php": LangWeights(0.60, 0.65, 0.55, 0.15, 0.25),
99+
"shell": LangWeights(0.40, 0.45, 0.35, 0.20, 0.30),
100+
"swift": LangWeights(0.75, 0.80, 0.70, 0.12, 0.20),
93101
}
94102

95103
DEFAULT_LANG_WEIGHTS = LangWeights(0.55, 0.60, 0.50, 0.15, 0.25)

src/treemapper/diffctx/edges/base.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,10 @@ def _matches_any_ref(candidate_name: str, candidate_rel: str, refs: set[str]) ->
104104
ref_lower = ref.lower()
105105
if len(ref_lower) >= _MIN_REF_LENGTH_FOR_PATH_MATCH and ref_lower in candidate_rel:
106106
idx = candidate_rel.index(ref_lower)
107-
if idx == 0 or candidate_rel[idx - 1] in "/\\":
107+
end_idx = idx + len(ref_lower)
108+
if (idx == 0 or candidate_rel[idx - 1] in "/\\") and (
109+
end_idx == len(candidate_rel) or candidate_rel[end_idx] in "/\\."
110+
):
108111
return True
109112
return False
110113

src/treemapper/diffctx/edges/config/cicd.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
logger = logging.getLogger(__name__)
1212

1313
_GHA_RUN_RE = re.compile(r"^\s{0,20}-?\s{0,5}run:\s{0,5}[|>]?\s{0,5}([^\n]{1,500})", re.MULTILINE)
14+
_GHA_RUN_BLOCK_RE = re.compile(r"run:\s*[|>]-?\s*\n((?:\s{2,}[^\n]*\n?)+)", re.MULTILINE)
1415

1516
_GITLAB_SCRIPT_RE = re.compile(
1617
r"^\s{0,20}(?:script|before_script|after_script):\s?\n((?:\s{1,20}-\s{0,5}[^\n]{1,500}\n){1,100})", re.MULTILINE
@@ -47,7 +48,7 @@
4748
"flake8",
4849
}
4950
)
50-
_SCRIPT_CALL_RE = re.compile(r"(?:" + "|".join(sorted(_SCRIPT_CALL_TOOLS, key=len, reverse=True)) + r")\s+([^\s;&|]+)")
51+
_SCRIPT_CALL_RE = re.compile(r"\b(?:" + "|".join(sorted(_SCRIPT_CALL_TOOLS, key=len, reverse=True)) + r")\s+([^\s;&|]+)")
5152
_PKG_MANAGER_SUBCOMMANDS = frozenset(
5253
{
5354
"run",
@@ -158,6 +159,10 @@ def _extract_gha_refs(content: str) -> set[str]:
158159
run_content = match.group(1)
159160
refs.update(_extract_script_refs(run_content))
160161

162+
for match in _GHA_RUN_BLOCK_RE.finditer(content):
163+
block_content = match.group(1)
164+
refs.update(_extract_script_refs(block_content))
165+
161166
return refs
162167

163168

src/treemapper/diffctx/edges/config/docker.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717
_COMPOSE_BUILD_RE = re.compile(r"^\s+build:\s*['\"]?([^'\"#\n]+)", re.MULTILINE)
1818
_COMPOSE_CONTEXT_RE = re.compile(r"^\s+context:\s*['\"]?([^'\"#\n]+)", re.MULTILINE)
1919
_COMPOSE_DOCKERFILE_RE = re.compile(r"^\s+dockerfile:\s*['\"]?([^'\"#\n]+)", re.MULTILINE)
20-
_COMPOSE_DEPENDS_RE = re.compile(r"depends_on:\s*\n((?:\s+-\s*\w+\s*\n)+)", re.MULTILINE)
21-
_COMPOSE_SERVICE_RE = re.compile(r"^(\w+):\s*$", re.MULTILINE)
2220
_COMPOSE_VOLUME_RE = re.compile(r"^\s+-\s*['\"]?([./][^:'\"\n]+):", re.MULTILINE)
2321

2422

src/treemapper/diffctx/edges/config/generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from ...types import Fragment
88
from ..base import EdgeBuilder, EdgeDict
99

10-
_CONFIG_EXTENSIONS = {".yaml", ".yml", ".json", ".toml", ".ini", ".env"}
10+
_CONFIG_EXTENSIONS = {".yaml", ".yml", ".json", ".toml", ".ini", ".cfg", ".conf", ".xml", ".properties", ".env"}
1111

1212
_CONFIG_KEY_STOPWORDS = frozenset(
1313
{

src/treemapper/diffctx/edges/document/citation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def build(self, fragments: list[Fragment], repo_root: Path | None = None) -> Edg
2626
continue
2727
hub = frag_ids[0]
2828
for other in frag_ids[1:]:
29-
edges[(hub, other)] = self.weight
30-
edges[(other, hub)] = self.weight
29+
edges[(hub, other)] = max(edges.get((hub, other), 0.0), self.weight)
30+
edges[(other, hub)] = max(edges.get((other, hub), 0.0), self.weight)
3131

3232
return edges

src/treemapper/diffctx/edges/semantic/c_family.py

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,74 @@
3535

3636
_METHOD_IMPL_RE = re.compile(r"^\s*(?:[\w:]+\s+)?(\w+)::(\w+)\s*\(", re.MULTILINE)
3737

38+
_C_KEYWORDS = frozenset(
39+
{
40+
"if",
41+
"for",
42+
"while",
43+
"switch",
44+
"case",
45+
"return",
46+
"sizeof",
47+
"typeof",
48+
"alignof",
49+
"static_assert",
50+
"do",
51+
"else",
52+
"goto",
53+
"break",
54+
"continue",
55+
"default",
56+
"register",
57+
"volatile",
58+
"extern",
59+
"typedef",
60+
"auto",
61+
"inline",
62+
"restrict",
63+
"noexcept",
64+
"decltype",
65+
"nullptr",
66+
"throw",
67+
"try",
68+
"catch",
69+
"delete",
70+
"new",
71+
"template",
72+
"namespace",
73+
"using",
74+
"operator",
75+
}
76+
)
77+
78+
_C_COMMON_MACROS = frozenset(
79+
{
80+
"NULL",
81+
"TRUE",
82+
"FALSE",
83+
"BOOL",
84+
"DWORD",
85+
"HANDLE",
86+
"VOID",
87+
"HRESULT",
88+
"LPCTSTR",
89+
"LPCSTR",
90+
"LPWSTR",
91+
"INT",
92+
"UINT",
93+
"LONG",
94+
"ULONG",
95+
"WORD",
96+
"BYTE",
97+
"CHAR",
98+
"SHORT",
99+
"EOF",
100+
"SIZE_MAX",
101+
"INT_MAX",
102+
"INT_MIN",
103+
}
104+
)
105+
38106

39107
def _is_c_family(path: Path) -> bool:
40108
return path.suffix.lower() in _ALL_C_FAMILY
@@ -56,7 +124,10 @@ def _extract_definitions(content: str) -> tuple[set[str], set[str], set[str]]:
56124
namespaces: set[str] = set()
57125

58126
for match in _FUNC_DEF_RE.finditer(content):
59-
functions.add(match.group(1))
127+
name = match.group(1)
128+
if name in _C_KEYWORDS:
129+
continue
130+
functions.add(name)
60131

61132
for pattern in [_CLASS_RE, _TYPEDEF_RE, _USING_TYPE_RE, _ENUM_RE]:
62133
for match in pattern.finditer(content):
@@ -79,11 +150,15 @@ def _extract_references(content: str, own_defs: set[str]) -> tuple[set[str], set
79150

80151
for match in _FUNC_CALL_RE.finditer(content):
81152
name = match.group(1)
153+
if name in _C_KEYWORDS:
154+
continue
82155
if name not in own_defs and not name.startswith("_") and len(name) > 2:
83156
calls.add(name)
84157

85158
for match in _TYPE_REF_RE.finditer(content):
86159
name = match.group(1)
160+
if name in _C_COMMON_MACROS:
161+
continue
87162
if name not in own_defs and len(name) > 2:
88163
type_refs.add(name)
89164

0 commit comments

Comments
 (0)