Skip to content

Commit bab840c

Browse files
committed
fix: address multiple diffctx bugs and tighten similarity thresholds
- Fix PPR iterator consumption bug (dict.values() exhausted after comprehension)
- Differentiate TS vs TSX parsing (use correct tree-sitter grammar)
- Fix Language double-wrapping in tree-sitter parser
- Add backtick support in bracket balancing for JS template strings
- Fix Go edge builder case mismatch in package name matching
- Replace rglob with git ls-files for faster candidate file collection
- Tighten lexical similarity: min_similarity 0.1->0.30, top_k 10->5
- Restrict config-to-code edges: require exact match, min key length 6
- Fix JSON fragmentation: handle files with no top-level keys
1 parent 86ef6af commit bab840c

File tree

17 files changed

+160
-71
lines changed

17 files changed

+160
-71
lines changed

src/treemapper/diffctx/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import logging
4+
import subprocess
45
from collections import defaultdict
56
from pathlib import Path
67
from typing import Any
@@ -340,6 +341,19 @@ def _is_candidate_file(file_path: Path, root_dir: Path, included_set: set[Path],
340341

341342

342343
def _collect_candidate_files(root_dir: Path, included_set: set[Path], combined_spec: pathspec.PathSpec) -> list[Path]:
344+
try:
345+
result = subprocess.run(
346+
["git", "ls-files", "-z"],
347+
cwd=root_dir,
348+
capture_output=True,
349+
text=True,
350+
timeout=30,
351+
)
352+
if result.returncode == 0 and result.stdout:
353+
files = [root_dir / f for f in result.stdout.split("\0") if f]
354+
return [f for f in files if _is_candidate_file(f, root_dir, included_set, combined_spec)]
355+
except (subprocess.SubprocessError, OSError):
356+
pass
343357
return [f for f in root_dir.rglob("*") if _is_candidate_file(f, root_dir, included_set, combined_spec)]
344358

345359

src/treemapper/diffctx/config/limits.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,15 @@ class PPRConfig:
2222

2323
@dataclass(frozen=True)
2424
class LexicalConfig:
25-
min_similarity: float = 0.1
25+
min_similarity: float = 0.30
2626
hub_percentile: float = 0.95
27-
top_k_neighbors: int = 10
28-
max_df_ratio: float = 0.20
29-
min_idf: float = 1.6
30-
max_postings: int = 200
31-
weight_min: float = 0.1
32-
weight_max: float = 0.2
33-
backward_factor: float = 0.7
27+
top_k_neighbors: int = 5
28+
max_df_ratio: float = 0.15
29+
min_idf: float = 2.0
30+
max_postings: int = 100
31+
weight_min: float = 0.05
32+
weight_max: float = 0.15
33+
backward_factor: float = 0.5
3434

3535

3636
@dataclass(frozen=True)

src/treemapper/diffctx/constants.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,6 @@
1414

1515

1616
def expand_config_key(key: str) -> set[str]:
17-
expanded: set[str] = {key}
18-
parts = key.split("_")
19-
for part in parts:
20-
if len(part) >= 3:
21-
expanded.add(part)
22-
for prefix in CONFIG_KEY_COMMON_PREFIXES:
23-
if key.startswith(prefix + "_") and len(key) > len(prefix) + 1:
24-
stripped = key[len(prefix) + 1 :]
25-
expanded.add(stripped)
26-
for sub in stripped.split("_"):
27-
if len(sub) >= 3:
28-
expanded.add(sub)
29-
return expanded
17+
if len(key) < 6:
18+
return set()
19+
return {key}

src/treemapper/diffctx/edges/config/cicd.py

Lines changed: 92 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import logging
34
import re
45
from pathlib import Path
56

@@ -18,8 +19,10 @@
1819
_JENKINS_SH_RE = re.compile(r"sh\s*(?:\(['\"]|['\"])(.+?)['\"]\)?", re.MULTILINE | re.DOTALL)
1920
_JENKINS_SCRIPT_RE = re.compile(r"script\s*\{([^}]+)\}", re.MULTILINE | re.DOTALL)
2021

21-
_SCRIPT_CALL_RE = re.compile(r"(?:bash|sh|python|python3|node|npm|yarn|pnpm|make|go|cargo|dotnet|mvn|gradle)\s+([^\s;&|]+)")
22-
_FILE_REF_RE = re.compile(r"(?:\.\/|scripts\/|bin\/|tools\/)([a-zA-Z0-9_.-]+(?:\.(?:sh|py|js|ts|rb))?)")
22+
_SCRIPT_CALL_RE = re.compile(
23+
r"(?:bash|sh|python|python3|node|npm|yarn|pnpm|make|go|cargo|dotnet|mvn|gradle|pytest|ruff|mypy|black|isort|flake8)\s+([^\s;&|]+)"
24+
)
25+
_FILE_REF_RE = re.compile(r"(?:\.\/|scripts\/|bin\/|tools\/|src\/|tests\/)([a-zA-Z0-9_.-]+(?:\.(?:sh|py|js|ts|rb))?)")
2326

2427

2528
def _is_github_actions(path: Path) -> bool:
@@ -49,6 +52,14 @@ def _is_azure_pipelines(path: Path) -> bool:
4952
return name in {"azure-pipelines.yml", "azure-pipelines.yaml"} or name.startswith("azure-pipeline")
5053

5154

55+
def _is_tox(path: Path) -> bool:
56+
return path.name.lower() == "tox.ini"
57+
58+
59+
def _is_nox(path: Path) -> bool:
60+
return path.name.lower() == "noxfile.py"
61+
62+
5263
def _is_ci_file(path: Path) -> bool:
5364
return any(
5465
[
@@ -58,6 +69,8 @@ def _is_ci_file(path: Path) -> bool:
5869
_is_circleci(path),
5970
_is_travis(path),
6071
_is_azure_pipelines(path),
72+
_is_tox(path),
73+
_is_nox(path),
6174
]
6275
)
6376

@@ -112,6 +125,63 @@ def _extract_jenkins_refs(content: str) -> set[str]:
112125
return refs
113126

114127

128+
def _extract_tox_refs(content: str) -> set[str]:
129+
refs: set[str] = set()
130+
# Extract deps
131+
for match in re.finditer(r"^\s*deps\s*=\s*(.+)$", content, re.MULTILINE):
132+
deps = match.group(1).split()
133+
refs.update(d.strip() for d in deps if d.strip())
134+
135+
# Extract commands
136+
for match in re.finditer(r"^\s*commands\s*=\s*(.+)$", content, re.MULTILINE):
137+
cmd = match.group(1)
138+
# Split by whitespace to find potential paths
139+
# Examples:
140+
# commands = pytest {posargs}
141+
# commands = ruff check src/
142+
# commands = python -m pytest tests/
143+
parts = cmd.split()
144+
for p in parts:
145+
# Strip quoting
146+
p = p.strip("'\"")
147+
# Remove tox specific vars like {posargs}
148+
p = re.sub(r"\{[^}]+\}", "", p)
149+
150+
if not p or p.startswith("-"):
151+
continue
152+
153+
# Heuristic: if it looks like a path (contains / or .py/.ini etc)
154+
# or if we are permissive and just add everything that looks like an ident
155+
# Given discover_files_by_refs filters candidates, being permissive is safer.
156+
refs.add(p)
157+
158+
refs.update(_extract_script_refs(cmd))
159+
160+
return refs
161+
162+
163+
def _extract_nox_refs(content: str) -> set[str]:
164+
refs: set[str] = set()
165+
# Extract session.run calls
166+
# session.run("cmd", "arg1", ...)
167+
for match in re.finditer(r"session\.run\(([^)]+)\)", content):
168+
args_str = match.group(1)
169+
# Simple extraction of string literals
170+
args = re.findall(r'["\']([^"\']+)["\']', args_str)
171+
for arg in args:
172+
refs.add(arg)
173+
refs.update(_extract_script_refs(arg))
174+
175+
# Extract session.install calls
176+
for match in re.finditer(r"session\.install\(([^)]+)\)", content):
177+
args_str = match.group(1)
178+
args = re.findall(r'["\']([^"\']+)["\']', args_str)
179+
for arg in args:
180+
refs.add(arg)
181+
182+
return refs
183+
184+
115185
class CICDEdgeBuilder(EdgeBuilder):
116186
weight = 0.55
117187
script_weight = 0.60
@@ -135,19 +205,29 @@ def discover_related_files(
135205
except (OSError, UnicodeDecodeError):
136206
continue
137207

208+
local_refs = set()
138209
if _is_github_actions(ci):
139-
refs.update(_extract_gha_refs(content))
210+
local_refs.update(_extract_gha_refs(content))
140211
elif _is_gitlab_ci(ci):
141-
refs.update(_extract_gitlab_refs(content))
212+
local_refs.update(_extract_gitlab_refs(content))
142213
elif _is_jenkinsfile(ci):
143-
refs.update(_extract_jenkins_refs(content))
214+
local_refs.update(_extract_jenkins_refs(content))
215+
elif _is_tox(ci):
216+
local_refs.update(_extract_tox_refs(content))
217+
elif _is_nox(ci):
218+
local_refs.update(_extract_nox_refs(content))
144219
else:
145-
refs.update(_extract_script_refs(content))
220+
local_refs.update(_extract_script_refs(content))
146221

147222
if any(cmd in content.lower() for cmd in ["npm", "yarn", "pnpm"]):
148-
refs.add("package.json")
223+
local_refs.add("package.json")
224+
225+
logging.debug("CICD refs for %s: %s", ci.name, local_refs)
226+
refs.update(local_refs)
149227

150-
return discover_files_by_refs(refs, changed_files, all_candidate_files)
228+
discovered = discover_files_by_refs(refs, changed_files, all_candidate_files)
229+
logging.debug("CICD discovered for %s: %s", [c.name for c in ci_files], [d.name for d in discovered])
230+
return discovered
151231

152232
def build(self, fragments: list[Fragment], repo_root: Path | None = None) -> EdgeDict:
153233
ci_frags = [f for f in fragments if _is_ci_file(f.path)]
@@ -171,6 +251,10 @@ def _extract_refs(self, ci: Fragment) -> set[str]:
171251
return _extract_gitlab_refs(ci.content)
172252
if _is_jenkinsfile(ci.path):
173253
return _extract_jenkins_refs(ci.content)
254+
if _is_tox(ci.path):
255+
return _extract_tox_refs(ci.content)
256+
if _is_nox(ci.path):
257+
return _extract_nox_refs(ci.content)
174258
return _extract_script_refs(ci.content)
175259

176260
def _link_refs(self, ci_id: FragmentId, refs: set[str], idx: FragmentIndex, edges: EdgeDict) -> None:

src/treemapper/diffctx/edges/config/generic.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,7 @@ def discover_related_files(
8686
def _build_key_patterns(self, keys: set[str]) -> dict[str, re.Pattern[str]]:
8787
patterns: dict[str, re.Pattern[str]] = {}
8888
for key in keys:
89-
if len(key) >= 4:
90-
patterns[key] = re.compile(rf"\b\w*{re.escape(key)}\w*\b", re.IGNORECASE)
91-
else:
89+
if len(key) >= 6:
9290
patterns[key] = re.compile(rf"\b{re.escape(key)}\b", re.IGNORECASE)
9391
return patterns
9492

src/treemapper/diffctx/edges/semantic/go.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def _build_indices(
8888
func_defs: dict[str, list[FragmentId]] = defaultdict(list)
8989

9090
for f in go_frags:
91-
pkg = _get_package_name(f.path)
91+
pkg = _get_package_name(f.path).lower()
9292
pkg_to_frags[pkg].append(f.id)
9393

9494
if repo_root:
@@ -143,7 +143,7 @@ def _link_import_by_package(
143143
pkg_to_frags: dict[str, list[FragmentId]],
144144
edges: EdgeDict,
145145
) -> None:
146-
imp_pkg = imp.split("/")[-1]
146+
imp_pkg = imp.split("/")[-1].lower()
147147
for pkg, frag_ids in pkg_to_frags.items():
148148
if pkg == imp_pkg:
149149
self.add_edges_from_ids(gf_id, frag_ids, self.import_weight, edges)
@@ -190,7 +190,7 @@ def _link_same_package(
190190
pkg_to_frags: dict[str, list[FragmentId]],
191191
edges: EdgeDict,
192192
) -> None:
193-
current_pkg = _get_package_name(gf.path)
193+
current_pkg = _get_package_name(gf.path).lower()
194194
for fid in pkg_to_frags.get(current_pkg, []):
195195
if fid != gf.id:
196196
self.add_edge(edges, gf.id, fid, self.same_package_weight)

src/treemapper/diffctx/languages.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,11 +164,11 @@
164164
".pyw": "python",
165165
".pyi": "python",
166166
".js": "javascript",
167-
".jsx": "javascript",
167+
".jsx": "jsx",
168168
".mjs": "javascript",
169169
".cjs": "javascript",
170170
".ts": "typescript",
171-
".tsx": "typescript",
171+
".tsx": "tsx",
172172
".mts": "typescript",
173173
".cts": "typescript",
174174
".go": "go",

src/treemapper/diffctx/parsers/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def _process_char_in_string(char: str, string_char: str, escape_count: int) -> t
3535

3636

3737
def _process_char_outside_string(char: str, stack: list[str]) -> tuple[bool, str]:
38-
if char in ('"', "'"):
38+
if char in ('"', "'", "`"):
3939
return True, char
4040
if char in _BRACKET_PAIRS:
4141
stack.append(_BRACKET_PAIRS[char])

src/treemapper/diffctx/parsers/config.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,15 +82,16 @@ def fragment(self, path: Path, content: str) -> list[Fragment]:
8282
elif suffix == ".toml":
8383
key_re = re.compile(r"^\[([a-zA-Z_][a-zA-Z0-9_.-]*)\]")
8484
else:
85-
key_re = re.compile(r'^\s{0,2}"([^"]+)":\s*')
85+
key_re = re.compile(r'^\s{0,4}"([^"]+)":\s*')
8686

8787
boundaries: list[int] = []
8888
for i, line in enumerate(lines):
8989
if key_re.match(line):
9090
boundaries.append(i)
9191

92-
if len(boundaries) < 2:
93-
return []
92+
if not boundaries:
93+
frag = create_fragment_from_lines(path, lines, 1, len(lines), "config", "data")
94+
return [frag] if frag else []
9495

9596
fragments: list[Fragment] = []
9697
boundaries.append(len(lines))

src/treemapper/diffctx/parsers/tree_sitter.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
_DEFINITION_NODE_TYPES = {
1414
"python": {"function_definition", "class_definition", "decorated_definition"},
1515
"javascript": {"function_declaration", "class_declaration", "method_definition", "arrow_function", "variable_declarator"},
16+
"jsx": {"function_declaration", "class_declaration", "method_definition", "arrow_function", "variable_declarator"},
1617
"typescript": {
1718
"function_declaration",
1819
"class_declaration",
@@ -23,6 +24,16 @@
2324
"enum_declaration",
2425
"variable_declarator",
2526
},
27+
"tsx": {
28+
"function_declaration",
29+
"class_declaration",
30+
"method_definition",
31+
"arrow_function",
32+
"interface_declaration",
33+
"type_alias_declaration",
34+
"enum_declaration",
35+
"variable_declarator",
36+
},
2637
"go": {"function_declaration", "method_declaration", "type_declaration"},
2738
"rust": {"function_item", "impl_item", "struct_item", "enum_item", "trait_item"},
2839
"java": {"method_declaration", "class_declaration", "interface_declaration", "enum_declaration"},
@@ -69,7 +80,9 @@
6980
_LANG_MODULES = {
7081
"python": "tree_sitter_python",
7182
"javascript": "tree_sitter_javascript",
83+
"jsx": "tree_sitter_javascript",
7284
"typescript": "tree_sitter_typescript",
85+
"tsx": "tree_sitter_typescript",
7386
"go": "tree_sitter_go",
7487
"rust": "tree_sitter_rust",
7588
"java": "tree_sitter_java",
@@ -98,15 +111,20 @@ def _get_parser(self, lang: str) -> Parser:
98111
module_name = _LANG_MODULES[lang]
99112
ts_lang_module = importlib.import_module(module_name)
100113

101-
if lang == "typescript":
114+
if lang == "tsx":
102115
ts_lang = ts_lang_module.language_tsx()
116+
elif lang == "typescript":
117+
ts_lang = ts_lang_module.language_typescript()
103118
elif hasattr(ts_lang_module, "language"):
104119
ts_lang = ts_lang_module.language()
105120
else:
106121
ts_lang = ts_lang_module
107122

108123
parser = Parser()
109-
parser.language = Language(ts_lang)
124+
if isinstance(ts_lang, Language):
125+
parser.language = ts_lang
126+
else:
127+
parser.language = Language(ts_lang)
110128
self._parsers[lang] = parser
111129
return parser
112130

0 commit comments

Comments
 (0)