Skip to content

Commit f4ebf46

Browse files
HumanBean17claude
andauthored
fix(increment): stop endless full graph rebuild from stale hash entries (#314)
Two bugs in build_ast_graph.py fed an endless full-rebuild loop on every `java-codebase-rag increment` (the "7 removed files every run" symptom): 1. _init_hash_tracker (run by every full reprocess and by incremental fallback) did load()+update() and never removed hashes for files no longer on disk. Ghost entries were re-detected as "removed" on every run, sustaining the loop. 2. _write_clients_producers_and_calls built a default MemberEntry missing the required node_id field. Because dict.get(k, default) evaluates the default eagerly, the TypeError fired whenever ANY declares_client / declares_producer row existed, crashing every client-bearing incremental rebuild into a full fallback (which then preserved the ghosts via #1). Fix: rebuild the hash store fresh in _init_hash_tracker so stale entries are pruned; add node_id="" to the two MemberEntry defaults. Adds regression tests (builder-level + a CLI test for the reported reprocess --graph-only -> increment scenario, verified to fail without the fix). Co-authored-by: Claude <noreply@anthropic.com>
1 parent dd6aa51 commit f4ebf46

3 files changed

Lines changed: 206 additions & 4 deletions

File tree

build_ast_graph.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3668,10 +3668,17 @@ def incremental_rebuild(
36683668

36693669

36703670
def _init_hash_tracker(source_root: Path, ladybug_path: Path) -> int:
3671-
"""Initialize hash tracker for all Java files. Returns number of files hashed."""
3671+
"""Initialize hash tracker for all Java files. Returns number of files hashed.
3672+
3673+
Called right after a full graph rebuild (``write_ladybug``), so the store must
3674+
mirror exactly the files that were just indexed. We deliberately do NOT
3675+
``load()`` the existing store: ``update`` re-hashes every current file anyway,
3676+
and preserving old entries would leave stale hashes for files that no longer
3677+
exist (deleted or now-ignored). Those ghosts would be re-detected as "removed"
3678+
on every subsequent ``increment``, sustaining an endless full-rebuild loop.
3679+
"""
36723680
index_dir = ladybug_path.parent
36733681
tracker = FileHashTracker(index_dir)
3674-
tracker.load()
36753682
ignore = LayeredIgnore(source_root)
36763683
all_files: set[str] = set()
36773684
source_root_resolved = source_root.resolve()
@@ -3742,7 +3749,7 @@ def _write_clients_producers_and_calls(conn: ladybug.Connection, tables: GraphTa
37423749

37433750
# Write declares_client edges
37443751
for row in tables.declares_client_rows:
3745-
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="")).file_path
3752+
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="", node_id="")).file_path
37463753
conn.execute(_CREATE_DECLARES_CLIENT, {
37473754
"sid": row.symbol_id,
37483755
"cid": row.client_id,
@@ -3753,7 +3760,7 @@ def _write_clients_producers_and_calls(conn: ladybug.Connection, tables: GraphTa
37533760

37543761
# Write declares_producer edges
37553762
for row in tables.declares_producer_rows:
3756-
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="")).file_path
3763+
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="", node_id="")).file_path
37573764
conn.execute(_CREATE_DECLARES_PRODUCER, {
37583765
"sid": row.symbol_id,
37593766
"pid": row.producer_id,

tests/test_incremental_graph.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
"""
55
from __future__ import annotations
66

7+
import json
8+
import shutil
79
from pathlib import Path
810

911
import ladybug
@@ -979,3 +981,141 @@ def test_incremental_preserves_incoming_edges_to_dependent(self, tmp_path: Path)
979981
assert cb_after_count > 0, "out-of-scope C->B CALLS edge must be preserved"
980982

981983
conn.close()
984+
985+
986+
class TestIncrementalRegressions:
987+
"""Regression tests for the ``increment`` always-fully-reprocesses loop.
988+
989+
Two bugs fed the loop:
990+
1. ``_write_clients_producers_and_calls`` built a default ``MemberEntry``
991+
missing the required ``node_id`` field. Because ``dict.get(k, default)``
992+
evaluates ``default`` eagerly, the TypeError fired whenever ANY
993+
``declares_client`` / ``declares_producer`` row existed — crashing every
994+
client-bearing incremental rebuild into a full-rebuild fallback.
995+
2. ``_init_hash_tracker`` (run by every full reprocess AND by that fallback)
996+
did ``load()`` + ``update()`` and never pruned hashes for files no longer
997+
on disk, so ghost entries persisted and re-triggered the loop every run.
998+
"""
999+
1000+
def test_init_hash_tracker_prunes_stale_entries(self, tmp_path: Path) -> None:
1001+
"""A full rebuild drops hashes for files no longer on disk (ghost pruning).
1002+
1003+
Without pruning, a stale entry is re-detected as 'removed' on every
1004+
``increment``, sustaining an endless full-rebuild loop.
1005+
"""
1006+
from build_ast_graph import write_ladybug
1007+
1008+
source_root = tmp_path / "src"
1009+
source_root.mkdir()
1010+
(source_root / "A.java").write_text("package pkg; class A {}", encoding="utf-8")
1011+
index_dir = tmp_path / "index"
1012+
index_dir.mkdir()
1013+
ladybug_path = index_dir / "code_graph.lbug"
1014+
1015+
tables = GraphTables()
1016+
pass1_parse(source_root, tables, verbose=False)
1017+
write_ladybug(ladybug_path, tables, source_root=source_root, verbose=False)
1018+
1019+
# Inject a ghost hash for a file that does not exist on disk.
1020+
hash_file = index_dir / ".graph_hashes.json"
1021+
data = json.loads(hash_file.read_text(encoding="utf-8"))
1022+
data["ghost/Deleted.java"] = "0" * 64
1023+
hash_file.write_text(json.dumps(data), encoding="utf-8")
1024+
1025+
# A second full rebuild (what `reprocess --graph-only` does) re-runs
1026+
# _init_hash_tracker, which must drop the ghost.
1027+
tables2 = GraphTables()
1028+
pass1_parse(source_root, tables2, verbose=False)
1029+
write_ladybug(ladybug_path, tables2, source_root=source_root, verbose=False)
1030+
1031+
after = json.loads(hash_file.read_text(encoding="utf-8"))
1032+
assert "ghost/Deleted.java" not in after
1033+
assert "A.java" in after
1034+
1035+
def test_incremental_with_http_clients_does_not_fall_back(self, tmp_path: Path) -> None:
1036+
"""A corpus with Feign/Kafka clients/producers rebuilds incrementally.
1037+
1038+
``http_caller_smoke`` emits DECLARES_CLIENT / DECLARES_PRODUCER rows, so
1039+
the buggy eager ``MemberEntry`` default in
1040+
``_write_clients_producers_and_calls`` crashed here before the fix
1041+
(forcing full_fallback). After the fix: mode is "incremental".
1042+
"""
1043+
from _builders import build_ladybug_full_into
1044+
from build_ast_graph import incremental_rebuild
1045+
1046+
corpus = Path(__file__).parent / "fixtures" / "http_caller_smoke"
1047+
source_root = tmp_path / "src"
1048+
shutil.copytree(corpus, source_root)
1049+
index_dir = tmp_path / "index"
1050+
index_dir.mkdir()
1051+
ladybug_path = index_dir / "code_graph.lbug"
1052+
1053+
# Full build seeds .graph_hashes.json via write_ladybug -> _init_hash_tracker.
1054+
build_ladybug_full_into(source_root, ladybug_path)
1055+
1056+
# Mutate one file unrelated to the clients/producers.
1057+
target = source_root / "src" / "main" / "java" / "smoke" / "http" / "TopicNames.java"
1058+
target.write_text(target.read_text(encoding="utf-8") + "\n// edit\n", encoding="utf-8")
1059+
1060+
result = incremental_rebuild(source_root, ladybug_path, verbose=False)
1061+
assert result.mode == "incremental", (
1062+
f"expected incremental, got {result.mode!r} (the node_id crash in "
1063+
"_write_clients_producers_and_calls forces a full fallback)"
1064+
)
1065+
assert result.files_changed == 1
1066+
1067+
def test_reprocess_graph_only_then_increment_is_noop(self, tmp_path: Path) -> None:
1068+
"""The reported scenario at the builder level: a full graph rebuild (what
1069+
``reprocess --graph-only`` does) followed by ``increment`` with no source
1070+
changes must be a no-op, not a second full rebuild."""
1071+
from _builders import build_ladybug_full_into
1072+
from build_ast_graph import incremental_rebuild
1073+
1074+
corpus = Path(__file__).parent / "fixtures" / "http_caller_smoke"
1075+
source_root = tmp_path / "src"
1076+
shutil.copytree(corpus, source_root)
1077+
index_dir = tmp_path / "index"
1078+
index_dir.mkdir()
1079+
ladybug_path = index_dir / "code_graph.lbug"
1080+
1081+
# Simulate `reprocess --graph-only`: full rebuild seeds the hash store.
1082+
build_ladybug_full_into(source_root, ladybug_path)
1083+
1084+
# `increment` with no source changes.
1085+
result = incremental_rebuild(source_root, ladybug_path, verbose=False)
1086+
assert result.mode == "incremental"
1087+
assert (result.files_changed, result.files_added, result.files_removed) == (0, 0, 0)
1088+
1089+
def test_incremental_ghost_entry_then_next_run_is_noop(self, tmp_path: Path) -> None:
1090+
"""A ghost hash entry is detected as 'removed' once, processed by the
1091+
scoped path (which prunes it), so the following run is a clean no-op.
1092+
1093+
Guards both fixes together: the node_id fix lets the scoped path
1094+
complete, and that path prunes the ghost (lines that delete `removed`
1095+
hashes). Before the fixes this fell back to full and preserved the ghost.
1096+
"""
1097+
from _builders import build_ladybug_full_into
1098+
from build_ast_graph import incremental_rebuild
1099+
1100+
corpus = Path(__file__).parent / "fixtures" / "http_caller_smoke"
1101+
source_root = tmp_path / "src"
1102+
shutil.copytree(corpus, source_root)
1103+
index_dir = tmp_path / "index"
1104+
index_dir.mkdir()
1105+
ladybug_path = index_dir / "code_graph.lbug"
1106+
build_ladybug_full_into(source_root, ladybug_path)
1107+
1108+
# Inject a ghost (no source change).
1109+
hash_file = index_dir / ".graph_hashes.json"
1110+
data = json.loads(hash_file.read_text(encoding="utf-8"))
1111+
data["ghost/Gone.java"] = "0" * 64
1112+
hash_file.write_text(json.dumps(data), encoding="utf-8")
1113+
1114+
first = incremental_rebuild(source_root, ladybug_path, verbose=False)
1115+
assert first.mode == "incremental", f"expected incremental, got {first.mode!r}"
1116+
assert first.files_removed == 1
1117+
1118+
# The ghost must be gone, so the next run detects nothing.
1119+
second = incremental_rebuild(source_root, ladybug_path, verbose=False)
1120+
assert second.mode == "incremental"
1121+
assert (second.files_changed, second.files_added, second.files_removed) == (0, 0, 0)

tests/test_java_codebase_rag_cli.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,61 @@ def test_increment_first_run_falls_back_to_full(
544544

545545

546546

547+
def test_reprocess_graph_only_then_increment_graph_is_noop(
548+
corpus_root: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
549+
) -> None:
550+
"""The reported scenario, exercised through the real CLI wiring.
551+
552+
``reprocess --graph-only`` rebuilds the graph and seeds ``.graph_hashes.json``
553+
(via ``write_ladybug`` -> ``_init_hash_tracker``); the next ``increment``'s
554+
graph stage must be a no-op, NOT a second full rebuild.
555+
556+
cocoindex is stubbed so ``increment`` runs only its real graph stage (no
557+
embedding model needed). ``reprocess --graph-only`` needs no cocoindex
558+
regardless, so this test runs in the normal (non-heavy) suite.
559+
"""
560+
idx = tmp_path / "idx_reprocess_then_increment"
561+
idx.mkdir()
562+
monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx))
563+
monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(corpus_root))
564+
565+
rc = cli_mod.main(
566+
["reprocess", "--graph-only", "--source-root", str(corpus_root), "--index-dir", str(idx), "--quiet"],
567+
)
568+
assert rc == 0, "reprocess --graph-only must succeed"
569+
# reprocess --graph-only must seed the hash store.
570+
hash_file = idx / ".graph_hashes.json"
571+
assert hash_file.exists(), "hash store not seeded by reprocess --graph-only"
572+
573+
# Inject a ghost entry for a file that does not exist — the exact "N removed
574+
# files every run" symptom. On bank-chat (which has Feign/Kafka clients) the
575+
# scoped path this triggers reaches _write_clients_producers_and_calls, so a
576+
# missing-field MemberEntry default here used to crash into a full fallback.
577+
data = json.loads(hash_file.read_text(encoding="utf-8"))
578+
data["ghost/DoesNotExist.java"] = "0" * 64
579+
hash_file.write_text(json.dumps(data), encoding="utf-8")
580+
581+
# Stub cocoindex so increment exercises ONLY its graph stage.
582+
def _noop_coco(env, *, full_reprocess, quiet, verbose=True, lance_project_root=None):
583+
return subprocess.CompletedProcess(args=[], returncode=0, stdout="", stderr="")
584+
585+
monkeypatch.setattr(cli_mod, "run_cocoindex_update", _noop_coco)
586+
587+
buf_out = io.StringIO()
588+
buf_err = io.StringIO()
589+
with contextlib.redirect_stdout(buf_out), contextlib.redirect_stderr(buf_err):
590+
rc2 = cli_mod.main(
591+
["increment", "--source-root", str(corpus_root), "--index-dir", str(idx), "--quiet"],
592+
)
593+
assert rc2 == 0
594+
# The graph stage must NOT have fallen back to a full rebuild.
595+
assert "fell back to full graph rebuild" not in buf_err.getvalue()
596+
assert "increment completed (Lance + graph updated)" in buf_out.getvalue()
597+
# The ghost must be pruned, so the next increment is clean.
598+
after = json.loads(hash_file.read_text(encoding="utf-8"))
599+
assert "ghost/DoesNotExist.java" not in after
600+
601+
547602
@pytest.mark.skipif(not _cocoindex_available(), reason="cocoindex not installed in venv")
548603
def test_increment_updates_lance_after_touch_java_file(corpus_root: Path, tmp_path: Path) -> None:
549604
import lancedb # noqa: PLC0415

0 commit comments

Comments
 (0)