From 679d22cc79e5c9e72344afb8f8c8af673cc9f23d Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Wed, 24 Jun 2026 00:13:48 +0300 Subject: [PATCH 1/2] fix(cli): erase removes graph/cocoindex.db/.graph_hashes.json by type (#346) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `erase` reported success but left code_graph.lbug on disk because its deletion was type-blind: shutil.rmtree silently no-ops on a regular file (code_graph.lbug) and Path.unlink raises IsADirectoryError on a directory (cocoindex.db), both swallowed; .graph_hashes.json was never targeted. The next init then refused (exit 2), deadlooping the documented `erase --yes` -> `init` clean-slate workflow. Replace the type-blind deletes with a _rm_any helper that dispatches on type (file/dir/symlink — a symlinked dir is unlinked, never recursed into, so the target is not followed), so both the file-backed and dir-backed LadybugDB layouts are handled. erase now also removes .graph_hashes.json and lists it in the "Will delete:" preview. Deletion failures are warned to stderr instead of swallowed, so erase no longer reports success while leaving an artifact behind (the same silent-failure class as #346). `reprocess` is unaffected: its full rebuild opens the existing .lbug and _drop_all()s every node + edge table in place, and _init_hash_tracker resets .graph_hashes.json — it never relies on the broken deletion. Tests: add an always-on regression that creates a real lbug-file / cocoindex.db-dir / hash-store layout and asserts erase removes all three; convert the false-green test_init_after_erase_succeeds into a real build -> erase -> re-init lifecycle check. 
Co-Authored-By: Claude --- docs/JAVA-CODEBASE-RAG-CLI.md | 2 +- java_codebase_rag/cli.py | 40 ++++++++++++++++++++----- tests/test_java_codebase_rag_cli.py | 46 +++++++++++++++++++++++++++-- 3 files changed, 77 insertions(+), 11 deletions(-) diff --git a/docs/JAVA-CODEBASE-RAG-CLI.md b/docs/JAVA-CODEBASE-RAG-CLI.md index 4e655f3..0ee1e4f 100644 --- a/docs/JAVA-CODEBASE-RAG-CLI.md +++ b/docs/JAVA-CODEBASE-RAG-CLI.md @@ -240,7 +240,7 @@ java-codebase-rag reprocess --source-root /path/to/java/repo --index-dir /path/t ### `erase` -Deletes cocoindex state, the LadybugDB directory, and Lance tables under the index dir. Requires **`--yes`** or interactive confirmation on a TTY. Non-TTY without `--yes` exits **2**. +Deletes cocoindex state, the LadybugDB graph (`code_graph.lbug`), the graph builder's content-hash store (`.graph_hashes.json`), and Lance tables under the index dir. Requires **`--yes`** or interactive confirmation on a TTY. Non-TTY without `--yes` exits **2**. ```bash java-codebase-rag erase --source-root /path/to/java/repo --index-dir /path/to/.java-codebase-rag --yes diff --git a/java_codebase_rag/cli.py b/java_codebase_rag/cli.py index 817cd08..47cf388 100644 --- a/java_codebase_rag/cli.py +++ b/java_codebase_rag/cli.py @@ -580,11 +580,33 @@ def _cmd_update(args: argparse.Namespace) -> int: ) +def _rm_any(path: Path) -> None: + """Remove ``path`` whether it is a regular file, directory, or symlink. + + ``code_graph.lbug`` is a single regular file in this repo, but kuzu may lay + the graph out as a directory; ``cocoindex.db`` is always a directory. + ``shutil.rmtree`` is a silent no-op on a regular file and ``Path.unlink`` + raises ``IsADirectoryError`` on a directory, so a type-blind delete left + index artifacts on disk (issue #346). A symlinked directory is unlinked, not + recursed into, so the link target is never followed. 
Failures are warned to + stderr rather than swallowed, so a removal that fails is visible to the user + instead of being silently ignored (the failure mode issue #346 reported). + """ + try: + if path.is_dir() and not path.is_symlink(): + shutil.rmtree(path) + elif path.exists() or path.is_symlink(): + path.unlink(missing_ok=True) + except OSError as exc: + print(f"warning: failed to remove {path}: {exc}", file=sys.stderr) + + def _cmd_erase(args: argparse.Namespace) -> int: cfg = _resolved_from_ns(args) _startup_hints(cfg) cfg.apply_to_os_environ() - to_describe: list[Path] = [cfg.ladybug_path, cfg.cocoindex_db] + graph_hashes_path = cfg.ladybug_path.parent / ".graph_hashes.json" + to_describe: list[Path] = [cfg.ladybug_path, cfg.cocoindex_db, graph_hashes_path] if cfg.index_dir.is_dir(): try: import lancedb @@ -621,13 +643,15 @@ def work(progress: "PipelineProgress | None") -> int: ) elif drop.returncode != 0: print(clip(drop.stderr, 4000), file=sys.stderr) - if cfg.ladybug_path.exists(): - shutil.rmtree(cfg.ladybug_path, ignore_errors=True) - if cfg.cocoindex_db.exists(): - try: - cfg.cocoindex_db.unlink() - except OSError: - pass + # Remove the LadybugDB graph, the cocoindex state store, and the graph + # builder's content-hash store. Each is removed by type (see _rm_any): + # code_graph.lbug is a file here but may be a dir under kuzu, while + # cocoindex.db is a directory — a type-blind delete silently no-oped on + # one or the other, and .graph_hashes.json was never targeted at all + # (issue #346). 
+ _rm_any(cfg.ladybug_path) + _rm_any(cfg.cocoindex_db) + _rm_any(graph_hashes_path) if cfg.index_dir.is_dir(): try: import lancedb diff --git a/tests/test_java_codebase_rag_cli.py b/tests/test_java_codebase_rag_cli.py index cbac80d..5f0b6e7 100644 --- a/tests/test_java_codebase_rag_cli.py +++ b/tests/test_java_codebase_rag_cli.py @@ -109,6 +109,35 @@ def test_cli_erase_succeeds_with_yes_flag(tmp_path: Path) -> None: assert proc.returncode == 0, proc.stderr + proc.stdout +def test_erase_removes_graph_file_cocoindex_dir_and_hash_store(tmp_path: Path) -> None: + """erase must delete code_graph.lbug (file), cocoindex.db (dir), .graph_hashes.json. + + Regression for issue #346: a type-blind delete left both on disk. + shutil.rmtree is a silent no-op on a regular file (code_graph.lbug), and + Path.unlink raises IsADirectoryError on cocoindex.db (a directory) — both + swallowed — and .graph_hashes.json was never targeted. The follow-up init + then refused because code_graph.lbug survived. + """ + idx = tmp_path / "erase_artifacts" + idx.mkdir() + # Real on-disk layout: graph is a single FILE, cocoindex state is a DIR. 
+ (idx / "code_graph.lbug").write_bytes(b"fake-kuzu-db") + (idx / "cocoindex.db").mkdir() + (idx / "cocoindex.db" / "state.json").write_text("{}", encoding="utf-8") + (idx / ".graph_hashes.json").write_text("{}", encoding="utf-8") + env = os.environ.copy() + env["JAVA_CODEBASE_RAG_INDEX_DIR"] = str(idx) + env["JAVA_CODEBASE_RAG_SOURCE_ROOT"] = str(tmp_path) + proc = _run_cli( + ["erase", "--source-root", str(tmp_path), "--index-dir", str(idx), "--yes"], + env=env, + ) + assert proc.returncode == 0, proc.stderr + proc.stdout + assert not (idx / "code_graph.lbug").exists(), "erase left code_graph.lbug on disk" + assert not (idx / "cocoindex.db").exists(), "erase left cocoindex.db/ on disk" + assert not (idx / ".graph_hashes.json").exists(), "erase left .graph_hashes.json on disk" + + def test_embedding_model_precedence_cli_over_env_over_yaml_over_default( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -377,21 +406,34 @@ def test_legacy_env_var_set_emits_stderr_hint(monkeypatch: pytest.MonkeyPatch, t @pytest.mark.skipif(not _cocoindex_available(), reason="cocoindex not installed in venv") def test_init_after_erase_succeeds(corpus_root: Path, tmp_path: Path) -> None: + """Build a real index, erase it, then init again from a clean slate. + + Regression for issue #346: the previous body erased an *empty* index dir and + then inited, so it never exercised "erase a real graph -> re-init" and stayed + green while erase silently left code_graph.lbug on disk. 
+ """ idx = tmp_path / "lifecycle_idx" idx.mkdir(parents=True) env = os.environ.copy() env["JAVA_CODEBASE_RAG_INDEX_DIR"] = str(idx) env["JAVA_CODEBASE_RAG_SOURCE_ROOT"] = str(corpus_root.resolve()) + init1 = _run_cli( + ["init", "--source-root", str(corpus_root), "--index-dir", str(idx), "--quiet"], + env=env, + ) + assert init1.returncode == 0, init1.stdout + init1.stderr + assert (idx / "code_graph.lbug").exists(), "init did not build code_graph.lbug" e1 = _run_cli( ["erase", "--source-root", str(corpus_root), "--index-dir", str(idx), "--yes"], env=env, ) assert e1.returncode == 0, e1.stderr - init = _run_cli( + assert not (idx / "code_graph.lbug").exists(), "erase left code_graph.lbug on disk" + init2 = _run_cli( ["init", "--source-root", str(corpus_root), "--index-dir", str(idx), "--quiet"], env=env, ) - assert init.returncode == 0, init.stdout + init.stderr + assert init2.returncode == 0, init2.stdout + init2.stderr @pytest.mark.skipif(not _cocoindex_available(), reason="cocoindex not installed in venv") From 338ba0cbabd89efd86b1124ecd02a9c536c405ed Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Wed, 24 Jun 2026 14:23:00 +0300 Subject: [PATCH 2/2] bump version to 0.6.6 Co-Authored-By: Claude --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 07ef5e1..fb91162 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "java-codebase-rag" -version = "0.6.5" +version = "0.6.6" description = "MCP server for semantic + structural search over Java codebases" readme = "README.md" requires-python = ">=3.11"