Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions docs/CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@ A single file at the project root (the directory you pass as `--source-root`, or

# Index directory: where Lance tables, code_graph.kuzu, and cocoindex.db live.
# - Tilde (`~`) is expanded; `$VAR` is NOT (use absolute paths or `~`).
# - Relative paths resolve against source_root, not cwd.
# - Relative paths written in this file resolve against the directory that
# contains the config file (the same base used for a relative source_root),
# not cwd. When this key is omitted, the default ./.java-codebase-rag
# still sits beside the resolved source_root.
# - Env: JAVA_CODEBASE_RAG_INDEX_DIR. CLI: --index-dir. Default: ./.java-codebase-rag/
index_dir: ./.java-codebase-rag

Expand Down Expand Up @@ -217,15 +219,15 @@ async_producer_overrides:

| Field | Expanded? | Notes |
|---|---|---|
| `index_dir` | partial | `~` expanded; `$VAR` is NOT expanded. Relative paths resolve against `source_root`. |
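| `index_dir` | partial | `~` expanded; `$VAR` is NOT expanded. A relative path written in the YAML file resolves against the config file's directory (the same base used for a relative `source_root`); a relative `--index-dir` or `JAVA_CODEBASE_RAG_INDEX_DIR` value resolves against the resolved `source_root`; the default `./.java-codebase-rag` sits beside the resolved `source_root`. |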
| `embedding.model` (when path-shaped) | yes | Path-shape = starts with `/`, `./`, `../`, `~`, or contains `$`. Plain `org/name` is treated as a hub id and passed through. Applies to the value after CLI > env > YAML > default precedence. Long-lived MCP hosts also apply the same expansion when reading `SBERT_MODEL` from the process environment (so table metadata and search agree with `index_common` defaults). |
| `embedding.device` | n/a | Device strings (`cpu`, `cuda`, `mps`) aren't paths. |
| `microservice_roots[*]` | no | Each entry is a directory **name** relative to `source_root`, not an arbitrary path. |
| Brownfield `path:` / `topic:` values | no | These are URL paths and Kafka topic names, not filesystem paths. Literal characters preserved. |

**Tips & gotchas:**

- **The file must be at `source_root`**, not in `$HOME`. The MCP server reads `JAVA_CODEBASE_RAG_SOURCE_ROOT` to find it; the CLI uses `--source-root` (else cwd).
- **The config file may live anywhere under your project, including a subdirectory of the Java tree.** Both the CLI (`init` / `increment` / `reprocess`) and the MCP server walk up from cwd to find `.java-codebase-rag.yml`, then resolve the YAML `source_root` and `index_dir` values relative to the config file's directory. So a config file in `my-context/` that sets `source_root: ../` and `index_dir: ../.java-codebase-rag` resolves identically for the CLI and the MCP server. Keep the file under your project (not `$HOME`); set `JAVA_CODEBASE_RAG_SOURCE_ROOT` (MCP) or `--source-root` (CLI) only to override the discovered location.
- **Don't commit secrets** into this YAML — it sits next to your source tree and is read by every operator who clones it.
- **Rebuild after editing brownfield overrides.** Run a full `java-codebase-rag reprocess` (no flags) so Lance and Kuzu stay coherent, or use `--graph-only` / `--vectors-only` when you know only one store needs invalidation. Editing `embedding.model` requires a vector rebuild (`reprocess` or `--vectors-only`).
- **Diagnose what's loaded.** `java-codebase-rag meta` prints the resolved config and each value's `*_source` (`cli` / `env` / `yaml` / `default`) — see `embedding_model_source`, `embedding_device_source`, `index_dir_source`.
Expand Down
14 changes: 12 additions & 2 deletions java_codebase_rag/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,9 +306,19 @@ def _pick_bool(
def _resolve_index_dir_path(
*,
source_root: Path,
config_dir: Path,
cli_index_dir: str | None,
yaml_dict: dict[str, Any],
) -> tuple[Path, SettingSource]:
# Bases for relative paths:
# - YAML ``index_dir`` -> the config file's directory (``config_dir``),
# the SAME base used for YAML ``source_root``. Paths written in the
# config file are relative to the file, so both keys stay consistent.
# - CLI / env ``index_dir`` -> ``source_root`` (unchanged). These are not
# "in the config file"; preserving the existing base avoids a semantics
# change for operators who pass ``--index-dir`` on the command line.
# - Default ``./.java-codebase-rag`` -> ``source_root`` so the index sits
# beside the Java tree (the layout ``discover_project_root`` anchors on).
raw_cli = cli_index_dir.strip() if isinstance(cli_index_dir, str) else None
if raw_cli:
p = Path(raw_cli).expanduser()
Expand All @@ -324,7 +334,7 @@ def _resolve_index_dir_path(
idx = yaml_dict.get("index_dir")
if isinstance(idx, str) and idx.strip():
p = Path(idx.strip()).expanduser()
out = p.resolve() if p.is_absolute() else (source_root / p).resolve()
out = p.resolve() if p.is_absolute() else (config_dir / p).resolve()
return out, "yaml"

return (source_root / ".java-codebase-rag").resolve(), "default"
Expand Down Expand Up @@ -368,7 +378,7 @@ def resolve_operator_config(
root = config_dir

index_dir, index_src = _resolve_index_dir_path(
source_root=root, cli_index_dir=cli_index_dir, yaml_dict=yaml_dict
source_root=root, config_dir=config_dir, cli_index_dir=cli_index_dir, yaml_dict=yaml_dict
)
model, model_src = _pick_str(
cli_val=cli_embedding_model,
Expand Down
23 changes: 22 additions & 1 deletion server.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,27 @@ def _project_root() -> Path:
return discovered if discovered is not None else Path.cwd().resolve()


def _source_root_for_operator_config() -> Path | None:
"""``source_root`` arg to hand ``resolve_operator_config`` from the MCP server.

Returns ``JAVA_CODEBASE_RAG_SOURCE_ROOT`` when set (an explicit operator
override that wins and suppresses the YAML ``source_root`` field, exactly
like CLI ``--source-root``), otherwise ``None`` — so
``resolve_operator_config`` runs its OWN walk-up discovery and HONORS the
YAML ``source_root`` field, matching the CLI (``init`` / ``increment`` /
``reprocess``) path.

Do NOT pass ``_project_root()`` (the walk-up-discovered dir) here: a
non-``None`` value routes into the "explicit source root" branch that
skips the YAML ``source_root`` field, which made the MCP server and the
CLI resolve different ``source_root`` / ``index_dir`` from the same config
file (the init-vs-MCP index_dir divergence). ``_project_root()`` is kept
only for the ``_resolve_lancedb_uri()`` fallback below.
"""
env = os.environ.get("JAVA_CODEBASE_RAG_SOURCE_ROOT", "").strip()
return Path(env).expanduser().resolve() if env else None


def _cocoindex_subprocess_env(project_root: Path) -> dict[str, str]:
sub_env = os.environ.copy()
sub_env["JAVA_CODEBASE_RAG_SOURCE_ROOT"] = str(project_root)
Expand Down Expand Up @@ -654,7 +675,7 @@ def main() -> None:
# Load YAML config and apply embedding settings to environment
# This ensures SBERT_MODEL and SBERT_DEVICE from .java-codebase-rag.yml are available
# before any tool handler runs (same behavior as CLI path)
cfg = resolve_operator_config(source_root=_project_root())
cfg = resolve_operator_config(source_root=_source_root_for_operator_config())
cfg.apply_to_os_environ()
mcp_v2.set_hints_enabled(cfg.hints_enabled)

Expand Down
51 changes: 51 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,57 @@ def test_source_root_from_yaml_absolute(self, tmp_path, monkeypatch):
assert result.source_root == Path(absolute_path)


class TestIndexDirRelativeToConfigDir:
    """YAML ``index_dir`` must resolve against the config file's directory.

    ``source_root`` already resolves against the config dir (see
    ``TestSourceRootFromYaml``). ``index_dir`` must use the SAME base so a
    user can express both keys relative to the config file; otherwise a
    ``../`` in ``index_dir`` gets re-applied on top of the already-resolved
    ``source_root`` and overshoots by one level (the "init indexes ~/"
    symptom when the config lives in a subdirectory of the Java tree).
    """

    def test_yaml_index_dir_double_dot_resolves_against_config_dir(self, tmp_path, monkeypatch):
        """``index_dir: ../x`` is relative to the config file's directory, not source_root."""
        # Clear env overrides so only the YAML values drive the resolution.
        monkeypatch.delenv("JAVA_CODEBASE_RAG_INDEX_DIR", raising=False)
        monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False)

        config_dir = tmp_path / "my-project-context"
        config_dir.mkdir()
        (config_dir / YAML_CONFIG_FILENAMES[0]).write_text(
            "source_root: ../\nindex_dir: ../.java-codebase-rag\n"
        )
        # Walk-up discovery starts from cwd, so run from the config directory.
        monkeypatch.chdir(config_dir)

        result = resolve_operator_config(source_root=None)
        # source_root ../ -> tmp_path (one level above the config file)
        assert result.source_root == tmp_path
        # index_dir ../ -> tmp_path/.java-codebase-rag (one level above the config file),
        # NOT tmp_path.parent/.java-codebase-rag (which is what resolving against
        # the already-resolved source_root would produce).
        assert result.index_dir == (tmp_path / ".java-codebase-rag").resolve()

    def test_yaml_index_dir_bare_resolves_against_config_dir(self, tmp_path, monkeypatch):
        """``index_dir: x`` (no ``../``) sits next to the config file."""
        # Clear env overrides so only the YAML values drive the resolution.
        monkeypatch.delenv("JAVA_CODEBASE_RAG_INDEX_DIR", raising=False)
        monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False)

        config_dir = tmp_path / "my-project-context"
        config_dir.mkdir()
        (config_dir / YAML_CONFIG_FILENAMES[0]).write_text(
            "source_root: ../\nindex_dir: .java-codebase-rag\n"
        )
        monkeypatch.chdir(config_dir)

        result = resolve_operator_config(source_root=None)
        assert result.source_root == tmp_path
        # Bare path resolves against the config dir, so the index sits beside
        # the config file, NOT beside source_root.
        assert result.index_dir == (config_dir / ".java-codebase-rag").resolve()
        assert result.index_dir_source == "yaml"


class TestSourceRootPrecedence:
"""Tests for source_root precedence chain."""

Expand Down
58 changes: 57 additions & 1 deletion tests/test_mcp_server_project_root.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Tests for server.py _project_root() function in the MCP server context."""

from java_codebase_rag.config import YAML_CONFIG_FILENAMES
from java_codebase_rag.config import YAML_CONFIG_FILENAMES, resolve_operator_config


class TestProjectRoot:
Expand All @@ -23,3 +23,59 @@ def test_project_root_uses_discover_when_env_unset(self, tmp_path, monkeypatch):

result = _project_root()
assert result == tmp_path


class TestSourceRootForOperatorConfig:
    """The MCP server must honor the YAML ``source_root`` field like the CLI.

    ``main()`` passes ``_source_root_for_operator_config()`` (not the
    walk-up-discovered dir) as the ``source_root`` arg to
    ``resolve_operator_config``. When the env override is unset that is
    ``None``, which routes through the walk-up branch that APPLIES the YAML
    ``source_root`` field. Passing the discovered dir instead would route into
    the "explicit source root" branch and silently ignore the YAML field,
    diverging the MCP server from ``init``/``increment``/``reprocess``.
    """

    def test_returns_none_when_env_unset(self, tmp_path, monkeypatch):
        """With no env override the helper yields ``None``."""
        monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False)
        # Imported inside the test, after the environment has been patched.
        from server import _source_root_for_operator_config

        assert _source_root_for_operator_config() is None

    def test_returns_env_path_when_set(self, tmp_path, monkeypatch):
        """A set env override is returned as a resolved path."""
        explicit = tmp_path / "explicit-root"
        explicit.mkdir()
        monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(explicit))
        from server import _source_root_for_operator_config

        assert _source_root_for_operator_config() == explicit.resolve()

    def test_mcp_and_init_resolve_identically_for_nested_config(self, tmp_path, monkeypatch):
        """Regression for the init-vs-MCP index_dir divergence.

        Config lives in a subdirectory of the Java tree (``my-project-context/``)
        and points both ``source_root`` and ``index_dir`` one level up. The MCP
        server (env unset) and the CLI must resolve the SAME source_root and
        index_dir, landing on the real index at ``tmp_path/.java-codebase-rag``.
        """
        monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False)
        monkeypatch.delenv("JAVA_CODEBASE_RAG_INDEX_DIR", raising=False)

        config_dir = tmp_path / "my-project-context"
        config_dir.mkdir()
        (config_dir / YAML_CONFIG_FILENAMES[0]).write_text(
            "source_root: ../\nindex_dir: ../.java-codebase-rag\n"
        )
        # Walk-up discovery starts from cwd, so run from the config directory.
        monkeypatch.chdir(config_dir)

        from server import _source_root_for_operator_config

        # MCP path: whatever the server helper returns (``None`` here).
        mcp = resolve_operator_config(source_root=_source_root_for_operator_config())
        # CLI path: no explicit ``--source-root``.
        cli = resolve_operator_config(source_root=None)

        assert mcp.source_root == tmp_path
        assert mcp.index_dir == (tmp_path / ".java-codebase-rag").resolve()
        assert mcp.source_root == cli.source_root
        assert mcp.index_dir == cli.index_dir

Loading