Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions java_index_flow_lancedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,21 @@
PROJECT_ROOT = coco.ContextKey[Path]("java_lance_project_root")
LANCE_DB = coco.ContextKey("java_lance_async_conn")
EMBEDDER = coco.ContextKey[SentenceTransformerEmbedder]("java_lance_embedder")
IGNORE = coco.ContextKey[LayeredIgnore]("java_lance_layered_ignore")
elif "tracked" in _ck_params:
PROJECT_ROOT = coco.ContextKey[Path]("java_lance_project_root", tracked=False)
LANCE_DB = coco.ContextKey("java_lance_async_conn", tracked=False)
EMBEDDER = coco.ContextKey[SentenceTransformerEmbedder](
"java_lance_embedder", tracked=False
)
IGNORE = coco.ContextKey[LayeredIgnore](
"java_lance_layered_ignore", tracked=False
)
else:
PROJECT_ROOT = coco.ContextKey[Path]("java_lance_project_root")
LANCE_DB = coco.ContextKey("java_lance_async_conn")
EMBEDDER = coco.ContextKey[SentenceTransformerEmbedder]("java_lance_embedder")
IGNORE = coco.ContextKey[LayeredIgnore]("java_lance_layered_ignore")

splitter = RecursiveSplitter()

Expand Down Expand Up @@ -292,6 +297,7 @@ async def coco_lifespan(builder: coco.EnvironmentBuilder) -> AsyncIterator[None]
trust_remote_code=True,
)
builder.provide(EMBEDDER, embedder)
builder.provide(IGNORE, LayeredIgnore(root))

uri = str(index_dir)

Expand Down Expand Up @@ -348,7 +354,8 @@ async def process_java_file(
) -> None:
embedder = coco.use_context(EMBEDDER)
project_root = coco.use_context(PROJECT_ROOT)
if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve()):
ignore = coco.use_context(IGNORE)
if ignore.is_ignored((project_root / file.file_path.path).resolve()):
return
try:
content = await file.read_text()
Expand Down Expand Up @@ -420,7 +427,8 @@ async def process_sql_file(
) -> None:
embedder = coco.use_context(EMBEDDER)
project_root = coco.use_context(PROJECT_ROOT)
if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve()):
ignore = coco.use_context(IGNORE)
if ignore.is_ignored((project_root / file.file_path.path).resolve()):
return
try:
content = await file.read_text()
Expand Down Expand Up @@ -468,7 +476,8 @@ async def process_yaml_file(
) -> None:
embedder = coco.use_context(EMBEDDER)
project_root = coco.use_context(PROJECT_ROOT)
if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve()):
ignore = coco.use_context(IGNORE)
if ignore.is_ignored((project_root / file.file_path.path).resolve()):
return
try:
content = await file.read_text()
Expand Down
10 changes: 9 additions & 1 deletion path_filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ def __init__(
_scan_negation_any_bundle_ignore(self.project_root)
or (use_gitignore and _scan_negation_any_gitignore(self.project_root))
)
self._mega_cache: dict[str, tuple[list[str], GitIgnoreSpec, list[tuple[str, Path | None, int, str]]]] = {}

def cocoindex_excluded_patterns(self) -> list[str]:
"""Patterns for CocoIndex ``PatternFilePathMatcher.excluded_patterns``.
Expand Down Expand Up @@ -332,6 +333,11 @@ def _path_for_display(self, path: Path | None) -> str:
return path.as_posix()

def _mega(self, rel_project: str) -> tuple[list[str], GitIgnoreSpec, list[tuple[str, Path | None, int, str]]]:
# Cache by directory (parent of rel_project). _mega_build_for_rel reads only dir_parts,
# so files in the same directory share the same mega/spec/meta tuple.
cache_key = Path(rel_project).parent.as_posix()
if cache_key in self._mega_cache:
return self._mega_cache[cache_key]
mega, meta = _mega_build_for_rel(
self.project_root,
rel_project,
Expand All @@ -340,7 +346,9 @@ def _mega(self, rel_project: str) -> tuple[list[str], GitIgnoreSpec, list[tuple[
project_ignore_path=self._project_ignore_path,
project_lines=self._project_lines,
)
return mega, GitIgnoreSpec.from_lines(mega), meta
result = (mega, GitIgnoreSpec.from_lines(mega), meta)
self._mega_cache[cache_key] = result
return result

def is_ignored(self, path: Path) -> bool:
"""Return whether ``path`` is ignored by any configured layer.
Expand Down
41 changes: 41 additions & 0 deletions tests/test_lancedb_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,3 +345,44 @@ async def test_search_returns_multiple_hits(lance_index: Path, monkeypatch) -> N
)
assert out["success"] is True
assert len(out["results"]) >= 1


def test_layered_ignore_provided_once_per_flow() -> None:
    """Check the IGNORE wiring of the flow module by inspecting its source text.

    The checks are purely textual and are run against
    ``java_index_flow_lancedb.py`` in the parent of this test's directory:

    * ``builder.provide(IGNORE,`` appears exactly once;
    * ``coco.use_context(IGNORE)`` appears exactly three times;
    * no line containing both ``def process_`` and ``file(`` is followed,
      within the next ten lines, by the legacy
      ``LayeredIgnore(project_root).is_ignored`` call.

    A source-level check is used because exercising ``coco_lifespan``
    in-process would require stubbing the embedder/LanceDB setup, and a
    subprocess run cannot be instrumented to count ``LayeredIgnore``
    constructions. The test is skipped when the flow file is absent.
    """
    flow_path = Path(__file__).resolve().parent.parent / "java_index_flow_lancedb.py"
    if not flow_path.is_file():
        pytest.skip(f"Flow file not found: {flow_path}")

    text = flow_path.read_text(encoding="utf-8")

    # Exactly one provider call is expected for the IGNORE key.
    n_provide = text.count("builder.provide(IGNORE,")
    assert n_provide == 1, f"Expected 1 builder.provide(IGNORE,) call, found {n_provide}"

    # Exactly three consumer calls are expected for the IGNORE key.
    n_use = text.count("coco.use_context(IGNORE)")
    assert n_use == 3, f"Expected 3 coco.use_context(IGNORE) calls, found {n_use}"

    # Heuristic scan: the definition line plus the ten lines after it must not
    # contain the per-call LayeredIgnore construction that IGNORE replaces.
    legacy_call = "LayeredIgnore(project_root).is_ignored"
    src_lines = text.split("\n")
    for idx, src_line in enumerate(src_lines):
        if "def process_" not in src_line or "file(" not in src_line:
            continue
        window = "\n".join(src_lines[idx:idx + 11])
        if legacy_call in window:
            # Report the 1-based line number of the function definition.
            pytest.fail(f"Found LayeredIgnore(project_root).is_ignored in process_*_file at line {idx + 1}")
96 changes: 96 additions & 0 deletions tests/test_path_filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,99 @@ def test_unconditional_prune_dirs_remain_pruned_anywhere(tmp_path: Path) -> None
li = LayeredIgnore(root, use_gitignore=False)
files = list(iter_java_source_files(root, ignore=li))
assert files == []


def test_is_ignored_mega_caches_by_directory(tmp_path: Path) -> None:
    """Files in the same directory must share a single ``_mega_cache`` entry.

    Three ``Generated*.java`` files are created, two under ``src/main`` and one
    under ``src/test``, with a project-level ignore rule matching all of them.
    After querying each file once, the cache is expected to hold exactly the
    two keys ``"src/main"`` and ``"src/test"``.
    """
    root = tmp_path / "p"
    root.mkdir()
    ignore_file = root / ".java-codebase-rag" / "ignore"
    ignore_file.parent.mkdir(parents=True)
    ignore_file.write_text("**/Generated*.java\n", encoding="utf-8")

    main_dir = root / "src" / "main"
    main_dir.mkdir(parents=True)
    test_dir = root / "src" / "test"
    test_dir.mkdir(parents=True)

    foo = main_dir / "GeneratedFoo.java"
    foo.write_text("class GeneratedFoo {}\n", encoding="utf-8")
    bar = main_dir / "GeneratedBar.java"
    bar.write_text("class GeneratedBar {}\n", encoding="utf-8")
    gen_test = test_dir / "GeneratedTest.java"
    gen_test.write_text("class GeneratedTest {}\n", encoding="utf-8")

    li = LayeredIgnore(root, use_gitignore=False)

    # Start from an empty cache so the final key set reflects only the
    # three queries below.
    li._mega_cache.clear()

    # foo populates "src/main", bar reuses it, gen_test populates "src/test".
    for candidate in (foo, bar, gen_test):
        assert li.is_ignored(candidate) is True

    # One entry per parent directory, and nothing else.
    assert set(li._mega_cache) == {"src/main", "src/test"}


def test_layered_ignore_memo_preserves_decisions(tmp_path: Path) -> None:
    """A reused ``LayeredIgnore`` must agree with freshly built instances.

    The corpus combines a project-root ignore rule, a nested ignore file that
    negates it under ``svc/``, and a root ``.gitignore`` pattern. Each path is
    queried on one long-lived instance and on a brand-new instance created
    just for that path, and the verdicts are compared.

    Note that the per-path instances are not cache-free: each one simply
    starts with its own empty cache.
    """
    root = tmp_path / "p"
    root.mkdir()

    def _write(path: Path, text: str) -> Path:
        # Every file written through this helper lives in a directory that
        # does not exist yet, so a plain mkdir(parents=True) is sufficient.
        path.parent.mkdir(parents=True)
        path.write_text(text, encoding="utf-8")
        return path

    # Project root ignores every Generated*.java file.
    _write(root / ".java-codebase-rag" / "ignore", "**/Generated*.java\n")
    # The nested ignore file under svc/ re-includes them via negation.
    _write(root / "svc" / ".java-codebase-rag" / "ignore", "!**/Generated*.java\n")
    # The root .gitignore contributes an unrelated pattern.
    (root / ".gitignore").write_text("**/customout/**\n", encoding="utf-8")

    negated_hit = _write(root / "svc" / "src" / "GeneratedFoo.java", "class GeneratedFoo {}\n")
    gitignore_hit = _write(root / "svc" / "customout" / "X.java", "class X {}\n")
    root_rule_hit = _write(root / "other" / "src" / "GeneratedBar.java", "class GeneratedBar {}\n")

    expectations = (
        (negated_hit, False),   # negated by the nested ignore file
        (gitignore_hit, True),  # matched by the .gitignore pattern
        (root_rule_hit, True),  # matched by the project-root pattern
    )

    # One shared instance answers all three queries.
    shared = LayeredIgnore(root, use_gitignore=True)
    for path, verdict in expectations:
        assert shared.is_ignored(path) is verdict

    # A brand-new instance per path simulates the previous per-call construction.
    per_path = []
    for path, verdict in expectations:
        fresh = LayeredIgnore(root, use_gitignore=True)
        assert fresh.is_ignored(path) is verdict
        per_path.append(fresh)

    # Repeat queries on the shared instance must still match the fresh ones.
    for (path, _), fresh in zip(expectations, per_path):
        assert shared.is_ignored(path) == fresh.is_ignored(path)
Loading