Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def app_main(sourcedir: pathlib.Path, outdir: pathlib.Path) -> None:
).result()

files = localfs.walk_dir(
sourcedir, path_matcher=PatternFilePathMatcher(included_patterns=["*.md"])
sourcedir, path_matcher=PatternFilePathMatcher(included_patterns=["**/*.md"])
)
with coco.component_subpath("process"):
for f in files:
Expand Down
6 changes: 3 additions & 3 deletions docs/docs/connectors/localfs.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,8 @@ from cocoindex.resources.file import PatternFilePathMatcher

# Include only .py and .md files; exclude hidden files/directories, test_* entries, and __pycache__
matcher = PatternFilePathMatcher(
included_patterns=["*.py", "*.md"],
excluded_patterns=["**/.*", "**/test_*", "**/__pycache__/**"],
included_patterns=["**/*.py", "**/*.md"],
excluded_patterns=["**/.*", "**/test_*", "**/__pycache__"],
)

for file in localfs.walk_dir("/path/to/project", recursive=True, path_matcher=matcher):
Expand All @@ -137,7 +137,7 @@ from cocoindex.resources.file import FileLike, PatternFilePathMatcher
def app_main(sourcedir: pathlib.Path) -> None:
# Register base directory for stable memoization
source = localfs.register_base_dir("source", sourcedir)
matcher = PatternFilePathMatcher(included_patterns=["*.md"])
matcher = PatternFilePathMatcher(included_patterns=["**/*.md"])

for file in localfs.walk_dir(source, recursive=True, path_matcher=matcher):
coco.mount(
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/getting_started/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def app_main(sourcedir: pathlib.Path, outdir: pathlib.Path) -> None:
files = localfs.walk_dir(
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(included_patterns=["*.pdf"]),
path_matcher=PatternFilePathMatcher(included_patterns=["**/*.pdf"]),
)
for f in files:
coco.mount(
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/ops/sentence_transformers.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ async def app_main(sourcedir: pathlib.Path) -> None:
files = localfs.walk_dir(
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(included_patterns=["*.md"]),
path_matcher=PatternFilePathMatcher(included_patterns=["**/*.md"]),
)
with coco.component_subpath("file"):
for f in files:
Expand Down
12 changes: 8 additions & 4 deletions docs/docs/resource_types.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,22 +120,26 @@ class MyMatcher(FilePathMatcher):

#### PatternFilePathMatcher

A built-in `FilePathMatcher` implementation using glob patterns:
A built-in `FilePathMatcher` implementation using [globset](https://docs.rs/globset/#syntax) patterns:

```python
from cocoindex.resources.file import PatternFilePathMatcher

# Include only Python and Markdown files, exclude tests and hidden dirs
matcher = PatternFilePathMatcher(
included_patterns=["*.py", "*.md"],
included_patterns=["**/*.py", "**/*.md"],
excluded_patterns=["**/test_*", "**/.*"],
)
```

**Parameters:**

- `included_patterns` — Glob patterns for files to include. If `None`, all files are included.
- `excluded_patterns` — Glob patterns for files/directories to exclude. Excluded directories are not traversed.
- `included_patterns` — Glob patterns ([globset](https://docs.rs/globset) syntax) for files to include. Use `**/*.ext` to match at any depth. If `None`, all files are included.
- `excluded_patterns` — Glob patterns ([globset](https://docs.rs/globset) syntax) for files/directories to exclude. Excluded directories are not traversed.

:::note
Patterns use [globset](https://docs.rs/globset) semantics: `*.py` matches only in the root directory; use `**/*.py` to match at any depth.
:::

## Vector Schema

Expand Down
10 changes: 8 additions & 2 deletions examples/code_embedding/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,14 @@ async def app_main(sourcedir: pathlib.Path) -> None:
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(
included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
excluded_patterns=[".*/**", "target/**", "node_modules/**"],
included_patterns=[
"**/*.py",
"**/*.rs",
"**/*.toml",
"**/*.md",
"**/*.mdx",
],
excluded_patterns=["**/.*", "**/target", "**/node_modules"],
),
)
async for file in files:
Expand Down
10 changes: 8 additions & 2 deletions examples/code_embedding_lancedb/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,14 @@ async def app_main(sourcedir: pathlib.Path) -> None:
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(
included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
excluded_patterns=[".*/**", "target/**", "node_modules/**"],
included_patterns=[
"**/*.py",
"**/*.rs",
"**/*.toml",
"**/*.md",
"**/*.mdx",
],
excluded_patterns=["**/.*", "**/target", "**/node_modules"],
),
)
for file in files:
Expand Down
2 changes: 1 addition & 1 deletion examples/files_transform/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def process_file(file: FileLike, outdir: pathlib.Path) -> None:
@coco.function
def app_main(sourcedir: pathlib.Path, outdir: pathlib.Path) -> None:
files = localfs.walk_dir(
sourcedir, path_matcher=PatternFilePathMatcher(included_patterns=["*.md"])
sourcedir, path_matcher=PatternFilePathMatcher(included_patterns=["**/*.md"])
)
for f in files:
coco.mount(
Expand Down
2 changes: 1 addition & 1 deletion examples/image_search/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ async def app_main(sourcedir: pathlib.Path) -> None:
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(
included_patterns=["*.jpg", "*.jpeg", "*.png"]
included_patterns=["**/*.jpg", "**/*.jpeg", "**/*.png"]
),
)
for f in files:
Expand Down
2 changes: 1 addition & 1 deletion examples/image_search_colpali/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ async def app_main(sourcedir: pathlib.Path) -> None:
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(
included_patterns=["*.jpg", "*.jpeg", "*.png"]
included_patterns=["**/*.jpg", "**/*.jpeg", "**/*.png"]
),
)
for f in files:
Expand Down
4 changes: 2 additions & 2 deletions examples/multi_codebase_summarization/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,8 @@ def app_main(
entry,
recursive=True,
path_matcher=PatternFilePathMatcher(
included_patterns=["*.py"],
excluded_patterns=[".*", "__pycache__"],
included_patterns=["**/*.py"],
excluded_patterns=["**/.*", "**/__pycache__"],
),
)
)
Expand Down
2 changes: 1 addition & 1 deletion examples/paper_metadata/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ async def app_main(sourcedir: pathlib.Path) -> None:
files = localfs.walk_dir(
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(included_patterns=["*.pdf"]),
path_matcher=PatternFilePathMatcher(included_patterns=["**/*.pdf"]),
)
for f in files:
coco.mount(
Expand Down
2 changes: 1 addition & 1 deletion examples/patient_intake_extraction_baml/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def app_main(sourcedir: pathlib.Path, outdir: pathlib.Path) -> None:
"""Main application function that processes patient intake forms."""
files = localfs.walk_dir(
sourcedir,
path_matcher=PatternFilePathMatcher(included_patterns=["*.pdf"]),
path_matcher=PatternFilePathMatcher(included_patterns=["**/*.pdf"]),
)

for f in files:
Expand Down
2 changes: 1 addition & 1 deletion examples/patient_intake_extraction_dspy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def app_main(sourcedir: pathlib.Path, outdir: pathlib.Path) -> None:
"""Main application function that processes patient intake forms."""
files = localfs.walk_dir(
sourcedir,
path_matcher=PatternFilePathMatcher(included_patterns=["*.pdf"]),
path_matcher=PatternFilePathMatcher(included_patterns=["**/*.pdf"]),
)

for f in files:
Expand Down
2 changes: 1 addition & 1 deletion examples/pdf_embedding/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ async def app_main(sourcedir: pathlib.Path) -> None:
files = localfs.walk_dir(
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(included_patterns=["*.pdf"]),
path_matcher=PatternFilePathMatcher(included_patterns=["**/*.pdf"]),
)
for f in files:
coco.mount(
Expand Down
2 changes: 1 addition & 1 deletion examples/pdf_to_markdown/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def app_main(sourcedir: pathlib.Path, outdir: pathlib.Path) -> None:
files = localfs.walk_dir(
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(included_patterns=["*.pdf"]),
path_matcher=PatternFilePathMatcher(included_patterns=["**/*.pdf"]),
)
for f in files:
coco.mount(
Expand Down
2 changes: 1 addition & 1 deletion examples/text_embedding/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ async def app_main(sourcedir: pathlib.Path) -> None:
files = localfs.walk_dir(
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(included_patterns=["*.md"]),
path_matcher=PatternFilePathMatcher(included_patterns=["**/*.md"]),
)
with coco.component_subpath("file"):
for f in files:
Expand Down
2 changes: 1 addition & 1 deletion examples/text_embedding_lancedb/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ async def app_main(sourcedir: pathlib.Path) -> None:
files = localfs.walk_dir(
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(included_patterns=["*.md"]),
path_matcher=PatternFilePathMatcher(included_patterns=["**/*.md"]),
)
with coco.component_subpath("file"):
for f in files:
Expand Down
2 changes: 1 addition & 1 deletion examples/text_embedding_qdrant/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ async def app_main(sourcedir: pathlib.Path) -> None:
files = localfs.walk_dir(
sourcedir,
recursive=True,
path_matcher=PatternFilePathMatcher(included_patterns=["*.md"]),
path_matcher=PatternFilePathMatcher(included_patterns=["**/*.md"]),
)
with coco.component_subpath("file"):
for f in files:
Expand Down
10 changes: 10 additions & 0 deletions python/cocoindex/_internal/core.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,16 @@ class RecursiveSplitter:

def detect_code_language(*, filename: str) -> str | None: ...

# --- PatternMatcher (from ops) ---
class PatternMatcher:
def __new__(
cls,
included_patterns: list[str] | None = None,
excluded_patterns: list[str] | None = None,
) -> "PatternMatcher": ...
def is_dir_included(self, path: str) -> bool: ...
def is_file_included(self, path: str) -> bool: ...

########################################################
# Synchronization Primitives
########################################################
Expand Down
2 changes: 1 addition & 1 deletion python/cocoindex/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ def app_main() -> None:
# Example (walk a directory):
# files = localfs.walk_dir(
# sourcedir,
# path_matcher=PatternFilePathMatcher(included_patterns=["*.pdf"]),
# path_matcher=PatternFilePathMatcher(included_patterns=["**/*.pdf"]),
# )

# 3) Mount a processing unit for each input under a stable path
Expand Down
50 changes: 23 additions & 27 deletions python/cocoindex/resources/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
TypeVar as _TypeVar,
)

from cocoindex._internal import core as _core
from cocoindex import StableKey as _StableKey
from cocoindex.connectorkits import connection as _connection

Expand Down Expand Up @@ -165,7 +166,17 @@ def is_file_included(self, path: _PurePath) -> bool: # noqa: ARG002


class PatternFilePathMatcher(FilePathMatcher):
"""Pattern matcher that handles include and exclude glob patterns for files."""
"""Pattern matcher that handles include and exclude glob patterns for files.

Uses `globset <https://docs.rs/globset>`_ semantics for pattern matching.
Patterns are matched against the full relative path (with forward slashes).
Common patterns:

- `**/*.py` — matches Python files at any depth
- `*.py` — matches Python files only in the root directory
- `**/.*` — matches dot-prefixed entries (hidden files/dirs) at any depth
- `{*.md,*.txt}` — matches multiple extensions using alternation
"""

def __init__(
self,
Expand All @@ -176,39 +187,24 @@ def __init__(
Create a new PatternFilePathMatcher from optional include and exclude pattern lists.

Args:
included_patterns: Patterns matching full path of files to be included.
excluded_patterns: Patterns matching full path of files and directories
to be excluded. If a directory is excluded, all files and
included_patterns: Glob patterns (globset syntax) matching full path of files
to be included. Use ``**/*.ext`` to match at any depth.
excluded_patterns: Glob patterns (globset syntax) matching full path of files
and directories to be excluded. If a directory is excluded, all files and
subdirectories within it are also excluded.
"""
self._included_patterns = included_patterns
self._excluded_patterns = excluded_patterns

def _matches_any(self, path: _PurePath, patterns: list[str]) -> bool:
"""Check if the path matches any of the given glob patterns."""
return any(path.match(pattern) for pattern in patterns)

def _is_excluded(self, path: _PurePath) -> bool:
"""Check if a file or directory is excluded by the exclude patterns."""
if self._excluded_patterns is None:
return False
return self._matches_any(path, self._excluded_patterns)
Raises:
ValueError: If any pattern is invalid.
"""
self._matcher = _core.PatternMatcher(included_patterns, excluded_patterns)

def is_dir_included(self, path: _PurePath) -> bool:
"""Check if a directory should be included based on the exclude patterns."""
return not self._is_excluded(path)
return self._matcher.is_dir_included(path.as_posix())

def is_file_included(self, path: _PurePath) -> bool:
"""
Check if a file should be included based on both include and exclude patterns.

Should be called for each file.
"""
if self._is_excluded(path):
return False
if self._included_patterns is None:
return True
return self._matches_any(path, self._included_patterns)
"""Check if a file should be included based on both include and exclude patterns."""
return self._matcher.is_file_included(path.as_posix())


_BOM_ENCODINGS = [
Expand Down
Loading
Loading