diff --git a/docs/docs/ops/sources.md b/docs/docs/ops/sources.md index 5a364cf06..68f444ec9 100644 --- a/docs/docs/ops/sources.md +++ b/docs/docs/ops/sources.md @@ -11,7 +11,19 @@ The `LocalFile` source imports files from a local file system. The spec takes the following fields: * `path` (type: `str`, required): full path of the root directory to import files from -* `binary` (type: `bool`, default: `False`): whether reading files as binary (instead of text) +* `binary` (type: `bool`, optional, default: `False`): whether to read files as binary (instead of text) +* `included_patterns` (type: `list[str]`, optional): a list of glob patterns for files to include, e.g. `["*.txt", "docs/**/*.md"]`. + If not specified, all files will be included. +* `excluded_patterns` (type: `list[str]`, optional): a list of glob patterns for files to exclude, e.g. `["tmp", "**/node_modules"]`. + Any file or directory matching these patterns will be excluded even if it matches `included_patterns`. + If not specified, no files will be excluded. + + :::info + + `included_patterns` and `excluded_patterns` use Unix-style glob syntax. See [globset syntax](https://docs.rs/globset/latest/globset/index.html#syntax) for details. + + ::: + The output is a table with the following sub fields: * `filename` (key, type: `str`): the filename of the file, including the path, relative to the root directory, e.g. `"dir1/file1.md"` diff --git a/python/cocoindex/sources.py b/python/cocoindex/sources.py index 6023f53e5..a443f3202 100644 --- a/python/cocoindex/sources.py +++ b/python/cocoindex/sources.py @@ -10,9 +10,9 @@ class LocalFile(op.SourceSpec): binary: bool = False # If provided, only files matching these patterns will be included. - # See https://docs.rs/globset/latest/globset/index.html for the syntax of the patterns. + # See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns. 
included_patterns: list[str] | None = None # If provided, files matching these patterns will be excluded. - # See https://docs.rs/globset/latest/globset/index.html for the syntax of the patterns. + # See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns. excluded_patterns: list[str] | None = None