diff --git a/docs/docs/sources/googledrive.md b/docs/docs/sources/googledrive.md index 857044cd..9883eb59 100644 --- a/docs/docs/sources/googledrive.md +++ b/docs/docs/sources/googledrive.md @@ -30,6 +30,8 @@ The spec takes the following fields: * `root_folder_ids` (`list[str]`): a list of Google Drive folder IDs to import files from. * `binary` (`bool`, optional): whether reading files as binary (instead of text). * `recent_changes_poll_interval` (`datetime.timedelta`, optional): when set, this source provides a change capture mechanism by polling Google Drive for recent modified files periodically. +* `included_patterns` (`list[str]`, optional): a list of glob patterns for files to include, e.g. `["*.txt", "docs/**/*.md"]`. If not specified, all files will be included. +* `excluded_patterns` (`list[str]`, optional): a list of glob patterns for files to exclude, e.g. `["tmp", "**/node_modules"]`. Any file or directory matching these patterns will be excluded even if it matches `included_patterns`. If not specified, no files will be excluded. :::info diff --git a/python/cocoindex/sources/_engine_builtin_specs.py b/python/cocoindex/sources/_engine_builtin_specs.py index 2b342d6f..8a57d8c7 100644 --- a/python/cocoindex/sources/_engine_builtin_specs.py +++ b/python/cocoindex/sources/_engine_builtin_specs.py @@ -35,6 +35,15 @@ class GoogleDrive(op.SourceSpec): service_account_credential_path: str root_folder_ids: list[str] binary: bool = False + + # If provided, only files matching these patterns will be included. + # See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns. + included_patterns: list[str] | None = None + + # If provided, files matching these patterns will be excluded. + # See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns. 
+ excluded_patterns: list[str] | None = None + recent_changes_poll_interval: datetime.timedelta | None = None diff --git a/src/ops/sources/google_drive.rs b/src/ops/sources/google_drive.rs index 04461604..c8d49390 100644 --- a/src/ops/sources/google_drive.rs +++ b/src/ops/sources/google_drive.rs @@ -1,3 +1,4 @@ +use super::shared::pattern_matcher::PatternMatcher; use chrono::Duration; use google_drive3::{ DriveHub, @@ -59,6 +60,8 @@ pub struct Spec { binary: bool, root_folder_ids: Vec, recent_changes_poll_interval: Option, + included_patterns: Option>, + excluded_patterns: Option>, } struct Executor { @@ -66,6 +69,7 @@ struct Executor { binary: bool, root_folder_ids: IndexSet>, recent_updates_poll_interval: Option, + pattern_matcher: PatternMatcher, } impl Executor { @@ -92,6 +96,7 @@ impl Executor { binary: spec.binary, root_folder_ids: spec.root_folder_ids.into_iter().map(Arc::from).collect(), recent_updates_poll_interval: spec.recent_changes_poll_interval, + pattern_matcher: PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?, }) } } @@ -311,6 +316,9 @@ impl SourceExecutor for Executor { .list_files(&folder_id, &fields, &mut next_page_token) .await?; for file in files { + if !file.name.as_deref().is_some_and(|name| self.pattern_matcher.is_file_included(name)){ + continue + } curr_rows.extend(self.visit_file(file, &mut new_folder_ids, &mut seen_ids)?); } if !curr_rows.is_empty() { @@ -356,6 +364,17 @@ impl SourceExecutor for Executor { }); } }; + if !file + .name + .as_deref() + .is_some_and(|name| self.pattern_matcher.is_file_included(name)) + { + return Ok(PartialSourceRowData { + value: Some(SourceValue::NonExistence), + ordinal: Some(Ordinal::unavailable()), + content_version_fp: None, + }); + } let ordinal = if options.include_ordinal { file.modified_time.map(|t| t.try_into()).transpose()? } else {