diff --git a/docs/docs/sources/googledrive.md b/docs/docs/sources/googledrive.md index 9883eb59..7875b43b 100644 --- a/docs/docs/sources/googledrive.md +++ b/docs/docs/sources/googledrive.md @@ -47,6 +47,7 @@ The spec takes the following fields: In reality, configure them based on your requirement: how fresh do you need the target index to be? ::: +* `max_file_size` (`int`, optional): when set, any source file exceeding the limit (in bytes) will be ignored. ### Schema diff --git a/python/cocoindex/sources/_engine_builtin_specs.py b/python/cocoindex/sources/_engine_builtin_specs.py index 8a57d8c7..3f02bc5c 100644 --- a/python/cocoindex/sources/_engine_builtin_specs.py +++ b/python/cocoindex/sources/_engine_builtin_specs.py @@ -45,6 +45,7 @@ class GoogleDrive(op.SourceSpec): excluded_patterns: list[str] | None = None recent_changes_poll_interval: datetime.timedelta | None = None + max_file_size: int | None = None @dataclass diff --git a/src/ops/sources/google_drive.rs b/src/ops/sources/google_drive.rs index c8d49390..ecc68f96 100644 --- a/src/ops/sources/google_drive.rs +++ b/src/ops/sources/google_drive.rs @@ -60,6 +60,7 @@ pub struct Spec { binary: bool, root_folder_ids: Vec, recent_changes_poll_interval: Option, + max_file_size: Option, included_patterns: Option>, excluded_patterns: Option>, } @@ -69,6 +70,7 @@ struct Executor { binary: bool, root_folder_ids: IndexSet>, recent_updates_poll_interval: Option, + max_file_size: Option, pattern_matcher: PatternMatcher, } @@ -96,6 +98,7 @@ impl Executor { binary: spec.binary, root_folder_ids: spec.root_folder_ids.into_iter().map(Arc::from).collect(), recent_updates_poll_interval: spec.recent_changes_poll_interval, + max_file_size: spec.max_file_size, pattern_matcher: PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?, }) } @@ -303,7 +306,7 @@ impl SourceExecutor for Executor { let mut seen_ids = HashSet::new(); let mut folder_ids = self.root_folder_ids.clone(); let fields = format!( - "files(id,name,mimeType,trashed{})", + "files(id,name,mimeType,trashed,size{})", optional_modified_time(options.include_ordinal) ); let mut new_folder_ids = Vec::new(); @@ -319,6 +322,12 @@ impl SourceExecutor for Executor { if !file.name.as_deref().is_some_and(|name| self.pattern_matcher.is_file_included(name)){ continue } + if let Some(max_size) = self.max_file_size + && let Some(file_size) = file.size + && file_size > max_size { + // Skip files over the specified limit + continue; + } curr_rows.extend(self.visit_file(file, &mut new_folder_ids, &mut seen_ids)?); } if !curr_rows.is_empty() { @@ -342,7 +351,7 @@ impl SourceExecutor for Executor { ) -> Result { let file_id = key.single_part()?.str_value()?; let fields = format!( - "id,name,mimeType,trashed{}", + "id,name,mimeType,trashed,size{}", optional_modified_time(options.include_ordinal) ); let resp = self @@ -375,6 +384,10 @@ impl SourceExecutor for Executor { content_version_fp: None, }); } + // Check file size limit + if let Some(max_size) = self.max_file_size + && let Some(file_size) = file.size + && file_size > max_size let ordinal = if options.include_ordinal { file.modified_time.map(|t| t.try_into()).transpose()? } else {