Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/docs/sources/googledrive.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ The spec takes the following fields:
In practice, configure them based on your requirements: how fresh does the target index need to be?

:::
* `max_file_size` (`int`, optional): when set, any source file exceeding the limit (in bytes) will be ignored.

### Schema

Expand Down
1 change: 1 addition & 0 deletions python/cocoindex/sources/_engine_builtin_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class GoogleDrive(op.SourceSpec):
excluded_patterns: list[str] | None = None

recent_changes_poll_interval: datetime.timedelta | None = None
max_file_size: int | None = None


@dataclass
Expand Down
17 changes: 15 additions & 2 deletions src/ops/sources/google_drive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ pub struct Spec {
binary: bool,
root_folder_ids: Vec<String>,
recent_changes_poll_interval: Option<std::time::Duration>,
max_file_size: Option<i64>,
included_patterns: Option<Vec<String>>,
excluded_patterns: Option<Vec<String>>,
}
Expand All @@ -69,6 +70,7 @@ struct Executor {
binary: bool,
root_folder_ids: IndexSet<Arc<str>>,
recent_updates_poll_interval: Option<std::time::Duration>,
max_file_size: Option<i64>,
pattern_matcher: PatternMatcher,
}

Expand Down Expand Up @@ -96,6 +98,7 @@ impl Executor {
binary: spec.binary,
root_folder_ids: spec.root_folder_ids.into_iter().map(Arc::from).collect(),
recent_updates_poll_interval: spec.recent_changes_poll_interval,
max_file_size: spec.max_file_size,
pattern_matcher: PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?,
})
}
Expand Down Expand Up @@ -303,7 +306,7 @@ impl SourceExecutor for Executor {
let mut seen_ids = HashSet::new();
let mut folder_ids = self.root_folder_ids.clone();
let fields = format!(
"files(id,name,mimeType,trashed{})",
"files(id,name,mimeType,trashed,size{})",
optional_modified_time(options.include_ordinal)
);
let mut new_folder_ids = Vec::new();
Expand All @@ -319,6 +322,12 @@ impl SourceExecutor for Executor {
if !file.name.as_deref().is_some_and(|name| self.pattern_matcher.is_file_included(name)){
continue
}
if let Some(max_size) = self.max_file_size
&& let Some(file_size) = file.size
&& file_size > max_size {
// Skip files over the specified limit
continue;
}
curr_rows.extend(self.visit_file(file, &mut new_folder_ids, &mut seen_ids)?);
}
if !curr_rows.is_empty() {
Expand All @@ -342,7 +351,7 @@ impl SourceExecutor for Executor {
) -> Result<PartialSourceRowData> {
let file_id = key.single_part()?.str_value()?;
let fields = format!(
"id,name,mimeType,trashed{}",
"id,name,mimeType,trashed,size{}",
optional_modified_time(options.include_ordinal)
);
let resp = self
Expand Down Expand Up @@ -375,6 +384,10 @@ impl SourceExecutor for Executor {
content_version_fp: None,
});
}
// Check file size limit
if let Some(max_size) = self.max_file_size
&& let Some(file_size) = file.size
&& file_size > max_size
let ordinal = if options.include_ordinal {
file.modified_time.map(|t| t.try_into()).transpose()?
} else {
Expand Down
Loading