Skip to content

Commit 541e693

Browse files
committed
feat: add max_file_size support for the GoogleDrive source, solves #1250
1 parent d204ed9 commit 541e693

File tree

3 files changed

+27
-2
lines changed

3 files changed

+27
-2
lines changed

docs/docs/sources/googledrive.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ The spec takes the following fields:
4545
In reality, configure them based on your requirement: how fresh do you need the target index to be?
4646

4747
:::
48+
* `max_file_size` (`int`, optional): when set, any source file whose size exceeds this limit (in bytes) will be ignored.
4849

4950
### Schema
5051

python/cocoindex/sources/_engine_builtin_specs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class GoogleDrive(op.SourceSpec):
3333
root_folder_ids: list[str]
3434
binary: bool = False
3535
recent_changes_poll_interval: datetime.timedelta | None = None
36+
max_file_size: int | None = None
3637

3738

3839
@dataclass

src/ops/sources/google_drive.rs

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,15 @@ pub struct Spec {
5959
binary: bool,
6060
root_folder_ids: Vec<String>,
6161
recent_changes_poll_interval: Option<std::time::Duration>,
62+
max_file_size: Option<i64>,
6263
}
6364

6465
struct Executor {
6566
drive_hub: DriveHub<HttpsConnector<HttpConnector>>,
6667
binary: bool,
6768
root_folder_ids: IndexSet<Arc<str>>,
6869
recent_updates_poll_interval: Option<std::time::Duration>,
70+
max_file_size: Option<i64>,
6971
}
7072

7173
impl Executor {
@@ -92,6 +94,7 @@ impl Executor {
9294
binary: spec.binary,
9395
root_folder_ids: spec.root_folder_ids.into_iter().map(Arc::from).collect(),
9496
recent_updates_poll_interval: spec.recent_changes_poll_interval,
97+
max_file_size: spec.max_file_size,
9598
})
9699
}
97100
}
@@ -298,7 +301,7 @@ impl SourceExecutor for Executor {
298301
let mut seen_ids = HashSet::new();
299302
let mut folder_ids = self.root_folder_ids.clone();
300303
let fields = format!(
301-
"files(id,name,mimeType,trashed{})",
304+
"files(id,name,mimeType,trashed,size{})",
302305
optional_modified_time(options.include_ordinal)
303306
);
304307
let mut new_folder_ids = Vec::new();
@@ -311,6 +314,14 @@ impl SourceExecutor for Executor {
311314
.list_files(&folder_id, &fields, &mut next_page_token)
312315
.await?;
313316
for file in files {
317+
if let Some(max_size) = self.max_file_size {
318+
if let Some(file_size) = file.size {
319+
if file_size > max_size {
320+
// Skip files over the specified limit
321+
continue;
322+
}
323+
}
324+
}
314325
curr_rows.extend(self.visit_file(file, &mut new_folder_ids, &mut seen_ids)?);
315326
}
316327
if !curr_rows.is_empty() {
@@ -334,7 +345,7 @@ impl SourceExecutor for Executor {
334345
) -> Result<PartialSourceRowData> {
335346
let file_id = key.single_part()?.str_value()?;
336347
let fields = format!(
337-
"id,name,mimeType,trashed{}",
348+
"id,name,mimeType,trashed,size{}",
338349
optional_modified_time(options.include_ordinal)
339350
);
340351
let resp = self
@@ -356,6 +367,18 @@ impl SourceExecutor for Executor {
356367
});
357368
}
358369
};
370+
// Check file size limit
371+
if let Some(max_size) = self.max_file_size {
372+
if let Some(file_size) = file.size {
373+
if file_size > max_size {
374+
return Ok(PartialSourceRowData {
375+
value: Some(SourceValue::NonExistence),
376+
ordinal: Some(Ordinal::unavailable()),
377+
content_version_fp: None,
378+
});
379+
}
380+
}
381+
}
359382
let ordinal = if options.include_ordinal {
360383
file.modified_time.map(|t| t.try_into()).transpose()?
361384
} else {

0 commit comments

Comments
 (0)