Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/docs/sources/googledrive.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ The spec takes the following fields:
* `root_folder_ids` (`list[str]`): a list of Google Drive folder IDs to import files from.
* `binary` (`bool`, optional): whether to read files as binary (instead of text).
* `recent_changes_poll_interval` (`datetime.timedelta`, optional): when set, this source provides a change capture mechanism by periodically polling Google Drive for recently modified files.
* `included_patterns` (`list[str]`, optional): a list of glob patterns to include files, e.g. `["*.txt", "docs/**/*.md"]`. If not specified, all files will be included.
* `excluded_patterns` (`list[str]`, optional): a list of glob patterns to exclude files, e.g. `["tmp", "**/node_modules"]`. Any file or directory matching these patterns will be excluded even if it matches `included_patterns`. If not specified, no files will be excluded.

:::info

Expand Down
9 changes: 9 additions & 0 deletions python/cocoindex/sources/_engine_builtin_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@ class GoogleDrive(op.SourceSpec):
service_account_credential_path: str
root_folder_ids: list[str]
binary: bool = False

# If provided, only files matching these patterns will be included.
# See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns.
included_patterns: list[str] | None = None

# If provided, files matching these patterns will be excluded.
# See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns.
excluded_patterns: list[str] | None = None

recent_changes_poll_interval: datetime.timedelta | None = None


Expand Down
19 changes: 19 additions & 0 deletions src/ops/sources/google_drive.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use super::shared::pattern_matcher::PatternMatcher;
use chrono::Duration;
use google_drive3::{
DriveHub,
Expand Down Expand Up @@ -59,13 +60,16 @@ pub struct Spec {
binary: bool,
root_folder_ids: Vec<String>,
recent_changes_poll_interval: Option<std::time::Duration>,
included_patterns: Option<Vec<String>>,
excluded_patterns: Option<Vec<String>>,
}

/// Runtime state of the Google Drive source, built from a `Spec`.
struct Executor {
/// `google_drive3` hub used to issue requests to Google Drive.
drive_hub: DriveHub<HttpsConnector<HttpConnector>>,
/// Copied from `Spec::binary`: whether files are read as binary instead of text.
binary: bool,
/// Folder IDs taken from `Spec::root_folder_ids`, stored as shared `Arc<str>` values.
root_folder_ids: IndexSet<Arc<str>>,
/// Copied from `Spec::recent_changes_poll_interval`; `None` when the spec leaves it unset.
recent_updates_poll_interval: Option<std::time::Duration>,
/// Built from `Spec::included_patterns` / `Spec::excluded_patterns`; file names are
/// checked against it via `is_file_included`.
pattern_matcher: PatternMatcher,
}

impl Executor {
Expand All @@ -92,6 +96,7 @@ impl Executor {
binary: spec.binary,
root_folder_ids: spec.root_folder_ids.into_iter().map(Arc::from).collect(),
recent_updates_poll_interval: spec.recent_changes_poll_interval,
pattern_matcher: PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?,
})
}
}
Expand Down Expand Up @@ -311,6 +316,9 @@ impl SourceExecutor for Executor {
.list_files(&folder_id, &fields, &mut next_page_token)
.await?;
for file in files {
if !file.name.as_deref().is_some_and(|name| self.pattern_matcher.is_file_included(name)){
continue
}
curr_rows.extend(self.visit_file(file, &mut new_folder_ids, &mut seen_ids)?);
}
if !curr_rows.is_empty() {
Expand Down Expand Up @@ -356,6 +364,17 @@ impl SourceExecutor for Executor {
});
}
};
if !file
.name
.as_deref()
.is_some_and(|name| self.pattern_matcher.is_file_included(name))
{
return Ok(PartialSourceRowData {
value: Some(SourceValue::NonExistence),
ordinal: Some(Ordinal::unavailable()),
content_version_fp: None,
});
}
let ordinal = if options.include_ordinal {
file.modified_time.map(|t| t.try_into()).transpose()?
} else {
Expand Down
Loading