Skip to content

Commit 27accfe

Browse files
feat: support included_patterns and excluded_patterns for google source
1 parent f72bcdb commit 27accfe

File tree

2 files changed

+23
-4
lines changed

2 files changed

+23
-4
lines changed

docs/docs/sources/googledrive.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,16 @@ The spec takes the following fields:
2929
* `service_account_credential_path` (`str`): full path to the service account credential file in JSON format.
3030
* `root_folder_ids` (`list[str]`): a list of Google Drive folder IDs to import files from.
3131
* `binary` (`bool`, optional): whether to read files as binary (instead of text).
32+
* `included_patterns` (`list[str]`, optional): a list of glob patterns to include files, e.g. `["*.txt", "docs/**/*.md"]`.
33+
If not specified, all files will be included.
34+
* `excluded_patterns` (`list[str]`, optional): a list of glob patterns to exclude files, e.g. `["tmp", "**/node_modules"]`.
35+
Any file or directory matching these patterns will be excluded even if they match `included_patterns`.
36+
If not specified, no files will be excluded.
3237
* `recent_changes_poll_interval` (`datetime.timedelta`, optional): when set, this source provides a change capture mechanism by periodically polling Google Drive for recently modified files.
3338

3439
:::info
40+
41+
`included_patterns` and `excluded_patterns` use Unix-style glob syntax. See [globset syntax](https://docs.rs/globset/latest/globset/index.html#syntax) for details.
3542

3643
Since it only retrieves metadata for recently modified files (up to the previous poll) during polling,
3744
it's typically cheaper than a full refresh by setting the [refresh interval](/docs/core/flow_def#refresh-interval) especially when the folder contains a large number of files.

src/ops/sources/google_drive.rs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use phf::phf_map;
1111

1212
use crate::base::field_attrs;
1313
use crate::ops::sdk::*;
14+
use super::shared::pattern_matcher::PatternMatcher;
1415

1516
struct ExportMimeType {
1617
text: &'static str,
@@ -59,13 +60,16 @@ pub struct Spec {
5960
binary: bool,
6061
root_folder_ids: Vec<String>,
6162
recent_changes_poll_interval: Option<std::time::Duration>,
63+
included_patterns: Option<Vec<String>>,
64+
excluded_patterns: Option<Vec<String>>,
6265
}
6366

6467
/// Runtime state of the Google Drive source, assembled from a `Spec` when the executor is built.
struct Executor {
6568
/// Drive hub created via `DriveHub::new(client, auth)`; used to talk to Google Drive.
drive_hub: DriveHub<HttpsConnector<HttpConnector>>,
6669
/// Copied from `Spec::binary`.
binary: bool,
6770
/// Root folder IDs from `Spec::root_folder_ids`, converted to `Arc<str>` and stored in an `IndexSet`.
root_folder_ids: IndexSet<Arc<str>>,
6871
/// Copied from `Spec::recent_changes_poll_interval`; `None` when the spec does not set it.
recent_updates_poll_interval: Option<std::time::Duration>,
72+
/// Matcher built from `Spec::included_patterns` and `Spec::excluded_patterns`.
pattern_matcher: PatternMatcher,
6973
}
7074

7175
impl Executor {
@@ -87,11 +91,13 @@ impl Executor {
8791
.build(),
8892
);
8993
let drive_hub = DriveHub::new(client, auth);
94+
let pattern_matcher = PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?;
9095
Ok(Self {
9196
drive_hub,
9297
binary: spec.binary,
9398
root_folder_ids: spec.root_folder_ids.into_iter().map(Arc::from).collect(),
9499
recent_updates_poll_interval: spec.recent_changes_poll_interval,
100+
pattern_matcher,
95101
})
96102
}
97103
}
@@ -119,16 +125,22 @@ impl Executor {
119125
if file.trashed == Some(true) {
120126
return Ok(None);
121127
}
122-
let (id, mime_type) = match (file.id, file.mime_type) {
123-
(Some(id), Some(mime_type)) => (Arc::<str>::from(id), mime_type),
124-
(id, mime_type) => {
125-
warn!("Skipping file with incomplete metadata: id={id:?}, mime_type={mime_type:?}",);
128+
let (id, mime_type, name) = match (file.id, file.mime_type, file.name) {
129+
(Some(id), Some(mime_type), Some(name)) => (Arc::<str>::from(id), mime_type, name),
130+
(id, mime_type, name) => {
131+
warn!("Skipping file with incomplete metadata: id={id:?}, mime_type={mime_type:?}, name={name:?}",);
126132
return Ok(None);
127133
}
128134
};
129135
if !seen_ids.insert(id.clone()) {
130136
return Ok(None);
131137
}
138+
// Apply the name-based filters: keep only entries selected by `included_patterns`
// and not rejected by `excluded_patterns`; every other entry is skipped.
// NOTE(review): this runs before the folder check below, so folders are also
// filtered by `included_patterns` — confirm that is intended.
if !self.pattern_matcher.is_file_included(&name) {
139+
return Ok(None);
140+
}
141+
if self.pattern_matcher.is_excluded(&name) {
142+
return Ok(None);
143+
}
132144
let result = if mime_type == FOLDER_MIME_TYPE {
133145
new_folder_ids.push(id);
134146
None

0 commit comments

Comments
 (0)