11use std:: {
2- collections:: HashMap ,
2+ collections:: { HashMap , HashSet } ,
33 sync:: { Arc , LazyLock } ,
44} ;
55
6+ use async_stream:: try_stream;
67use google_drive3:: {
7- api:: Scope ,
8+ api:: { File , Scope } ,
89 yup_oauth2:: { read_service_account_key, ServiceAccountAuthenticator } ,
910 DriveHub ,
1011} ;
1112use http_body_util:: BodyExt ;
1213use hyper_rustls:: HttpsConnector ;
1314use hyper_util:: client:: legacy:: connect:: HttpConnector ;
14- use indexmap:: IndexSet ;
15- use log:: warn;
15+ use log:: { trace, warn} ;
1616
1717use crate :: base:: field_attrs;
1818use crate :: ops:: sdk:: * ;
@@ -80,7 +80,7 @@ pub struct Spec {
8080struct Executor {
8181 drive_hub : DriveHub < HttpsConnector < HttpConnector > > ,
8282 binary : bool ,
83- root_folder_ids : Vec < String > ,
83+ root_folder_ids : Vec < Arc < str > > ,
8484}
8585
8686impl Executor {
@@ -105,7 +105,7 @@ impl Executor {
105105 Ok ( Self {
106106 drive_hub,
107107 binary : spec. binary ,
108- root_folder_ids : spec. root_folder_ids ,
108+ root_folder_ids : spec. root_folder_ids . into_iter ( ) . map ( Arc :: from ) . collect ( ) ,
109109 } )
110110 }
111111}
@@ -123,55 +123,60 @@ fn escape_string(s: &str) -> String {
123123}
124124
125125impl Executor {
126- async fn traverse_folder (
126+ fn visit_file (
127127 & self ,
128- folder_id : & str ,
129- visited_folder_ids : & mut IndexSet < String > ,
130- result : & mut IndexSet < KeyValue > ,
131- ) -> Result < ( ) > {
132- if !visited_folder_ids . insert ( folder_id . to_string ( ) ) {
133- return Ok ( ( ) ) ;
128+ file : File ,
129+ new_folder_ids : & mut Vec < Arc < str > > ,
130+ seen_ids : & mut HashSet < Arc < str > > ,
131+ ) -> Result < Option < SourceRowMetadata > > {
132+ if file . trashed == Some ( true ) {
133+ return Ok ( None ) ;
134134 }
135- let query = format ! ( "'{}' in parents" , escape_string( folder_id) ) ;
136- let mut next_page_token: Option < String > = None ;
137- loop {
138- let mut list_call = self
139- . drive_hub
140- . files ( )
141- . list ( )
142- . add_scope ( Scope :: Readonly )
143- . q ( & query) ;
144- if let Some ( next_page_token) = & next_page_token {
145- list_call = list_call. page_token ( next_page_token) ;
146- }
147- let ( _, files) = list_call. doit ( ) . await ?;
148- if let Some ( files) = files. files {
149- for file in files {
150- match ( file. id , file. mime_type ) {
151- ( Some ( id) , Some ( mime_type) ) => {
152- if mime_type == FOLDER_MIME_TYPE {
153- Box :: pin ( self . traverse_folder ( & id, visited_folder_ids, result) )
154- . await ?;
155- } else if is_supported_file_type ( & mime_type) {
156- result. insert ( KeyValue :: Str ( Arc :: from ( id) ) ) ;
157- } else {
158- warn ! ( "Skipping file with unsupported mime type: id={id}, mime_type={mime_type}, name={:?}" , file. name) ;
159- }
160- }
161- ( id, mime_type) => {
162- warn ! (
163- "Skipping file with incomplete metadata: id={id:?}, mime_type={mime_type:?}" ,
164- ) ;
165- }
166- }
167- }
168- }
169- next_page_token = files. next_page_token ;
170- if next_page_token. is_none ( ) {
171- break ;
135+ let ( id, mime_type) = match ( file. id , file. mime_type ) {
136+ ( Some ( id) , Some ( mime_type) ) => ( Arc :: < str > :: from ( id) , mime_type) ,
137+ ( id, mime_type) => {
138+ warn ! ( "Skipping file with incomplete metadata: id={id:?}, mime_type={mime_type:?}" , ) ;
139+ return Ok ( None ) ;
172140 }
141+ } ;
142+ if !seen_ids. insert ( id. clone ( ) ) {
143+ return Ok ( None ) ;
144+ }
145+ let result = if mime_type == FOLDER_MIME_TYPE {
146+ new_folder_ids. push ( id) ;
147+ None
148+ } else if is_supported_file_type ( & mime_type) {
149+ Some ( SourceRowMetadata {
150+ key : KeyValue :: Str ( Arc :: from ( id) ) ,
151+ ordinal : file. modified_time . map ( |t| t. try_into ( ) ) . transpose ( ) ?,
152+ } )
153+ } else {
154+ trace ! ( "Skipping file with unsupported mime type: id={id}, mime_type={mime_type}, name={:?}" , file. name) ;
155+ None
156+ } ;
157+ Ok ( result)
158+ }
159+
160+ async fn list_files (
161+ & self ,
162+ folder_id : & str ,
163+ fields : & str ,
164+ next_page_token : & mut Option < String > ,
165+ ) -> Result < impl Iterator < Item = File > > {
166+ let query = format ! ( "'{}' in parents" , escape_string( folder_id) ) ;
167+ let mut list_call = self
168+ . drive_hub
169+ . files ( )
170+ . list ( )
171+ . add_scope ( Scope :: Readonly )
172+ . q ( & query)
173+ . param ( "fields" , fields) ;
174+ if let Some ( next_page_token) = & next_page_token {
175+ list_call = list_call. page_token ( next_page_token) ;
173176 }
174- Ok ( ( ) )
177+ let ( _, files) = list_call. doit ( ) . await ?;
178+ let file_iter = files. files . into_iter ( ) . flat_map ( |file| file. into_iter ( ) ) ;
179+ Ok ( file_iter)
175180 }
176181}
177182
@@ -202,13 +207,43 @@ impl<T> ResultExt<T> for google_drive3::Result<T> {
202207
203208#[ async_trait]
204209impl SourceExecutor for Executor {
205- async fn list_keys ( & self ) -> Result < Vec < KeyValue > > {
206- let mut result = IndexSet :: new ( ) ;
207- for root_folder_id in & self . root_folder_ids {
208- self . traverse_folder ( root_folder_id, & mut IndexSet :: new ( ) , & mut result)
209- . await ?;
210+ fn list < ' a > (
211+ & ' a self ,
212+ options : SourceExecutorListOptions ,
213+ ) -> BoxStream < ' a , Result < Vec < SourceRowMetadata > > > {
214+ let mut seen_ids = HashSet :: new ( ) ;
215+ let mut folder_ids = self . root_folder_ids . clone ( ) ;
216+ let fields = format ! (
217+ "files(id,name,mimeType,trashed{})" ,
218+ if options. include_ordinal {
219+ ",modifiedTime"
220+ } else {
221+ ""
222+ }
223+ ) ;
224+ let mut new_folder_ids = Vec :: new ( ) ;
225+ try_stream ! {
226+ while let Some ( folder_id) = folder_ids. pop( ) {
227+ let mut next_page_token = None ;
228+ loop {
229+ let mut curr_rows = Vec :: new( ) ;
230+ let files = self
231+ . list_files( & folder_id, & fields, & mut next_page_token)
232+ . await ?;
233+ for file in files {
234+ curr_rows. extend( self . visit_file( file, & mut new_folder_ids, & mut seen_ids) ?) ;
235+ }
236+ if !curr_rows. is_empty( ) {
237+ yield curr_rows;
238+ }
239+ if next_page_token. is_none( ) {
240+ break ;
241+ }
242+ }
243+ folder_ids. extend( new_folder_ids. drain( ..) . rev( ) ) ;
244+ }
210245 }
211- Ok ( result . into_iter ( ) . collect ( ) )
246+ . boxed ( )
212247 }
213248
214249 async fn get_value ( & self , key : & KeyValue ) -> Result < Option < SourceData < ' async_trait > > > {
0 commit comments