@@ -7,10 +7,13 @@ use deltalake::datafusion::dataframe::DataFrameWriteOptions;
77use deltalake:: datafusion:: prelude:: * ;
88use deltalake:: delta_datafusion:: DeltaCdfTableProvider ;
99use deltalake:: { DeltaOps , DeltaResult } ;
10- use lambda_runtime:: { run, service_fn, tracing, Error , LambdaEvent } ;
11- use object_store:: prefix:: PrefixStore ;
10+ use lambda_runtime:: { Error , LambdaEvent , run, service_fn, tracing} ;
1211use object_store:: ObjectStore ;
12+ use object_store:: PutPayload ;
13+ use object_store:: path:: Path ;
14+ use object_store:: prefix:: PrefixStore ;
1315use oxbow_lambda_shared:: * ;
16+ use serde:: { Deserialize , Serialize } ;
1417use std:: sync:: Arc ;
1518use tracing:: log:: * ;
1619use url:: Url ;
@@ -98,13 +101,22 @@ async fn function_handler(event: LambdaEvent<SqsEvent>) -> DeltaResult<(), Error
98101
99102 let inserts = retrieve_inserts ( & ctx) . await ?;
100103 let deletes = retrieve_deletes ( & ctx) . await ?;
101- inserts
104+
105+ // write_csv will return a Vec<RecordBatch> which we can use for some rudimentary
106+ // statistics
107+ let inserts = inserts
102108 . write_csv ( "cdfo://inserts" , DataFrameWriteOptions :: default ( ) , None )
103109 . await ?;
104-
105- deletes
110+ let deletes = deletes
106111 . write_csv ( "cdfo://deletes" , DataFrameWriteOptions :: default ( ) , None )
107112 . await ?;
113+
114+ let completion = Completion {
115+ inserts : inserts. iter ( ) . map ( |rb| rb. num_rows ( ) ) . sum ( ) ,
116+ deletes : deletes. iter ( ) . map ( |rb| rb. num_rows ( ) ) . sum ( ) ,
117+ } ;
118+
119+ mark_complete ( store. clone ( ) , & completion) . await ?;
108120 } else {
109121 warn ! ( "Invoked but didn't find min/max trigger versions, something is fishy!" ) ;
110122 }
@@ -131,6 +143,7 @@ async fn retrieve_inserts(ctx: &SessionContext) -> DeltaResult<DataFrame> {
131143 ] ) ?)
132144}
133145
146+ /// Compute the deletes from the change data feed associated with the [SessionContext]
134147async fn retrieve_deletes ( ctx : & SessionContext ) -> DeltaResult < DataFrame > {
135148 let df = ctx
136149 . sql ( "SELECT * FROM cdf WHERE _change_type IN ('delete')" )
@@ -143,12 +156,33 @@ async fn retrieve_deletes(ctx: &SessionContext) -> DeltaResult<DataFrame> {
143156 ] ) ?)
144157}
145158
159+ /// Write a completion file to the given object store.
160+ ///
161+ /// This is expected to be the prefix store associated with a werite
162+ async fn mark_complete ( store : Arc < dyn ObjectStore > , completion : & Completion ) -> DeltaResult < ( ) > {
163+ // Write a sentinel file once the writes have completed successfully
164+ store
165+ . put (
166+ & Path :: from ( "cdf-completion.json" ) ,
167+ serde_json:: to_string ( completion)
168+ . expect ( "Failed to serialize Completion" )
169+ . into ( ) ,
170+ )
171+ . await ?;
172+ Ok ( ( ) )
173+ }
174+
175+ #[ derive( Clone , Debug , Deserialize , PartialEq , Serialize ) ]
176+ struct Completion {
177+ inserts : usize ,
178+ deletes : usize ,
179+ }
180+
146181#[ cfg( test) ]
147182mod tests {
148183 use super :: * ;
149184 use futures:: StreamExt ;
150- use object_store:: path:: Path ;
151- use object_store:: ObjectStore ;
185+ use object_store:: { GetResultPayload , ObjectStore } ;
152186
153187 use deltalake:: datafusion:: {
154188 common:: assert_batches_sorted_eq, dataframe:: DataFrameWriteOptions ,
@@ -161,6 +195,27 @@ mod tests {
161195 let ctx = SessionContext :: new ( ) ;
162196 Ok ( ( ctx, cdf) )
163197 }
198+
199+ #[ tokio:: test]
200+ async fn test_mark_complete ( ) -> DeltaResult < ( ) > {
201+ let store: Arc < dyn ObjectStore > = Arc :: new ( object_store:: memory:: InMemory :: new ( ) ) ;
202+ let completion = Completion {
203+ inserts : 1 ,
204+ deletes : 0 ,
205+ } ;
206+ mark_complete ( store. clone ( ) , & completion) . await ?;
207+ let _ = store. head ( & Path :: from ( "cdf-completion.json" ) ) . await ?;
208+
209+ let result = store. get ( & Path :: from ( "cdf-completion.json" ) ) . await ?;
210+ let bytes = result. bytes ( ) . await ?;
211+ let s = String :: from_utf8 ( bytes. to_vec ( ) ) . expect ( "Failed to convert buffer" ) ;
212+ let received: Completion = serde_json:: from_str ( & s) ?;
213+
214+ assert_eq ! ( completion, received) ;
215+
216+ Ok ( ( ) )
217+ }
218+
164219 #[ tokio:: test]
165220 async fn test_read_cdf_deletes ( ) -> DeltaResult < ( ) > {
166221 let ( ctx, cdf) = cdf_test_setup ( ) . await ?;
0 commit comments