1+ use std:: borrow:: Cow ;
12use std:: collections:: HashSet ;
23use std:: iter;
34use std:: ops:: Deref ;
@@ -8,8 +9,8 @@ use url::Url;
89use crate :: actions:: deletion_vector:: DeletionVectorPath ;
910use crate :: actions:: {
1011 as_log_add_schema, domain_metadata:: scan_domain_metadatas, get_log_commit_info_schema,
11- get_log_domain_metadata_schema, get_log_txn_schema , CommitInfo , DomainMetadata , SetTransaction ,
12- INTERNAL_DOMAIN_PREFIX ,
12+ get_log_domain_metadata_schema, get_log_remove_schema , get_log_txn_schema , CommitInfo ,
13+ DomainMetadata , SetTransaction , INTERNAL_DOMAIN_PREFIX ,
1314} ;
1415#[ cfg( feature = "catalog-managed" ) ]
1516use crate :: committer:: FileSystemCommitter ;
@@ -19,12 +20,16 @@ use crate::error::Error;
1920use crate :: expressions:: { ArrayData , Transform , UnaryExpressionOp :: ToJson } ;
2021use crate :: path:: LogRoot ;
2122use crate :: row_tracking:: { RowTrackingDomainMetadata , RowTrackingVisitor } ;
23+ use crate :: scan:: log_replay:: {
24+ BASE_ROW_ID_NAME , DEFAULT_ROW_COMMIT_VERSION_NAME , FILE_CONSTANT_VALUES_NAME , TAGS_NAME ,
25+ } ;
26+ use crate :: scan:: scan_row_schema;
2227use crate :: schema:: { ArrayType , MapType , SchemaRef , StructField , StructType } ;
2328use crate :: snapshot:: SnapshotRef ;
2429use crate :: utils:: current_time_ms;
2530use crate :: {
2631 DataType , DeltaResult , Engine , EngineData , Expression , ExpressionRef , IntoEngineData ,
27- RowVisitor , Version ,
32+ RowVisitor , SchemaTransform , Version ,
2833} ;
2934use delta_kernel_derive:: internal_api;
3035
@@ -127,6 +132,7 @@ pub struct Transaction {
127132 operation : Option < String > ,
128133 engine_info : Option < String > ,
129134 add_files_metadata : Vec < Box < dyn EngineData > > ,
135+ remove_files_metadata : Vec < FilteredEngineData > ,
130136 // NB: hashmap would require either duplicating the appid or splitting SetTransaction
131137 // key/payload. HashSet requires Borrow<&str> with matching Eq, Ord, and Hash. Plus,
132138 // HashSet::insert drops the to-be-inserted value without returning the existing one, which
@@ -181,6 +187,7 @@ impl Transaction {
181187 operation : None ,
182188 engine_info : None ,
183189 add_files_metadata : vec ! [ ] ,
190+ remove_files_metadata : vec ! [ ] ,
184191 set_transactions : vec ! [ ] ,
185192 commit_timestamp,
186193 domain_metadata_additions : vec ! [ ] ,
@@ -260,14 +267,17 @@ impl Transaction {
260267 let domain_metadata_actions =
261268 self . generate_domain_metadata_actions ( engine, row_tracking_domain_metadata) ?;
262269
263- // Step 5: Chain all our actions to be handed off to the Committer
270+ // Step 5: Generate remove actions
271+ let remove_actions = self . generate_remove_actions ( engine) ?;
272+
264273 let actions = iter:: once ( commit_info_action)
265274 . chain ( add_actions)
266275 . chain ( set_transaction_actions)
267276 . chain ( domain_metadata_actions) ;
268- // Convert EngineData to FilteredEngineData with all rows selected
277+
269278 let filtered_actions = actions
270- . map ( |action_result| action_result. map ( FilteredEngineData :: with_all_rows_selected) ) ;
279+ . map ( |action_result| action_result. map ( FilteredEngineData :: with_all_rows_selected) )
280+ . chain ( remove_actions) ;
271281
272282 // Step 6: Commit via the committer
273283 #[ cfg( feature = "catalog-managed" ) ]
@@ -693,6 +703,144 @@ impl Transaction {
693703 error,
694704 }
695705 }
706+ /// Remove files from the table in this transaction. This API generally enables the engine to
707+ /// delete data (at file-level granularity) from the table. Note that this API can be called
708+ /// multiple times to remove multiple batches.
709+ ///
710+ /// The expected schema for `remove_metadata` is given by [`scan_row_schema`]. It is expected
711+ /// this will be the result of passing [`FilteredEngineData`] returned from a scan
712+ /// with the selection vector modified to select rows for removal (selected rows in the selection vector are the ones to be removed).
713+ ///
714+ /// # Example
715+ ///
716+ /// ```no_run
717+ /// # use std::sync::Arc;
718+ /// # use delta_kernel::Engine;
719+ /// # use delta_kernel::snapshot::Snapshot;
720+ /// # #[cfg(feature = "catalog-managed")]
721+ /// # use delta_kernel::committer::FileSystemCommitter;
722+ /// # fn example(engine: Arc<dyn Engine>, table_url: url::Url) -> delta_kernel::DeltaResult<()> {
723+ /// # #[cfg(feature = "catalog-managed")]
724+ /// # {
725+ /// // Create a snapshot and transaction
726+ /// let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?;
727+ /// let mut txn = snapshot.clone().transaction(Box::new(FileSystemCommitter::new()))?;
728+ ///
729+ /// // Get file metadata from a scan
730+ /// let scan = snapshot.scan_builder().build()?;
731+ /// let scan_metadata = scan.scan_metadata(engine.as_ref())?;
732+ ///
733+ /// // Remove specific files based on scan metadata
734+ /// for metadata in scan_metadata {
735+ /// let metadata = metadata?;
736+ /// // In practice, you would modify the selection vector to choose which files to remove
737+ /// let files_to_remove = metadata.scan_files;
738+ /// txn.remove_files(files_to_remove);
739+ /// }
740+ ///
741+ /// // Commit the transaction
742+ /// txn.commit(engine.as_ref())?;
743+ /// # }
744+ /// # Ok(())
745+ /// # }
746+ /// ```
747+ pub fn remove_files ( & mut self , remove_metadata : FilteredEngineData ) {
748+ self . remove_files_metadata . push ( remove_metadata) ;
749+ }
750+
751+ fn generate_remove_actions < ' a > (
752+ & ' a self ,
753+ engine : & dyn Engine ,
754+ ) -> DeltaResult < impl Iterator < Item = DeltaResult < FilteredEngineData > > + Send + ' a > {
755+ // This is a workaround due to the fact that expression evaluation happens
756+ // on the whole EngineData instead of accounting for filtered rows, which can lead to null values in
757+ // required fields.
758+ // TODO: Move this to a common place (dedupe from data_skipping.rs) or remove when evaluations work
759+ // on FilteredEngineData directly.
760+ struct NullableStatsTransform ;
761+ impl < ' a > SchemaTransform < ' a > for NullableStatsTransform {
762+ fn transform_struct_field (
763+ & mut self ,
764+ field : & ' a StructField ,
765+ ) -> Option < Cow < ' a , StructField > > {
766+ use Cow :: * ;
767+ let field = match self . transform ( & field. data_type ) ? {
768+ Borrowed ( _) if field. is_nullable ( ) => Borrowed ( field) ,
769+ data_type => Owned ( StructField {
770+ name : field. name . clone ( ) ,
771+ data_type : data_type. into_owned ( ) ,
772+ nullable : true ,
773+ metadata : field. metadata . clone ( ) ,
774+ } ) ,
775+ } ;
776+ Some ( field)
777+ }
778+ }
779+
780+ let input_schema = scan_row_schema ( ) ;
781+ let target_schema = NullableStatsTransform
782+ . transform_struct ( get_log_remove_schema ( ) )
783+ . ok_or_else ( || Error :: generic ( "Failed to transform remove schema" ) ) ?
784+ . into_owned ( ) ;
785+ let evaluation_handler = engine. evaluation_handler ( ) ;
786+
787+ // Create the transform expression once, since it only contains literals and column references
788+ let transform = Expression :: transform (
789+ Transform :: new_top_level ( )
790+ // deletionTimestamp
791+ . with_inserted_field (
792+ Some ( "path" ) ,
793+ Expression :: literal ( self . commit_timestamp ) . into ( ) ,
794+ )
795+ // dataChange
796+ . with_inserted_field ( Some ( "path" ) , Expression :: literal ( self . data_change ) . into ( ) )
797+ . with_inserted_field (
798+ // extended_file_metadata
799+ Some ( "path" ) ,
800+ Expression :: literal ( true ) . into ( ) ,
801+ )
802+ . with_inserted_field (
803+ Some ( "path" ) ,
804+ Expression :: column ( [ FILE_CONSTANT_VALUES_NAME , "partitionValues" ] ) . into ( ) ,
805+ )
806+ // tags
807+ . with_inserted_field (
808+ Some ( "stats" ) ,
809+ Expression :: column ( [ FILE_CONSTANT_VALUES_NAME , TAGS_NAME ] ) . into ( ) ,
810+ )
811+ . with_inserted_field (
812+ Some ( "deletionVector" ) ,
813+ Expression :: column ( [ FILE_CONSTANT_VALUES_NAME , BASE_ROW_ID_NAME ] ) . into ( ) ,
814+ )
815+ . with_inserted_field (
816+ Some ( "deletionVector" ) ,
817+ Expression :: column ( [
818+ FILE_CONSTANT_VALUES_NAME ,
819+ DEFAULT_ROW_COMMIT_VERSION_NAME ,
820+ ] )
821+ . into ( ) ,
822+ )
823+ . with_dropped_field ( FILE_CONSTANT_VALUES_NAME )
824+ . with_dropped_field ( "modificationTime" ) ,
825+ ) ;
826+ let expr = Arc :: new ( Expression :: struct_from ( [ transform] ) ) ;
827+ let file_action_eval = Arc :: new ( evaluation_handler. new_expression_evaluator (
828+ input_schema. clone ( ) ,
829+ expr. clone ( ) ,
830+ target_schema. clone ( ) . into ( ) ,
831+ ) ?) ;
832+
833+ Ok ( self
834+ . remove_files_metadata
835+ . iter ( )
836+ . map ( move |file_metadata_batch| {
837+ let updated_engine_data = file_action_eval. evaluate ( file_metadata_batch. data ( ) ) ?;
838+ FilteredEngineData :: try_new (
839+ updated_engine_data,
840+ file_metadata_batch. selection_vector ( ) . to_vec ( ) ,
841+ )
842+ } ) )
843+ }
696844}
697845
698846/// WriteContext is data derived from a [`Transaction`] that can be provided to writers in order to
0 commit comments