@@ -3,7 +3,8 @@ use std::collections::{HashMap, HashSet};
33use std:: sync:: { Arc , LazyLock } ;
44
55use super :: data_skipping:: DataSkippingFilter ;
6- use super :: { PhysicalPredicate , ScanMetadata , StateInfo } ;
6+ use super :: state_info:: StateInfo ;
7+ use super :: { PhysicalPredicate , ScanMetadata } ;
78use crate :: actions:: deletion_vector:: DeletionVectorDescriptor ;
89use crate :: actions:: get_log_add_schema;
910use crate :: engine_data:: { GetData , RowVisitor , TypedGetData as _} ;
@@ -105,8 +106,9 @@ impl AddRemoveDedupVisitor<'_> {
105106 const ADD_PATH_INDEX : usize = 0 ; // Position of "add.path" in getters
106107 const ADD_PARTITION_VALUES_INDEX : usize = 1 ; // Position of "add.partitionValues" in getters
107108 const ADD_DV_START_INDEX : usize = 2 ; // Start position of add deletion vector columns
108- const REMOVE_PATH_INDEX : usize = 5 ; // Position of "remove.path" in getters
109- const REMOVE_DV_START_INDEX : usize = 6 ; // Start position of remove deletion vector columns
109+ const BASE_ROW_ID_INDEX : usize = 5 ; // Position of "add.baseRowId" in getters
110+ const REMOVE_PATH_INDEX : usize = 6 ; // Position of "remove.path" in getters
111+ const REMOVE_DV_START_INDEX : usize = 7 ; // Start position of remove deletion vector columns
110112
111113 fn new (
112114 seen : & mut HashSet < FileActionKey > ,
@@ -195,10 +197,19 @@ impl AddRemoveDedupVisitor<'_> {
195197 if self . deduplicator . check_and_record_seen ( file_key) || !is_add {
196198 return Ok ( false ) ;
197199 }
200+ let base_row_id: Option < i64 > =
201+ getters[ Self :: BASE_ROW_ID_INDEX ] . get_opt ( i, "add.baseRowId" ) ?;
198202 let transform = self
199203 . transform_spec
200204 . as_ref ( )
201- . map ( |transform| get_transform_expr ( transform, partition_values, & self . physical_schema ) )
205+ . map ( |transform| {
206+ get_transform_expr (
207+ transform,
208+ partition_values,
209+ & self . physical_schema ,
210+ base_row_id,
211+ )
212+ } )
202213 . transpose ( ) ?;
203214 if transform. is_some ( ) {
204215 // fill in any needed `None`s for previous rows
@@ -215,13 +226,15 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> {
215226 static NAMES_AND_TYPES : LazyLock < ColumnNamesAndTypes > = LazyLock :: new ( || {
216227 const STRING : DataType = DataType :: STRING ;
217228 const INTEGER : DataType = DataType :: INTEGER ;
229+ const LONG : DataType = DataType :: LONG ;
218230 let ss_map: DataType = MapType :: new ( STRING , STRING , true ) . into ( ) ;
219231 let types_and_names = vec ! [
220232 ( STRING , column_name!( "add.path" ) ) ,
221233 ( ss_map, column_name!( "add.partitionValues" ) ) ,
222234 ( STRING , column_name!( "add.deletionVector.storageType" ) ) ,
223235 ( STRING , column_name!( "add.deletionVector.pathOrInlineDv" ) ) ,
224236 ( INTEGER , column_name!( "add.deletionVector.offset" ) ) ,
237+ ( LONG , column_name!( "add.baseRowId" ) ) ,
225238 ( STRING , column_name!( "remove.path" ) ) ,
226239 ( STRING , column_name!( "remove.deletionVector.storageType" ) ) ,
227240 ( STRING , column_name!( "remove.deletionVector.pathOrInlineDv" ) ) ,
@@ -236,13 +249,13 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> {
236249 } else {
237250 // All checkpoint actions are already reconciled and Remove actions in checkpoint files
238251 // only serve as tombstones for vacuum jobs. So we only need to examine the adds here.
239- ( & names[ ..5 ] , & types[ ..5 ] )
252+ ( & names[ ..6 ] , & types[ ..6 ] )
240253 }
241254 }
242255
243256 fn visit < ' a > ( & mut self , row_count : usize , getters : & [ & ' a dyn GetData < ' a > ] ) -> DeltaResult < ( ) > {
244257 let is_log_batch = self . deduplicator . is_log_batch ( ) ;
245- let expected_getters = if is_log_batch { 9 } else { 5 } ;
258+ let expected_getters = if is_log_batch { 10 } else { 6 } ;
246259 require ! (
247260 getters. len( ) == expected_getters,
248261 Error :: InternalError ( format!(
@@ -266,8 +279,10 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> {
266279pub ( crate ) static SCAN_ROW_SCHEMA : LazyLock < Arc < StructType > > = LazyLock :: new ( || {
267280 // Note that fields projected out of a nullable struct must be nullable
268281 let partition_values = MapType :: new ( DataType :: STRING , DataType :: STRING , true ) ;
269- let file_constant_values =
270- StructType :: new_unchecked ( [ StructField :: nullable ( "partitionValues" , partition_values) ] ) ;
282+ let file_constant_values = StructType :: new_unchecked ( [
283+ StructField :: nullable ( "partitionValues" , partition_values) ,
284+ StructField :: nullable ( "baseRowId" , DataType :: LONG ) ,
285+ ] ) ;
271286 Arc :: new ( StructType :: new_unchecked ( [
272287 StructField :: nullable ( "path" , DataType :: STRING ) ,
273288 StructField :: nullable ( "size" , DataType :: LONG ) ,
@@ -290,9 +305,10 @@ fn get_add_transform_expr() -> ExpressionRef {
290305 column_expr_ref!( "add.modificationTime" ) ,
291306 column_expr_ref!( "add.stats" ) ,
292307 column_expr_ref!( "add.deletionVector" ) ,
293- Arc :: new( Expression :: Struct ( vec![ column_expr_ref!(
294- "add.partitionValues"
295- ) ] ) ) ,
308+ Arc :: new( Expression :: Struct ( vec![
309+ column_expr_ref!( "add.partitionValues" ) ,
310+ column_expr_ref!( "add.baseRowId" ) ,
311+ ] ) ) ,
296312 ] ) )
297313 } ) ;
298314 EXPR . clone ( )
@@ -311,6 +327,7 @@ pub(crate) fn get_scan_metadata_transform_expr() -> ExpressionRef {
311327 column_expr_ref!( "modificationTime" ) ,
312328 column_expr_ref!( "stats" ) ,
313329 column_expr_ref!( "deletionVector" ) ,
330+ column_expr_ref!( "fileConstantValues.baseRowId" ) ,
314331 ] ,
315332 ) ) ] ) )
316333 } ) ;
@@ -377,15 +394,19 @@ mod tests {
377394 use std:: { collections:: HashMap , sync:: Arc } ;
378395
379396 use crate :: actions:: get_log_schema;
380- use crate :: expressions:: Scalar ;
397+ use crate :: expressions:: { BinaryExpressionOp , Scalar , VariadicExpressionOp } ;
381398 use crate :: log_replay:: ActionsBatch ;
382399 use crate :: scan:: state:: { DvInfo , Stats } ;
400+ use crate :: scan:: state_info:: tests:: {
401+ assert_transform_spec, get_simple_state_info, get_state_info,
402+ } ;
403+ use crate :: scan:: state_info:: StateInfo ;
383404 use crate :: scan:: test_utils:: {
384- add_batch_simple , add_batch_with_partition_col , add_batch_with_remove ,
385- run_with_validate_callback,
405+ add_batch_for_row_id , add_batch_simple , add_batch_with_partition_col ,
406+ add_batch_with_remove , run_with_validate_callback,
386407 } ;
387- use crate :: scan:: { PhysicalPredicate , StateInfo } ;
388- use crate :: table_features :: ColumnMappingMode ;
408+ use crate :: scan:: PhysicalPredicate ;
409+ use crate :: schema :: MetadataColumnSpec ;
389410 use crate :: Expression as Expr ;
390411 use crate :: {
391412 engine:: sync:: SyncEngine ,
@@ -473,15 +494,8 @@ mod tests {
473494 StructField :: new ( "value" , DataType :: INTEGER , true ) ,
474495 StructField :: new ( "date" , DataType :: DATE , true ) ,
475496 ] ) ) ;
476- let partition_cols = [ "date" . to_string ( ) ] ;
477- let state_info = StateInfo :: try_new (
478- schema. clone ( ) ,
479- & partition_cols,
480- ColumnMappingMode :: None ,
481- None ,
482- crate :: scan:: field_classifiers:: ScanTransformFieldClassifier ,
483- )
484- . unwrap ( ) ;
497+ let partition_cols = vec ! [ "date" . to_string( ) ] ;
498+ let state_info = get_simple_state_info ( schema, partition_cols) . unwrap ( ) ;
485499 let batch = vec ! [ add_batch_with_partition_col( ) ] ;
486500 let iter = scan_action_iter (
487501 & SyncEngine :: new ( ) ,
@@ -525,4 +539,77 @@ mod tests {
525539 validate_transform ( transforms[ 3 ] . as_ref ( ) , 17510 ) ;
526540 }
527541 }
542+
543+ #[ test]
544+ fn test_row_id_transform ( ) {
545+ let schema: SchemaRef = Arc :: new ( StructType :: new_unchecked ( [ StructField :: new (
546+ "value" ,
547+ DataType :: INTEGER ,
548+ true ,
549+ ) ] ) ) ;
550+ let state_info = get_state_info (
551+ schema. clone ( ) ,
552+ vec ! [ ] ,
553+ None ,
554+ [
555+ ( "delta.enableRowTracking" , "true" ) ,
556+ (
557+ "delta.rowTracking.materializedRowIdColumnName" ,
558+ "row_id_col" ,
559+ ) ,
560+ ]
561+ . iter ( )
562+ . map ( |( k, v) | ( k. to_string ( ) , v. to_string ( ) ) )
563+ . collect ( ) ,
564+ vec ! [ ( "row_id" , MetadataColumnSpec :: RowId ) ] ,
565+ )
566+ . unwrap ( ) ;
567+
568+ let transform_spec = state_info. transform_spec . as_ref ( ) . unwrap ( ) ;
569+ assert_transform_spec (
570+ transform_spec,
571+ false ,
572+ "row_id_col" ,
573+ "row_indexes_for_row_id_0" ,
574+ ) ;
575+
576+ let batch = vec ! [ add_batch_for_row_id( get_log_schema( ) . clone( ) ) ] ;
577+ let iter = scan_action_iter (
578+ & SyncEngine :: new ( ) ,
579+ batch
580+ . into_iter ( )
581+ . map ( |batch| Ok ( ActionsBatch :: new ( batch as _ , true ) ) ) ,
582+ Arc :: new ( state_info) ,
583+ ) ;
584+
585+ for res in iter {
586+ let scan_metadata = res. unwrap ( ) ;
587+ let transforms = scan_metadata. scan_file_transforms ;
588+ assert_eq ! ( transforms. len( ) , 1 , "Should have 1 transform" ) ;
589+ if let Some ( Expr :: Transform ( transform_expr) ) = transforms[ 0 ] . as_ref ( ) . map ( Arc :: as_ref) {
590+ assert ! ( transform_expr. input_path. is_none( ) ) ;
591+ let row_id_transform = transform_expr
592+ . field_transforms
593+ . get ( "row_id_col" )
594+ . expect ( "Should have row_id_col transform" ) ;
595+ assert ! ( row_id_transform. is_replace) ;
596+ assert_eq ! ( row_id_transform. exprs. len( ) , 1 ) ;
597+ let expr = & row_id_transform. exprs [ 0 ] ;
598+ let expected_expr = Arc :: new ( Expr :: variadic (
599+ VariadicExpressionOp :: Coalesce ,
600+ vec ! [
601+ Expr :: column( [ "row_id_col" ] ) ,
602+ Expr :: binary(
603+ BinaryExpressionOp :: Plus ,
604+ Expr :: literal( 42i64 ) ,
605+ Expr :: column( [ "row_indexes_for_row_id_0" ] ) ,
606+ ) ,
607+ ] ,
608+ ) ) ;
609+ assert_eq ! ( expr, & expected_expr) ;
610+ } else {
611+ panic ! ( "Should have been a transform expression" ) ;
612+ }
613+ }
614+ }
528615}
0 commit comments