Skip to content

Commit 2944ccb

Browse files
fix: Serialize split_offsets as null when empty (#1906)
- Change `split_offsets` in `DataFile` from `Vec<i64>` to `Option<Vec<i64>>` - Empty values now serialize as `null` instead of `[]` - Aligns with Iceberg spec (field is optional) Closes #1897 --------- Co-authored-by: Renjie Liu <[email protected]>
1 parent aaa700d commit 2944ccb

File tree

11 files changed

+52
-50
lines changed

11 files changed

+52
-50
lines changed

bindings/python/src/data_file.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ impl PyDataFile {
143143
}
144144

145145
#[getter]
146-
fn split_offsets(&self) -> &[i64] {
146+
fn split_offsets(&self) -> Option<&[i64]> {
147147
self.inner.split_offsets()
148148
}
149149

crates/iceberg/src/expr/visitors/expression_evaluator.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ mod tests {
346346
lower_bounds: HashMap::new(),
347347
upper_bounds: HashMap::new(),
348348
key_metadata: None,
349-
split_offsets: vec![],
349+
split_offsets: None,
350350
equality_ids: None,
351351
sort_order_id: None,
352352
partition_spec_id: 0,
@@ -374,7 +374,7 @@ mod tests {
374374
lower_bounds: HashMap::new(),
375375
upper_bounds: HashMap::new(),
376376
key_metadata: None,
377-
split_offsets: vec![],
377+
split_offsets: None,
378378
equality_ids: None,
379379
sort_order_id: None,
380380
partition_spec_id: 0,

crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1995,7 +1995,7 @@ mod test {
19951995
lower_bounds: Default::default(),
19961996
upper_bounds: Default::default(),
19971997
key_metadata: None,
1998-
split_offsets: vec![],
1998+
split_offsets: None,
19991999
equality_ids: None,
20002000
sort_order_id: None,
20012001
partition_spec_id: 0,
@@ -2021,7 +2021,7 @@ mod test {
20212021
lower_bounds: Default::default(),
20222022
upper_bounds: Default::default(),
20232023
key_metadata: None,
2024-
split_offsets: vec![],
2024+
split_offsets: None,
20252025
equality_ids: None,
20262026
sort_order_id: None,
20272027
partition_spec_id: 0,
@@ -2083,7 +2083,7 @@ mod test {
20832083

20842084
column_sizes: Default::default(),
20852085
key_metadata: None,
2086-
split_offsets: vec![],
2086+
split_offsets: None,
20872087
equality_ids: None,
20882088
sort_order_id: None,
20892089
partition_spec_id: 0,
@@ -2114,7 +2114,7 @@ mod test {
21142114

21152115
column_sizes: Default::default(),
21162116
key_metadata: None,
2117-
split_offsets: vec![],
2117+
split_offsets: None,
21182118
equality_ids: None,
21192119
sort_order_id: None,
21202120
partition_spec_id: 0,
@@ -2146,7 +2146,7 @@ mod test {
21462146

21472147
column_sizes: Default::default(),
21482148
key_metadata: None,
2149-
split_offsets: vec![],
2149+
split_offsets: None,
21502150
equality_ids: None,
21512151
sort_order_id: None,
21522152
partition_spec_id: 0,
@@ -2178,7 +2178,7 @@ mod test {
21782178

21792179
column_sizes: Default::default(),
21802180
key_metadata: None,
2181-
split_offsets: vec![],
2181+
split_offsets: None,
21822182
equality_ids: None,
21832183
sort_order_id: None,
21842184
partition_spec_id: 0,

crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@ mod test {
578578
]),
579579
column_sizes: Default::default(),
580580
key_metadata: None,
581-
split_offsets: vec![],
581+
split_offsets: None,
582582
equality_ids: None,
583583
sort_order_id: None,
584584
partition_spec_id: 0,
@@ -604,7 +604,7 @@ mod test {
604604
lower_bounds: Default::default(),
605605
upper_bounds: Default::default(),
606606
key_metadata: None,
607-
split_offsets: vec![],
607+
split_offsets: None,
608608
equality_ids: None,
609609
sort_order_id: None,
610610
partition_spec_id: 0,
@@ -630,7 +630,7 @@ mod test {
630630
upper_bounds: HashMap::from([(1, Datum::int(42))]),
631631
column_sizes: Default::default(),
632632
key_metadata: None,
633-
split_offsets: vec![],
633+
split_offsets: None,
634634
equality_ids: None,
635635
sort_order_id: None,
636636
partition_spec_id: 0,
@@ -657,7 +657,7 @@ mod test {
657657
upper_bounds: HashMap::from([(3, Datum::string("dC"))]),
658658
column_sizes: Default::default(),
659659
key_metadata: None,
660-
split_offsets: vec![],
660+
split_offsets: None,
661661
equality_ids: None,
662662
sort_order_id: None,
663663
partition_spec_id: 0,

crates/iceberg/src/spec/manifest/_serde.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ impl DataFileSerde {
153153
lower_bounds: Some(to_bytes_entry(value.lower_bounds)?),
154154
upper_bounds: Some(to_bytes_entry(value.upper_bounds)?),
155155
key_metadata: value.key_metadata.map(serde_bytes::ByteBuf::from),
156-
split_offsets: Some(value.split_offsets),
156+
split_offsets: value.split_offsets,
157157
equality_ids: value.equality_ids,
158158
sort_order_id: value.sort_order_id,
159159
first_row_id: value.first_row_id,
@@ -222,7 +222,7 @@ impl DataFileSerde {
222222
.transpose()?
223223
.unwrap_or_default(),
224224
key_metadata: self.key_metadata.map(|v| v.to_vec()),
225-
split_offsets: self.split_offsets.unwrap_or_default(),
225+
split_offsets: self.split_offsets,
226226
equality_ids: self.equality_ids,
227227
sort_order_id: self.sort_order_id,
228228
partition_spec_id,
@@ -380,7 +380,7 @@ mod tests {
380380
lower_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
381381
upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
382382
key_metadata: None,
383-
split_offsets: vec![4],
383+
split_offsets: Some(vec![4]),
384384
equality_ids: None,
385385
sort_order_id: Some(0),
386386
partition_spec_id: 0,

crates/iceberg/src/spec/manifest/data_file.rs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,10 @@ pub struct DataFile {
127127
/// element field id: 133
128128
///
129129
/// Split offsets for the data file. For example, all row group offsets
130-
/// in a Parquet file. Must be sorted ascending
130+
/// in a Parquet file. Must be sorted ascending. Optional field that
131+
/// should be serialized as null when not present.
131132
#[builder(default)]
132-
pub(crate) split_offsets: Vec<i64>,
133+
pub(crate) split_offsets: Option<Vec<i64>>,
133134
/// field id: 135
134135
/// element field id: 136
135136
///
@@ -247,8 +248,9 @@ impl DataFile {
247248
}
248249
/// Get the split offsets of the data file.
249250
/// For example, all row group offsets in a Parquet file.
250-
pub fn split_offsets(&self) -> &[i64] {
251-
&self.split_offsets
251+
/// Returns `None` if no split offsets are present.
252+
pub fn split_offsets(&self) -> Option<&[i64]> {
253+
self.split_offsets.as_deref()
252254
}
253255
/// Get the equality ids of the data file.
254256
/// Field ids used to determine row equality in equality delete files.

crates/iceberg/src/spec/manifest/mod.rs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ mod tests {
257257
snapshot_id: None,
258258
sequence_number: None,
259259
file_sequence_number: None,
260-
data_file: DataFile {content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),null_value_counts:HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),nan_value_counts:HashMap::new(),lower_bounds:HashMap::new(),upper_bounds:HashMap::new(),key_metadata:None,split_offsets:vec![4],equality_ids:Some(Vec::new()),sort_order_id:None, partition_spec_id: 0,first_row_id: None,referenced_data_file: None,content_offset: None,content_size_in_bytes: None }
260+
data_file: DataFile {content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),null_value_counts:HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),nan_value_counts:HashMap::new(),lower_bounds:HashMap::new(),upper_bounds:HashMap::new(),key_metadata:None,split_offsets:Some(vec![4]),equality_ids:Some(Vec::new()),sort_order_id:None, partition_spec_id: 0,first_row_id: None,referenced_data_file: None,content_offset: None,content_size_in_bytes: None }
261261
}
262262
];
263263

@@ -435,7 +435,7 @@ mod tests {
435435
lower_bounds: HashMap::new(),
436436
upper_bounds: HashMap::new(),
437437
key_metadata: None,
438-
split_offsets: vec![4],
438+
split_offsets: Some(vec![4]),
439439
equality_ids: Some(Vec::new()),
440440
sort_order_id: None,
441441
partition_spec_id: 0,
@@ -532,7 +532,7 @@ mod tests {
532532
lower_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
533533
upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
534534
key_metadata: None,
535-
split_offsets: vec![4],
535+
split_offsets: Some(vec![4]),
536536
equality_ids: None,
537537
sort_order_id: Some(0),
538538
partition_spec_id: 0,
@@ -640,7 +640,7 @@ mod tests {
640640
(3, Datum::string("x"))
641641
]),
642642
key_metadata: None,
643-
split_offsets: vec![4],
643+
split_offsets: Some(vec![4]),
644644
equality_ids: None,
645645
sort_order_id: Some(0),
646646
partition_spec_id: 0,
@@ -749,7 +749,7 @@ mod tests {
749749
(3, Datum::string("x"))
750750
]),
751751
key_metadata: None,
752-
split_offsets: vec![4],
752+
split_offsets: Some(vec![4]),
753753
equality_ids: None,
754754
sort_order_id: None,
755755
partition_spec_id: 0,
@@ -840,7 +840,7 @@ mod tests {
840840
(2, Datum::int(2)),
841841
]),
842842
key_metadata: None,
843-
split_offsets: vec![4],
843+
split_offsets: Some(vec![4]),
844844
equality_ids: None,
845845
sort_order_id: None,
846846
partition_spec_id: 0,
@@ -922,7 +922,7 @@ mod tests {
922922
lower_bounds: HashMap::new(),
923923
upper_bounds: HashMap::new(),
924924
key_metadata: None,
925-
split_offsets: vec![4],
925+
split_offsets: Some(vec![4]),
926926
equality_ids: None,
927927
sort_order_id: None,
928928
partition_spec_id: 0,
@@ -957,7 +957,7 @@ mod tests {
957957
lower_bounds: HashMap::new(),
958958
upper_bounds: HashMap::new(),
959959
key_metadata: None,
960-
split_offsets: vec![4],
960+
split_offsets: Some(vec![4]),
961961
equality_ids: None,
962962
sort_order_id: None,
963963
partition_spec_id: 0,
@@ -992,7 +992,7 @@ mod tests {
992992
lower_bounds: HashMap::new(),
993993
upper_bounds: HashMap::new(),
994994
key_metadata: None,
995-
split_offsets: vec![4],
995+
split_offsets: Some(vec![4]),
996996
equality_ids: None,
997997
sort_order_id: None,
998998
partition_spec_id: 0,
@@ -1027,7 +1027,7 @@ mod tests {
10271027
lower_bounds: HashMap::new(),
10281028
upper_bounds: HashMap::new(),
10291029
key_metadata: None,
1030-
split_offsets: vec![4],
1030+
split_offsets: Some(vec![4]),
10311031
equality_ids: None,
10321032
sort_order_id: None,
10331033
partition_spec_id: 0,
@@ -1182,7 +1182,7 @@ mod tests {
11821182
"lower_bounds": [],
11831183
"upper_bounds": [],
11841184
"key_metadata": null,
1185-
"split_offsets": [],
1185+
"split_offsets": null,
11861186
"equality_ids": null,
11871187
"sort_order_id": null,
11881188
"first_row_id": null,
@@ -1213,7 +1213,7 @@ mod tests {
12131213
"lower_bounds": [],
12141214
"upper_bounds": [],
12151215
"key_metadata": null,
1216-
"split_offsets": [],
1216+
"split_offsets": null,
12171217
"equality_ids": null,
12181218
"sort_order_id": null,
12191219
"first_row_id": null,

crates/iceberg/src/spec/manifest/writer.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -608,7 +608,7 @@ mod tests {
608608
lower_bounds: HashMap::new(),
609609
upper_bounds: HashMap::new(),
610610
key_metadata: Some(Vec::new()),
611-
split_offsets: vec![4],
611+
split_offsets: Some(vec![4]),
612612
equality_ids: None,
613613
sort_order_id: None,
614614
partition_spec_id: 0,
@@ -637,7 +637,7 @@ mod tests {
637637
lower_bounds: HashMap::new(),
638638
upper_bounds: HashMap::new(),
639639
key_metadata: Some(Vec::new()),
640-
split_offsets: vec![4],
640+
split_offsets: Some(vec![4]),
641641
equality_ids: None,
642642
sort_order_id: None,
643643
partition_spec_id: 0,
@@ -666,7 +666,7 @@ mod tests {
666666
lower_bounds: HashMap::new(),
667667
upper_bounds: HashMap::new(),
668668
key_metadata: Some(Vec::new()),
669-
split_offsets: vec![4],
669+
split_offsets: Some(vec![4]),
670670
equality_ids: None,
671671
sort_order_id: None,
672672
partition_spec_id: 0,

crates/iceberg/src/spec/snapshot_summary.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -767,7 +767,7 @@ mod tests {
767767
(3, Datum::string("x")),
768768
]),
769769
key_metadata: None,
770-
split_offsets: vec![4],
770+
split_offsets: Some(vec![4]),
771771
equality_ids: None,
772772
sort_order_id: Some(0),
773773
partition_spec_id: 0,
@@ -799,7 +799,7 @@ mod tests {
799799
(3, Datum::string("x")),
800800
]),
801801
key_metadata: None,
802-
split_offsets: vec![4],
802+
split_offsets: Some(vec![4]),
803803
equality_ids: None,
804804
sort_order_id: Some(0),
805805
partition_spec_id: 0,
@@ -910,7 +910,7 @@ mod tests {
910910
lower_bounds: HashMap::new(),
911911
upper_bounds: HashMap::new(),
912912
key_metadata: None,
913-
split_offsets: vec![],
913+
split_offsets: None,
914914
equality_ids: None,
915915
sort_order_id: None,
916916
partition_spec_id: 0,
@@ -938,7 +938,7 @@ mod tests {
938938
lower_bounds: HashMap::new(),
939939
upper_bounds: HashMap::new(),
940940
key_metadata: None,
941-
split_offsets: vec![],
941+
split_offsets: None,
942942
equality_ids: None,
943943
sort_order_id: None,
944944
partition_spec_id: 0,
@@ -993,7 +993,7 @@ mod tests {
993993
lower_bounds: HashMap::new(),
994994
upper_bounds: HashMap::new(),
995995
key_metadata: None,
996-
split_offsets: vec![],
996+
split_offsets: None,
997997
equality_ids: None,
998998
sort_order_id: None,
999999
partition_spec_id: 0,

crates/iceberg/src/writer/base_writer/equality_delete_writer.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -293,15 +293,15 @@ mod test {
293293
assert_eq!(*data_file.null_value_counts.get(id).unwrap(), expect);
294294
}
295295

296-
assert_eq!(data_file.split_offsets.len(), metadata.num_row_groups());
297-
data_file
296+
let split_offsets = data_file
298297
.split_offsets
299-
.iter()
300-
.enumerate()
301-
.for_each(|(i, &v)| {
302-
let expect = metadata.row_groups()[i].file_offset().unwrap();
303-
assert_eq!(v, expect);
304-
});
298+
.as_ref()
299+
.expect("split_offsets should be set");
300+
assert_eq!(split_offsets.len(), metadata.num_row_groups());
301+
split_offsets.iter().enumerate().for_each(|(i, &v)| {
302+
let expect = metadata.row_groups()[i].file_offset().unwrap();
303+
assert_eq!(v, expect);
304+
});
305305
}
306306

307307
#[tokio::test]

0 commit comments

Comments
 (0)