Skip to content

Commit 13f25cd

Browse files
committed
feat: Don't drop additional statistics
This is a behavioral change. In Iceberg-Rust we require upper/lower bounds to be part of the schema. But in some cases, this in't the case, most obvious schema evolution. In PyIceberg we expect these values in some tests: ``` FAILED tests/integration/test_inspect_table.py::test_inspect_files[2] - AssertionError: Difference in column lower_bounds: {} != {2147483546: b's3://warehouse/default/table_metadata_files/data/00000-0-8d621c18-079b-4217-afd8-559ce216e875.parquet', 2147483545: b'\x00\x00\x00\x00\x00\x00\x00\x00'} assert {} == {2147483545: ...e875.parquet'} Right contains 2 more items: {2147483545: b'\x00\x00\x00\x00\x00\x00\x00\x00', 2147483546: b's3://warehouse/default/table_metadata_files/data/00000-0-8d621c1' b'8-079b-4217-afd8-559ce216e875.parquet'} Full diff: { + , - 2147483545: b'\x00\x00\x00\x00\x00\x00\x00\x00', - 2147483546: b's3://warehouse/default/table_metadata_files/data/00000-0-8d621c1' - b'8-079b-4217-afd8-559ce216e875.parquet', } !!!!!!!!!!!!!!!!!!!!!!!!!! stopping after 1 failures !!!!!!!!!!!!!!!!!!!!!!!!!!! ==== 1 failed, 238 passed, 32 skipped, 3123 deselected in 61.56s (0:01:01) ===== ``` This is a positional delete where the field-IDs are constant, but never part of a schema (they are reserved).
1 parent 1fcad93 commit 13f25cd

File tree

2 files changed

+58
-6
lines changed

2 files changed

+58
-6
lines changed

crates/iceberg/src/spec/manifest/_serde.rs

Lines changed: 54 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ struct BytesEntry {
245245
fn parse_bytes_entry(v: Vec<BytesEntry>, schema: &Schema) -> Result<HashMap<i32, Datum>, Error> {
246246
let mut m = HashMap::with_capacity(v.len());
247247
for entry in v {
248-
// We ignore the entry if the field is not found in the schema, due to schema evolution.
248+
// Try to find the field in the schema to get proper type information
249249
if let Some(field) = schema.field_by_id(entry.key) {
250250
let data_type = field
251251
.field_type
@@ -258,6 +258,10 @@ fn parse_bytes_entry(v: Vec<BytesEntry>, schema: &Schema) -> Result<HashMap<i32,
258258
})?
259259
.clone();
260260
m.insert(entry.key, Datum::try_from_bytes(&entry.value, data_type)?);
261+
} else {
262+
// Field is not in current schema (e.g., dropped field due to schema evolution).
263+
// Store the statistic as binary data to preserve it even though we don't know its type.
264+
m.insert(entry.key, Datum::binary(entry.value.to_vec()));
261265
}
262266
}
263267
Ok(m)
@@ -320,11 +324,11 @@ mod tests {
320324
use std::io::Cursor;
321325
use std::sync::Arc;
322326

323-
use crate::spec::manifest::_serde::{I64Entry, parse_i64_entry};
327+
use crate::spec::manifest::_serde::{BytesEntry, I64Entry, parse_bytes_entry, parse_i64_entry};
324328
use crate::spec::{
325329
DataContentType, DataFile, DataFileFormat, Datum, FormatVersion, NestedField,
326-
PrimitiveType, Schema, Struct, StructType, Type, read_data_files_from_avro,
327-
write_data_files_to_avro,
330+
PrimitiveLiteral, PrimitiveType, Schema, Struct, StructType, Type,
331+
read_data_files_from_avro, write_data_files_to_avro,
328332
};
329333

330334
#[test]
@@ -590,4 +594,50 @@ mod tests {
590594
assert_eq!(data_file.file_size_in_bytes, 2048);
591595
assert_eq!(data_file.partition_spec_id, 0);
592596
}
597+
598+
#[test]
599+
fn test_parse_bytes_entry_preserves_dropped_field_statistics() {
600+
use serde_bytes::ByteBuf;
601+
602+
// Create a schema with only field ID 1
603+
let schema = Schema::builder()
604+
.with_fields(vec![Arc::new(NestedField::required(
605+
1,
606+
"existing_field",
607+
Type::Primitive(PrimitiveType::Int),
608+
))])
609+
.build()
610+
.unwrap();
611+
612+
// Create entries for field ID 1 (exists in schema) and field ID 2 (dropped from schema)
613+
let entries = vec![
614+
BytesEntry {
615+
key: 1,
616+
value: ByteBuf::from(vec![42, 0, 0, 0]), // int value 42 in little-endian
617+
},
618+
BytesEntry {
619+
key: 2, // This field is not in the schema
620+
value: ByteBuf::from(vec![1, 2, 3, 4]), // Some arbitrary bytes
621+
},
622+
];
623+
624+
let result = parse_bytes_entry(entries, &schema).unwrap();
625+
626+
// Both statistics should be preserved
627+
assert_eq!(result.len(), 2, "Both statistics should be preserved");
628+
629+
// Field 1 should be properly deserialized as Int
630+
let datum1 = result.get(&1).expect("Field 1 should exist");
631+
assert_eq!(datum1.data_type(), &PrimitiveType::Int);
632+
assert_eq!(datum1.to_string(), "42");
633+
634+
// Field 2 should be preserved as Binary since it's not in the schema
635+
let datum2 = result.get(&2).expect("Field 2 should be preserved");
636+
assert_eq!(datum2.data_type(), &PrimitiveType::Binary);
637+
if let PrimitiveLiteral::Binary(bytes) = datum2.literal() {
638+
assert_eq!(bytes, &vec![1, 2, 3, 4]);
639+
} else {
640+
panic!("Field 2 should be stored as Binary");
641+
}
642+
}
593643
}

crates/iceberg/src/spec/manifest/mod.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -783,8 +783,8 @@ mod tests {
783783
Manifest::parse_avro(fs::read(path).expect("read_file must succeed").as_slice())
784784
.unwrap();
785785

786-
// Compared with original manifest, the lower_bounds and upper_bounds no longer has data for field 3, and
787-
// other parts should be same.
786+
// Compared with original manifest, the lower_bounds and upper_bounds now PRESERVE data for field 3
787+
// as binary (since field 3 is not in the current schema), and other parts should be same.
788788
// The snapshot id is assigned when the entry is added to the manifest.
789789
let schema = Arc::new(
790790
Schema::builder()
@@ -834,10 +834,12 @@ mod tests {
834834
lower_bounds: HashMap::from([
835835
(1, Datum::long(1)),
836836
(2, Datum::int(2)),
837+
(3, Datum::binary(vec![120])), // Field 3 preserved as binary
837838
]),
838839
upper_bounds: HashMap::from([
839840
(1, Datum::long(1)),
840841
(2, Datum::int(2)),
842+
(3, Datum::binary(vec![120])), // Field 3 preserved as binary
841843
]),
842844
key_metadata: None,
843845
split_offsets: vec![4],

0 commit comments

Comments
 (0)