Skip to content

Commit 644567d

Browse files
committed
fix: arrow schema not constructed with metadata in unrelated test
1 parent e24406b commit 644567d

File tree

5 files changed

+37
-7
lines changed

5 files changed

+37
-7
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/iceberg/src/arrow/value.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,7 @@ impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
436436
"The schema partner is not a struct type",
437437
));
438438
}
439+
439440
Ok(schema_partner)
440441
}
441442

@@ -453,6 +454,7 @@ impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
453454
"The struct partner is not a struct array",
454455
)
455456
})?;
457+
456458
let field_pos = struct_array
457459
.fields()
458460
.iter()
@@ -467,6 +469,7 @@ impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
467469
format!("Field id {} not found in struct array", field.id),
468470
)
469471
})?;
472+
470473
Ok(struct_array.column(field_pos))
471474
}
472475

crates/iceberg/src/writer/base_writer/data_file_writer.rs

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,13 @@ impl<B: FileWriterBuilder> CurrentFileStatus for DataFileWriter<B> {
9898

9999
#[cfg(test)]
100100
mod test {
101+
use std::collections::HashMap;
101102
use std::sync::Arc;
102103

103104
use arrow_array::{Int32Array, StringArray};
104105
use arrow_schema::{DataType, Field};
105106
use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
107+
use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
106108
use parquet::file::properties::WriterProperties;
107109
use tempfile::TempDir;
108110

@@ -145,8 +147,14 @@ mod test {
145147
let mut data_file_writer = DataFileWriterBuilder::new(pw, None).build().await.unwrap();
146148

147149
let arrow_schema = arrow_schema::Schema::new(vec![
148-
Field::new("foo", DataType::Int32, false),
149-
Field::new("bar", DataType::Utf8, false),
150+
Field::new("foo", DataType::Int32, false).with_metadata(HashMap::from([(
151+
PARQUET_FIELD_ID_META_KEY.to_string(),
152+
3.to_string(),
153+
)])),
154+
Field::new("bar", DataType::Utf8, false).with_metadata(HashMap::from([(
155+
PARQUET_FIELD_ID_META_KEY.to_string(),
156+
4.to_string(),
157+
)])),
150158
]);
151159
let batch = RecordBatch::try_new(Arc::new(arrow_schema.clone()), vec![
152160
Arc::new(Int32Array::from(vec![1, 2, 3])),
@@ -216,8 +224,14 @@ mod test {
216224
.await?;
217225

218226
let arrow_schema = arrow_schema::Schema::new(vec![
219-
Field::new("id", DataType::Int32, false),
220-
Field::new("name", DataType::Utf8, false),
227+
Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([(
228+
PARQUET_FIELD_ID_META_KEY.to_string(),
229+
5.to_string(),
230+
)])),
231+
Field::new("name", DataType::Utf8, false).with_metadata(HashMap::from([(
232+
PARQUET_FIELD_ID_META_KEY.to_string(),
233+
6.to_string(),
234+
)])),
221235
]);
222236
let batch = RecordBatch::try_new(Arc::new(arrow_schema.clone()), vec![
223237
Arc::new(Int32Array::from(vec![1, 2, 3])),

crates/integrations/datafusion/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,4 @@ tokio = { workspace = true }
4242
[dev-dependencies]
4343
iceberg-catalog-memory = { workspace = true }
4444
tempfile = { workspace = true }
45+
parquet = { workspace = true }

crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -220,20 +220,31 @@ fn scalar_value_to_datum(value: &ScalarValue) -> Option<Datum> {
220220

221221
#[cfg(test)]
222222
mod tests {
223+
use std::collections::HashMap;
224+
223225
use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
224226
use datafusion::common::DFSchema;
225227
use datafusion::logical_expr::utils::split_conjunction;
226228
use datafusion::prelude::{Expr, SessionContext};
227229
use iceberg::expr::{Predicate, Reference};
228230
use iceberg::spec::Datum;
231+
use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
229232

230233
use super::convert_filters_to_predicate;
231234

232235
fn create_test_schema() -> DFSchema {
233236
let arrow_schema = Schema::new(vec![
234-
Field::new("foo", DataType::Int32, true),
235-
Field::new("bar", DataType::Utf8, true),
236-
Field::new("ts", DataType::Timestamp(TimeUnit::Second, None), true),
237+
Field::new("foo", DataType::Int32, true).with_metadata(HashMap::from([(
238+
PARQUET_FIELD_ID_META_KEY.to_string(),
239+
"1".to_string(),
240+
)])),
241+
Field::new("bar", DataType::Utf8, true).with_metadata(HashMap::from([(
242+
PARQUET_FIELD_ID_META_KEY.to_string(),
243+
"2".to_string(),
244+
)])),
245+
Field::new("ts", DataType::Timestamp(TimeUnit::Second, None), true).with_metadata(
246+
HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "3".to_string())]),
247+
),
237248
]);
238249
DFSchema::try_from_qualified_schema("my_table", &arrow_schema).unwrap()
239250
}

0 commit comments

Comments (0)