
Commit b554701

cut it down
1 parent e25f888 commit b554701

File tree

4 files changed: +119 -132 lines changed


Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default.

crates/iceberg/src/spec/manifest/mod.rs

Lines changed: 119 additions & 0 deletions
@@ -157,6 +157,7 @@ mod tests {
     use std::fs;
     use std::sync::Arc;
 
+    use arrow_array::StringArray;
     use tempfile::TempDir;
 
     use super::*;
@@ -1088,4 +1089,122 @@ mod tests {
         assert!(!partitions[2].clone().contains_null);
         assert_eq!(partitions[2].clone().contains_nan, Some(false));
     }
+
+    #[test]
+    fn test_data_file_serialization() {
+        // Create a simple schema
+        let schema = Schema::builder()
+            .with_schema_id(1)
+            .with_identifier_field_ids(vec![1])
+            .with_fields(vec![
+                crate::spec::NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long))
+                    .into(),
+                crate::spec::NestedField::required(
+                    2,
+                    "name",
+                    Type::Primitive(PrimitiveType::String),
+                )
+                .into(),
+            ])
+            .build()
+            .unwrap();
+
+        // Create a partition spec
+        let partition_spec = PartitionSpec::builder(schema.clone())
+            .with_spec_id(1)
+            .add_partition_field("id", "id_partition", crate::spec::Transform::Identity)
+            .unwrap()
+            .build()
+            .unwrap();
+
+        // Get partition type from the partition spec
+        let partition_type = partition_spec.partition_type(&schema).unwrap();
+
+        // Set version flag
+        let is_version_1 = false;
+
+        // Create a vector of DataFile objects
+        let data_files = vec![
+            DataFileBuilder::default()
+                .content(crate::spec::DataContentType::Data)
+                .file_format(DataFileFormat::Parquet)
+                .file_path("path/to/file1.parquet".to_string())
+                .file_size_in_bytes(1024)
+                .record_count(100)
+                .partition_spec_id(1)
+                .partition(Struct::empty())
+                .column_sizes(HashMap::from([(1, 512), (2, 512)]))
+                .value_counts(HashMap::from([(1, 100), (2, 100)]))
+                .null_value_counts(HashMap::from([(1, 0), (2, 0)]))
+                .build()
+                .unwrap(),
+            DataFileBuilder::default()
+                .content(crate::spec::DataContentType::Data)
+                .file_format(DataFileFormat::Parquet)
+                .file_path("path/to/file2.parquet".to_string())
+                .file_size_in_bytes(2048)
+                .record_count(200)
+                .partition_spec_id(1)
+                .partition(Struct::empty())
+                .column_sizes(HashMap::from([(1, 1024), (2, 1024)]))
+                .value_counts(HashMap::from([(1, 200), (2, 200)]))
+                .null_value_counts(HashMap::from([(1, 10), (2, 5)]))
+                .build()
+                .unwrap(),
+        ];
+
+        // Serialize the DataFile objects
+        let serialized_files = data_files
+            .into_iter()
+            .map(|f| {
+                let json = serialize_data_file_to_json(f, &partition_type, is_version_1).unwrap();
+                println!("Test serialized data file: {}", json);
+                json
+            })
+            .collect::<Vec<String>>();
+
+        // Verify we have the expected number of serialized files
+        assert_eq!(serialized_files.len(), 2);
+
+        // Verify each serialized file contains expected data
+        for json in &serialized_files {
+            assert!(json.contains("path/to/file"));
+            assert!(json.contains("parquet"));
+            assert!(json.contains("record_count"));
+            assert!(json.contains("file_size_in_bytes"));
+        }
+
+        // Convert Vec<String> to StringArray and print it
+        let string_array = StringArray::from(serialized_files.clone());
+        println!("StringArray: {:?}", string_array);
+
+        // Now deserialize the JSON strings back into DataFile objects
+        println!("\nDeserializing back to DataFile objects:");
+        let deserialized_files: Vec<DataFile> = serialized_files
+            .into_iter()
+            .map(|json| {
+                let data_file = deserialize_data_file_from_json(
+                    &json,
+                    partition_spec.spec_id(),
+                    &partition_type,
+                    &schema,
+                )
+                .unwrap();
+
+                println!("Deserialized DataFile: {:?}", data_file);
+                data_file
+            })
+            .collect();
+
+        // Verify we have the expected number of deserialized files
+        assert_eq!(deserialized_files.len(), 2);
+
+        // Verify the deserialized files have the expected properties
+        for file in &deserialized_files {
+            assert_eq!(file.content_type(), crate::spec::DataContentType::Data);
+            assert_eq!(file.file_format(), DataFileFormat::Parquet);
+            assert!(file.file_path().contains("path/to/file"));
+            assert!(file.record_count() == 100 || file.record_count() == 200);
+        }
+    }
 }
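
For readers skimming the diff: the two hunks above move a JSON round-trip test for DataFile into the iceberg crate itself. Below is a rough standalone sketch of the round trip the test exercises. Every type and function name is taken from the imports of the removed datafusion test further down; the single-field schema, the main wrapper, and the assumption that DataFileBuilder tolerates omitting the metrics fields are illustrative only, not part of this commit.

use iceberg::spec::{
    DataContentType, DataFile, DataFileBuilder, DataFileFormat, NestedField, PartitionSpec,
    PrimitiveType, Schema, Struct, Transform, Type, deserialize_data_file_from_json,
    serialize_data_file_to_json,
};

fn main() {
    // Single-column schema, identity-partitioned on "id" (illustrative shape only).
    let schema = Schema::builder()
        .with_schema_id(1)
        .with_fields(vec![
            NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long)).into(),
        ])
        .build()
        .unwrap();
    let partition_spec = PartitionSpec::builder(schema.clone())
        .with_spec_id(1)
        .add_partition_field("id", "id_partition", Transform::Identity)
        .unwrap()
        .build()
        .unwrap();
    let partition_type = partition_spec.partition_type(&schema).unwrap();

    // One DataFile; assumes the builder defaults the metrics fields the real
    // test fills in (column_sizes, value_counts, null_value_counts).
    let file = DataFileBuilder::default()
        .content(DataContentType::Data)
        .file_format(DataFileFormat::Parquet)
        .file_path("path/to/file1.parquet".to_string())
        .file_size_in_bytes(1024)
        .record_count(100)
        .partition_spec_id(partition_spec.spec_id())
        .partition(Struct::empty())
        .build()
        .unwrap();

    // Serialize to a JSON string, then read it back against the same partition
    // type and schema; `false` mirrors the test's `is_version_1 = false`.
    let json = serialize_data_file_to_json(file, &partition_type, false).unwrap();
    let back: DataFile =
        deserialize_data_file_from_json(&json, partition_spec.spec_id(), &partition_type, &schema)
            .unwrap();
    assert_eq!(back.record_count(), 100);
}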

crates/integrations/datafusion/Cargo.toml

Lines changed: 0 additions & 1 deletion
@@ -36,7 +36,6 @@ futures = { workspace = true }
 iceberg = { workspace = true }
 parquet = { workspace = true }
 tokio = { workspace = true }
-serde_json = { workspace = true }
 uuid = { workspace = true }
 
 [dev-dependencies]

crates/integrations/datafusion/src/physical_plan/write.rs

Lines changed: 0 additions & 130 deletions
@@ -228,133 +228,3 @@ impl ExecutionPlan for IcebergWriteExec {
         )))
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-
-    use datafusion::arrow::array::StringArray;
-    use iceberg::spec::{
-        DataFile, DataFileBuilder, DataFileFormat, PartitionSpec, PrimitiveType, Schema, Struct,
-        Type, deserialize_data_file_from_json, serialize_data_file_to_json,
-    };
-
-    // todo move this to DataFileSerde?
-    #[test]
-    fn test_data_file_serialization() {
-        // Create a simple schema
-        let schema = Schema::builder()
-            .with_schema_id(1)
-            .with_identifier_field_ids(vec![1])
-            .with_fields(vec![
-                iceberg::spec::NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long))
-                    .into(),
-                iceberg::spec::NestedField::required(
-                    2,
-                    "name",
-                    Type::Primitive(PrimitiveType::String),
-                )
-                .into(),
-            ])
-            .build()
-            .unwrap();
-
-        // Create a partition spec
-        let partition_spec = PartitionSpec::builder(schema.clone())
-            .with_spec_id(1)
-            .add_partition_field("id", "id_partition", iceberg::spec::Transform::Identity)
-            .unwrap()
-            .build()
-            .unwrap();
-
-        // Get partition type from the partition spec
-        let partition_type = partition_spec.partition_type(&schema).unwrap();
-
-        // Set version flag
-        let is_version_1 = false;
-
-        // Create a vector of DataFile objects
-        let data_files = vec![
-            DataFileBuilder::default()
-                .content(iceberg::spec::DataContentType::Data)
-                .file_format(DataFileFormat::Parquet)
-                .file_path("path/to/file1.parquet".to_string())
-                .file_size_in_bytes(1024)
-                .record_count(100)
-                .partition_spec_id(1)
-                .partition(Struct::empty())
-                .column_sizes(HashMap::from([(1, 512), (2, 512)]))
-                .value_counts(HashMap::from([(1, 100), (2, 100)]))
-                .null_value_counts(HashMap::from([(1, 0), (2, 0)]))
-                .build()
-                .unwrap(),
-            DataFileBuilder::default()
-                .content(iceberg::spec::DataContentType::Data)
-                .file_format(DataFileFormat::Parquet)
-                .file_path("path/to/file2.parquet".to_string())
-                .file_size_in_bytes(2048)
-                .record_count(200)
-                .partition_spec_id(1)
-                .partition(Struct::empty())
-                .column_sizes(HashMap::from([(1, 1024), (2, 1024)]))
-                .value_counts(HashMap::from([(1, 200), (2, 200)]))
-                .null_value_counts(HashMap::from([(1, 10), (2, 5)]))
-                .build()
-                .unwrap(),
-        ];
-
-        // Serialize the DataFile objects
-        let serialized_files = data_files
-            .into_iter()
-            .map(|f| {
-                let json = serialize_data_file_to_json(f, &partition_type, is_version_1).unwrap();
-                println!("Test serialized data file: {}", json);
-                json
-            })
-            .collect::<Vec<String>>();
-
-        // Verify we have the expected number of serialized files
-        assert_eq!(serialized_files.len(), 2);
-
-        // Verify each serialized file contains expected data
-        for json in &serialized_files {
-            assert!(json.contains("path/to/file"));
-            assert!(json.contains("parquet"));
-            assert!(json.contains("record_count"));
-            assert!(json.contains("file_size_in_bytes"));
-        }
-
-        // Convert Vec<String> to StringArray and print it
-        let string_array = StringArray::from(serialized_files.clone());
-        println!("StringArray: {:?}", string_array);
-
-        // Now deserialize the JSON strings back into DataFile objects
-        println!("\nDeserializing back to DataFile objects:");
-        let deserialized_files: Vec<DataFile> = serialized_files
-            .into_iter()
-            .map(|json| {
-                let data_file = deserialize_data_file_from_json(
-                    &json,
-                    partition_spec.spec_id(),
-                    &partition_type,
-                    &schema,
-                )
-                .unwrap();
-
-                println!("Deserialized DataFile: {:?}", data_file);
-                data_file
-            })
-            .collect();
-
-        // Verify we have the expected number of deserialized files
-        assert_eq!(deserialized_files.len(), 2);
-
-        // Verify the deserialized files have the expected properties
-        for file in &deserialized_files {
-            assert_eq!(file.content_type(), iceberg::spec::DataContentType::Data);
-            assert_eq!(file.file_format(), DataFileFormat::Parquet);
-            assert!(file.file_path().contains("path/to/file"));
-            assert!(file.record_count() == 100 || file.record_count() == 200);
-        }
-    }
-}
